# Benchmark vignette script (extracted from R Markdown chunks).
#
# The evaluated code reads precomputed benchmark results from "sim_data.csv"
# and plots time and memory usage per package and join type. The code that
# produced "sim_data.csv" is shown in the final chunk but is not evaluated
# (eval = FALSE), so it is kept commented out below.

## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----setup, include = FALSE---------------------------------------------------
library(tidyverse)

## ----echo = FALSE-------------------------------------------------------------
# Long-format results: one row per (n, package, join_type, name), where `name`
# is "time" or "memory" and `value` holds the measurement.
sim_data <- read_csv("sim_data.csv")

sim_data %>%
  mutate(
    # Replace raw identifiers with human-readable facet labels.
    name = ifelse(name == "time", "Time Usage (s)", "Memory Usage (MB)"),
    join_type = ifelse(
      join_type == "Jaccard Distance",
      "Jaccard Distance Join",
      "Euclidean Distance Joins"
    )
  ) %>%
  ggplot(aes(x = as.numeric(n), y = value, col = package, linetype = package)) +
  geom_point() +
  geom_line() +
  # Free scales: time and memory panels are on different units.
  facet_wrap(~ join_type + name, scales = "free") +
  scale_y_continuous("Time (s) / memory (MB)")

## ----echo = TRUE, eval = FALSE------------------------------------------------
# library(zoomerjoin)
# library(fuzzyjoin)
# library(tidyverse)
# library(microbenchmark)
# library(profmem)
#
#
# # Sample million rows from DIME dataset
# data_1 <- as.data.frame(sample_n(dime_data, 10^6))
# names(data_1) <- c("id_1", "name")
# data_2 <- as.data.frame(sample_n(dime_data, 10^6))
# names(data_2) <- c("id_2", "name")
#
# # Generate datasets for euclidean join benchmarking
# n <- 10^5
# p <- 50
# X <- matrix(rnorm(n * p), n, p)
# X_1 <- as.data.frame(X)
# X_2 <- as.data.frame(X + .000000001)
#
# # Get time and memory use statistics for fuzzyjoin when performing jaccard join
# fuzzy_jaccard_bench <- function(n) {
#   time <- microbenchmark(
#     stringdist_inner_join(data_1[1:n, ],
#       data_2[1:n, ],
#       method = "jaccard",
#       max_dist = .6,
#       q = 4
#     ),
#     times = 10
#   )$time %>%
#     median()
#
#   mem <- profmem(stringdist_inner_join(data_1[1:n, ],
#     data_2[1:n, ],
#     method = "jaccard",
#     max_dist = .6,
#     q = 4
#   )) %>%
#     total()
#
#   return(c(time = time, memory = mem))
# }
#
#
# # Get time and memory use statistics for zoomerjoin when performing jaccard join
# zoomer_jaccard_bench <- function(n) {
#   time <- microbenchmark(
#     jaccard_inner_join(data_1[1:n, ], data_2[1:n, ],
#       by = "name", band_width = 11,
#       n_bands = 350, threshold = .7,
#       n_gram_width = 4
#     ),
#     times = 50
#   )$time %>%
#     median()
#
#   mem <- profmem(
#     jaccard_inner_join(data_1[1:n, ], data_2[1:n, ],
#       by = "name", band_width = 11,
#       n_bands = 350, threshold = .7,
#       n_gram_width = 4
#     )
#   ) %>%
#     total()
#
#   return(c(time = time, memory = mem))
# }
#
# # Get time and memory use statistics for fuzzyjoin when performing Euclidean join
# fuzzy_euclid_bench <- function(n) {
#   time <- microbenchmark(
#     distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean"),
#     times = 10
#   )$time %>%
#     median()
#
#   mem <- total(profmem(
#     distance_join(X_1[1:n, ], X_2[1:n, ], max_dist = .1, method = "euclidean")
#   ))
#
#   return(c(time = time, memory = mem))
# }
#
# # Get time and memory use statistics for zoomerjoin when performing Euclidean join
# zoomer_euclid_bench <- function(n) {
#   time <- microbenchmark(
#     euclidean_inner_join(X_1[1:n, ], X_2[1:n, ],
#       threshold = .1, n_bands = 90,
#       band_width = 2, r = .1
#     ),
#     times = 50
#   )$time %>%
#     median()
#
#   mem <- profmem(euclidean_inner_join(X_1[1:n, ], X_2[1:n, ],
#     threshold = .1, n_bands = 90,
#     band_width = 2, r = .1
#   )) %>%
#     total()
#
#   return(c(time = time, memory = mem))
# }
#
#
# # Run Grid of Jaccard Benchmarks, Collect results into DF
# n <- seq(500, 4000, 250)
# names(n) <- n
# fuzzy_jaccard_benches <- map_df(n, fuzzy_jaccard_bench, .id = "n")
# zoomer_jaccard_benches <- map_df(n, zoomer_jaccard_bench, .id = "n")
# fuzzy_jaccard_benches$package <- "fuzzyjoin"
# zoomer_jaccard_benches$package <- "zoomerjoin"
# jaccard_benches <- bind_rows(fuzzy_jaccard_benches, zoomer_jaccard_benches)
# jaccard_benches$join_type <- "Jaccard Distance"
#
# # Run Grid of Euclidean Benchmarks, Collect results into DF
# n <- seq(250, 4000, 250)
# names(n) <- n
# fuzzy_euclid_benches <- map_df(n, fuzzy_euclid_bench, .id = "n")
# zoomer_euclid_benches <- map_df(n, zoomer_euclid_bench, .id = "n")
# fuzzy_euclid_benches$package <- "fuzzyjoin"
# zoomer_euclid_benches$package <- "zoomerjoin"
# euclid_benches <- bind_rows(fuzzy_euclid_benches, zoomer_euclid_benches)
# euclid_benches$join_type <- "Euclidean Distance"
#
# sim_data <- bind_rows(euclid_benches, jaccard_benches) %>%
#   pivot_longer(c(time, memory)) %>%
#   mutate(value = ifelse(name == "time", value / 10^9, value / 10^6)) # convert ns to s and bytes to MB.
#
# write_csv(sim_data, "sim_data.csv")