# ========== generating results =================
#
# Driver script: runs every SparkR UDF micro-benchmark defined in
# define_benchmark.r, saves one box plot per benchmark, then writes the raw
# timings and the mean throughput per benchmark to CSV files.
#
# Usage: Rscript <this script> [small|medium|large]
#   Any value other than "small" or "medium" (including no argument at all)
#   selects the "large" configuration.
#
# Names used below but not defined here (dir_path, df.sizes, run.mbm.*,
# plot.box.mbm, plot.box.throughput, %>%, ggsave) are expected to be provided
# by define_benchmark.r.

source("define_benchmark.r")

args <- commandArgs(trailingOnly = TRUE)

# Without this guard a missing argument makes `args[1]` NA and the `if` below
# fails with "missing value where TRUE/FALSE needed".
if (length(args) >= 1L && !is.na(args[1])) {
  size_arg <- args[1]
} else {
  message("No size argument given; defaulting to 'large'.")
  size_arg <- "large"
}

# n1 / n2 are passed to the run.mbm.* helpers below; their values depend on
# the selected mode.
if (size_arg == "small") {
  mode <- "small"
  n1 <- 10
  n2 <- 10
} else if (size_arg == "medium") {
  mode <- "medium"
  n1 <- 10
  n2 <- 20
} else {
  mode <- "large"
  n1 <- 20
  n2 <- 20
}

# Passed to the dapplyCollect type/len benchmarks regardless of mode.
N <- 20

# Build a plot from `data` with `plot_fn` and save it as
# "<dir_path><name>.<mode>.png" (7 inches wide, `height` inches tall).
# ggsave() is called without `plot =`, exactly as before, so it saves
# ggplot2's "last plot" -- presumably the one `plot_fn` just built.
# Returns the value of `plot_fn(data)` invisibly.
save_benchmark_plot <- function(data, name, plot_fn = plot.box.mbm,
                                height = 4) {
  p <- plot_fn(data)
  filename <- sprintf("%s%s.%s.png", dir_path, name, mode)
  ggsave(filename, width = 7, height = height)
  invisible(p)
}

# lapply, type
mbm.spark.lapply.type <- run.mbm.spark.lapply.type(n2)
save_benchmark_plot(mbm.spark.lapply.type, "lapply.type")

# lapply, len
mbm.spark.lapply.len <- run.mbm.spark.lapply.len(mode, n1)
save_benchmark_plot(mbm.spark.lapply.len, "lapply.len")

# dapply, type
mbm.dapply.type <- run.mbm.dapply.type(n2)
save_benchmark_plot(mbm.dapply.type, "dapply.type")

# dapply, len
mbm.dapply.len <- run.mbm.dapply.len(mode, n1)
save_benchmark_plot(mbm.dapply.len, "dapply.len")

# dapply, ncol
mbm.dapply.ncol <- run.mbm.dapply.ncol(mode, n1)
save_benchmark_plot(mbm.dapply.ncol, "dapply.ncol")

# dapplyCollect, type
mbm.dapplyCollect.type <- run.mbm.dapplyCollect.type(N)
save_benchmark_plot(mbm.dapplyCollect.type, "dapplyCollect.type")

# dapplyCollect, len
mbm.dapplyCollect.len <- run.mbm.dapplyCollect.len(mode, N)
save_benchmark_plot(mbm.dapplyCollect.len, "dapplyCollect.len")

# dapplyCollect, ncol
mbm.dapplyCollect.ncol <- run.mbm.dapplyCollect.ncol(mode, n1)
save_benchmark_plot(mbm.dapplyCollect.ncol, "dapplyCollect.ncol")

# gapply, nkey
mbm.gapply.nkey <- run.mbm.gapply.nkey(mode, n1)
save_benchmark_plot(mbm.gapply.nkey, "gapply.nkey")

# gapply, nrow
mbm.gapply.nrow <- run.mbm.gapply.nrow(mode, n1)
save_benchmark_plot(mbm.gapply.nrow, "gapply.nrow")

# gapply, keytype
mbm.gapply.keytype <- run.mbm.gapply.keytype(n1)
save_benchmark_plot(mbm.gapply.keytype, "gapply.keytype")

# gapplyCollect, nkey
mbm.gapplyCollect.nkey <- run.mbm.gapplyCollect.nkey(mode, n1)
save_benchmark_plot(mbm.gapplyCollect.nkey, "gapplyCollect.nkey")

# gapplyCollect, nrow
mbm.gapplyCollect.nrow <- run.mbm.gapplyCollect.nrow(mode, n1)
save_benchmark_plot(mbm.gapplyCollect.nrow, "gapplyCollect.nrow")

# gapplyCollect, keytype
mbm.gapplyCollect.keytype <- run.mbm.gapplyCollect.keytype(n1)
save_benchmark_plot(mbm.gapplyCollect.keytype, "gapplyCollect.keytype")

# Stack all benchmark results into one data frame.
tmp <- rbind(
  mbm.spark.lapply.type,
  mbm.spark.lapply.len,
  mbm.dapply.type,
  mbm.dapply.len,
  mbm.dapply.ncol,
  mbm.dapplyCollect.type,
  mbm.dapplyCollect.len,
  mbm.dapplyCollect.ncol,
  mbm.gapply.nkey,
  mbm.gapply.nrow,
  mbm.gapply.keytype,
  mbm.gapplyCollect.nkey,
  mbm.gapplyCollect.nrow,
  mbm.gapplyCollect.keytype
)

# compute throughput
# Left join on the benchmark expression name to attach each object's size;
# rows without a match in df.sizes are kept (all.x = TRUE) with NA size.
tmp_size <- merge(tmp, df.sizes, by.x = "expr", by.y = "obj_names",
                  all.x = TRUE)
# bytes per second
# NOTE(review): the 1e6 factor gives bytes/second only if `time` is in
# microseconds -- confirm against the run.mbm.* helpers.
tmp_size$throughput <- round(tmp_size$obj_sizes * 1000000 / tmp_size$time,
                             digits = 2)

# plot throughput
save_benchmark_plot(tmp_size, "all.throughput",
                    plot_fn = plot.box.throughput, height = 6)

# Make sure the output directory exists so the CSV writes below cannot fail
# after the (long) benchmark run has already completed.
dir.create("results", showWarnings = FALSE, recursive = TRUE)

# save raw data to csv file
towrite <- tmp_size[order(tmp_size$expr, tmp_size$time), ]
write.csv(towrite, file = "results/results.csv", row.names = FALSE)

# save mean value in ml.perf_metrics format
# timestamp: timestamp, benchmarkId: string, benchmarkName: string,
# metricName: string, metricValue: string, isLargerBetter: boolean,
# parameters map
op <- options(digits.secs = 3)  # previous setting is restored below
curTimestamp <- Sys.time()
benchmarkName <- "com.databricks.spark.sql.perf.sparkr.UserDefinedFunction"
metricName <- "throughput.byte.per.second"
isLargerBetter <- TRUE

# Mean throughput per benchmark expression.
perf_metric <- aggregate(towrite$throughput, list(towrite$expr), mean)
names(perf_metric) <- c("benchmarkId", "throughput")
perf_metric$timestamp <- curTimestamp
perf_metric$benchmarkName <- benchmarkName
perf_metric$metricName <- metricName
perf_metric$isLargerBetter <- isLargerBetter
# NOTE(review): assigning NULL to a column that does not exist is a no-op, so
# no `parameters` column is written to the CSV. Kept as-is to preserve the
# current output schema -- confirm whether an empty column is wanted.
perf_metric$parameters <- NULL

write.csv(perf_metric, file = "results/perf_metrics.csv", row.names = FALSE)

# Restore the option changed above.
options(op)