### R code from vignette source 'article.Rnw' ################################################### ### code chunk number 1: preliminaries ################################################### options(prompt = "R> ", continue = "+ ", width = 70, useFancyQuotes = FALSE) pkgs <- c( "jti", # includes sparta "ess", "glue", "dplyr", "tidyr", "forcats", "stringr", "scales", "purrr", "tictoc", "ggplot2", "microbenchmark", "igraph", "ggh4x" ) install.packages(setdiff(pkgs, rownames(installed.packages()))) # # if (!requireNamespace("weaver", quietly = TRUE)) { # install.packages("BiocManager") # BiocManager::install("weaver") # BiocManager::install(c("graph", "Rgraphviz", "RBGL")) # # } # if (!requireNamespace("gRbase", quietly = TRUE)) { # install.packages("gRbase", # dependencies = TRUE, # repos = c("https://cloud.r-project.org", "https://bioconductor.org") # ) # } set.seed(11) ################################################### ################################################### ### Section 4 ################################################### ################################################### dn <- function(x) setNames(lapply(x, paste0, 1:2), toupper(x)) d <- c(2, 2, 2) f <- array(c(5, 4, 0, 7, 0, 9, 0, 0), d, dn(c("x", "y", "z"))) g <- array(c(7, 6, 0, 6, 0, 0, 9, 0), d, dn(c("y", "z", "w"))) ftable(f, row.vars = "X") ftable(g, row.vars = "W") df <- as.data.frame.table(f, stringsAsFactors=FALSE) df <- df[df$Freq != 0,] dg <- as.data.frame.table(g, stringsAsFactors=FALSE) dg <- dg[dg$Freq != 0,] print(df, row.names = FALSE) print(dg, row.names = FALSE) ################################################### ### code chunk: multiplication ################################################### sparse_prod <- function(df, dg){ S <- setdiff(intersect(names(df), names(dg)), "Freq") mrg <- merge(df, dg, by = S, suffixes = c("_df", "_dg")) mrg <- within(mrg, val <- Freq_df * Freq_dg) mrg[, setdiff(names(mrg), c("Freq_df", "Freq_dg"))] } sparse_prod(df, dg) ################################################### ### code chunk: Marginalization ################################################### aggregate(Freq ~ Y + Z, data = df, FUN = sum) ################################################### ################################################### ### Section 5: Sparse Tables ################################################### ################################################### ftable(f, row.vars = "X") ftable(g, row.vars = "W") ################################################### ### code chunk: sparta ################################################### library("sparta") sf <- as_sparta(f); sg <- as_sparta(g) print.default(sf) print(sf) ################################################### ### code chunk: named cell ################################################### get_cell_name(sf, sf[, 2L]) ################################################### ### code chunk: product ################################################### mfg <- mult(sf, sg) mfg ################################################### ### code chunk: conditional probability table ################################################### sf_cpt <- as_cpt(sf, y = "Z") sf_cpt ################################################### ### code chunk: slicing ################################################### library("sparta") slice(sf, s = c(X = "x1"), drop = TRUE) ################################################### ### code chunk: marginalization ################################################### marg(sg, y = c("Y")) ################################################## ################################################## #### Section 5.2 : When to use sparta ################################################## ################################################## library("ggplot2") library("dplyr") y1 <- function(nnz, nvars) nnz * (4 * nvars + 8) / 1e9 y2 <- function(x) x * 8 / 1e9 sparsity <- c(0.05, .1, .2, .3, .4, .5) N <- 9 bins <- 9 nvars_ <- c(4, 6, 8) d <- lapply(nvars_, function(nvars) { y <- c() sp <- c() for (sp_ in sparsity) { y <- c(y, y1(bins^(1:N) * sp_, nvars)) sp <- c(sp, rep(sp_, bins)) } x <- rep(y2(bins^(1:N)), length(sparsity)) d <- data.frame( x = x, y = y, Sparsity = sp, nvars = rep(nvars, length(y)) ) }) d <- do.call(rbind, d) %>% mutate(Sparsity = 1 - Sparsity) %>% mutate(Sparsity = as.character(Sparsity)) %>% mutate(nvars = glue::glue("Variables: {nvars}")) p <- ggplot(d, aes(x = x, y = y, col = Sparsity)) + geom_line() + geom_abline(intercept = 0, slope = 1) + facet_grid(~nvars) + xlab("Memory (Gb) storage of a dense table") + ylab("Memory (Gb) storage of a sparta table") + theme_bw() p # ggsave(plot = p, filename = paste0("sparta_when.pdf"), # width = 8, height = 3) ################################################## ################################################## #### Section 5.3 : Probability Trees and Value Based Potential ################################################## ################################################## dt <- cbind( expand.grid(Z = 1:2, Y = 1:2, X = 1:2), Freq = c(.4, .1, .7, .1, .6, 0, 0, .9), idx = 1:2^3 ) print(dt, row.names = FALSE) structure(dt$Freq, names = dt$idx) dt_no_zeroes <- subset(dt, Freq != 0) unique_values <- unique(dt_no_zeroes$Freq) A <- structure(unique_values, names = seq_along(unique_values)) D <- structure(sapply(dt_no_zeroes$Freq, function(k) { match(k, A) }), names = dt_no_zeroes$idx) (IDM <- list(A = A, D = D)) (B <- structure(dt_no_zeroes$Freq, names = dt_no_zeroes$idx)) ################################################## ################################################## #### Section 6 : A usecase for jti ################################################## ################################################## library("jti") cl <- cpt_list(asia2) cl tri <- triangulate(cl, tri = "min_fill") cp <- compile(cl, evidence = c(tub = "yes"), tri = "min_fill") cp j <- jt(cp) query_belief(j, nodes = "xray") ######################### ### Figure 3 ### ######################### library("igraph") set.seed(5) el <- matrix(c( "A", "T", "T", "E", "S", "L", "S", "B", "L", "E", "E", "X", "E", "D", "B", "D"), nc = 2, byrow = TRUE ) g <- igraph::graph_from_edgelist(el) cl <- cpt_list(asia, g) par(mar = c(0, 0, 0, 0)+.1) plot(g, vertex.size = 30, vertex.color = "white", vertex.label.color = "black", vertex.label.cex = 3, edge.width = 3, edge.color = "black" ) ######################### ### Figure 4 ### ######################### set.seed(14) cp <- compile(cl, evidence = c(T = "y"), tri = "min_fill") j <- jt(cp, propagate = "no") par(mar = c(0, 0, 0, 0) + .1) plot(j, vertex.size = 50, vertex.color = "white", vertex.label.color = "black", vertex.label.cex = 2, edge.width = 4, edge.color = "black" ) ################################################### ################################################### ### Section 7: Benchmarking ################################################### ################################################### library("dplyr") library("tidyr") library("purrr") library("ggplot2") library("scales") library("forcats") library("stringr") library("microbenchmark") if (!file.exists("results.rds")) { # Helpers R_sparse_prod <- function(df, dg){ S <- setdiff(intersect(names(df), names(dg)), "val") mrg <- merge(df, dg, by = S) val <- mrg$val.x * mrg$val.y mrg$val <- val mrg[,setdiff(names(mrg), c("val.x", "val.y"))] } R_sparse_marg <- function(form, dat) { if (nrow(dat) == 0) return(0) aggregate(form, data = dat, FUN = sum) } make_form <- function(y) { rhs <- y[1] for (k in seq_along(y)[-1]) { rhs <- paste(rhs, "+", y[k]) } as.formula(paste("val ~ ", rhs, sep = "")) } results <- data.frame( nlvls = integer(0L), nvars = integer(0L), nmut = integer(0L), sparsity = double(0L), log_ncells = double(0L), time_s = double(0L), size_mb = double(0L), pkg = character(0L) ) max_nlvls <- 6 max_nvars <- 5 max_log_ncells <- 14 neval <- 50L d <- 16L # digits for rounding the time variable # Call grbase prior to timing: # - apparently there is some overhead in the first call due to byte compilation a <- array(0:1, 2, list(a = c("0", "1"))) invisible(gRbase::tabMult(a, a)) invisible(sparta::as_sparta(a)) # Reproducibility set.seed(3007) counter <- 1L for (nlvls in 2:max_nlvls) { for (nvars in 2:max_nvars) { for (nmut in 2:nvars) { for (sparsity in c(0, 0.4, 0.8)) { ncells <- nlvls^(2*nvars - nmut) log_ncells <- round(log(ncells), 5) print(glue::glue("----------------------------------------------------------------")) print( glue::glue( "{counter} - ", "nlvls: {nlvls}, ", "nvars: {nvars}, ", "nmut: {nmut}, ", "log_ncells: {log_ncells}, ", "sparsity: {sparsity}" ) ) if (log_ncells > max_log_ncells) { counter <- counter + 1L print(glue::glue("Intentionally skipping ...")) next # We dont want to crash your computer } print(glue::glue("Making arrays ...")) p <- c(sparsity, 1-sparsity) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # X # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ xnames <- LETTERS[1:nvars] lvlsx <- structure(lapply(1:nvars, function(x) letters[1:nlvls]), names = xnames) x <- array(sample(c(0,1), nlvls^nvars, replace = TRUE, prob = p), rep(nlvls, nvars), lvlsx) x[matrix(1L, ncol = nvars)] <- 1L sp_tabx <- sparta::as_sparta(x) gr_tabx <- x R_tabx <- as.data.frame.table(x, stringsAsFactors=FALSE) R_tabx <- R_tabx[R_tabx$Freq != 0,] colnames(R_tabx)[ncol(R_tabx)] <- "val" # bn_tabx <- list(cpt = R_tabx[, -ncol(R_tabx)], prob = R_tabx$val) rm(x) gc() # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Y # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ynames <- if (nmut == nvars) { LETTERS[1:nvars] } else { start <- seq(nvars, nvars - nmut + 1) end <- (nvars+1):(2*nvars-nmut) LETTERS[c(start, end)] } lvlsy <- structure(lapply(1:nvars, function(x) letters[1:nlvls]), names = ynames) y <- array(sample(c(0,1), nlvls^nvars, replace = TRUE, prob = p), rep(nlvls, nvars), lvlsy) sp_taby <- sparta::as_sparta(y) gr_taby <- y R_taby <- as.data.frame.table(y, stringsAsFactors=FALSE) R_taby <- R_taby[R_taby$Freq != 0,] colnames(R_taby)[ncol(R_taby)] <- "val" # bn_taby <- list(cpt = R_taby[, -ncol(R_taby)], prob = R_taby$val) rm(y) gc() # --------------------------------------------------------- # MULTIPLICATION # --------------------------------------------------------- print(glue::glue("Multiplying ... \n")) tictoc::tic(quiet = TRUE) for (k in 1:neval) gr <- gRbase::tabMult(gr_tabx, gr_taby) t1 <- tictoc::toc(quiet = TRUE) if (ncells != length(gr)) stop("ncell is not correct...") tictoc::tic(quiet = TRUE) for (k in 1:neval) sp <- sparta:::mult(sp_tabx, sp_taby) t2 <- tictoc::toc(quiet = TRUE) # tictoc::tic(quiet = TRUE) # for (k in 1:neval) bn <- factor.product(bn_tabx, bn_taby) # t3 <- tictoc::toc(quiet = TRUE) tictoc::tic(quiet = TRUE) for (k in 1:neval) R_ <- R_sparse_prod(R_tabx, R_taby) t4 <- tictoc::toc(quiet = TRUE) # Save the information gr_time_sec <- (t1$toc - t1$tic) / neval sp_time_sec <- (t2$toc - t2$tic) / neval R_time_sec <- (t4$toc - t4$tic) / neval # bn_time_sec <- (t3$toc - t3$tic) / neval gr_size_mb <- as.integer(object.size(gr)) / 1e6 sp_size_mb <- as.integer(object.size(sp)) / 1e6 R_size_mb <- as.integer(object.size(R_)) / 1e6 # bn_size_mb <- as.integer(object.size(bn)) / 1e6 tmu <- microbenchmark::microbenchmark( grbase = { gRbase::tabMult(gr_tabx, gr_taby) }, sparta = { sp <- sparta:::mult(sp_tabx, sp_taby) }, R = { R_ <- R_sparse_prod(R_tabx, R_taby) }, times = neval) prod_sparsity <- 1L - ncol(sp) / ncells # sparta::sparsity(sp) parms <- data.frame(nlvls, nvars, nmut, sparsity, prod_sparsity, ncells, log_ncells) gr_row <- cbind( parms, data.frame( # time_s = round(gr_time_sec, d), time_s = tmu$time[which(tmu$expr == "grbase")], size_mb = gr_size_mb, pkg = "grbase", type = "mult" ) ) sp_row <- cbind( parms, data.frame( # time_s = round(sp_time_sec, d), time_s = tmu$time[which(tmu$expr == "sparta")], size_mb = sp_size_mb, pkg = "sparta", type = "mult" ) ) R_row <- cbind( parms, data.frame( # time_s = round(R_time_sec, d), time_s = tmu$time[which(tmu$expr == "R")], size_mb = R_size_mb, pkg = "R", type = "mult" ) ) # bn_row <- cbind( # parms, # data.frame( # time_s = round(bn_time_sec, d), # size_mb = bn_size_mb, # pkg = "bayesnetbp", # type = "mult" # ) # ) results <- results %>% dplyr::bind_rows(gr_row) %>% dplyr::bind_rows(sp_row) %>% dplyr::bind_rows(R_row) # %>% # dplyr::bind_rows(bn_row) # ----------------------------------------------------------------------------- # MARGINALIZATION: Marginalize out one at a time until no variables are present # ----------------------------------------------------------------------------- print(glue::glue("Marginalizing ... \n")) vars <- names(sp) vars2 <- lapply(rev(seq_along(vars)[-length(vars)]), function(k) vars[1:k]) vars3 <- vars2 vars2[length(vars2)+1] <- list(NULL) vars3[length(vars3)+1] <- "" tictoc::tic(quiet = TRUE) grm <- gRbase::tabMarg(gr, vars2[[1]]) for (v in vars2[-1]) grm <- gRbase::tabMarg(grm, v) t1m <- tictoc::toc(quiet = TRUE) tictoc::tic(quiet = TRUE) spm <- sparta:::marg(sp, vars[1]) for (v in vars[-1]) spm <- sparta:::marg(spm, v) t2m <- tictoc::toc(quiet = TRUE) # tictoc::tic(quiet = TRUE) # bnm <- marginalize.discrete(bn, vars3[1]) # for (v in vars[-1]) bn <- marginalize.discrete(bn, v) # t3m <- tictoc::toc(quiet = TRUE) tictoc::tic(quiet = TRUE) Rm <- R_sparse_marg(make_form(vars2[[1]]), R_) for (v in vars2[-1]) Rm <- R_sparse_marg(make_form(vars2[[1]]), R_) t4m <- tictoc::toc(quiet = TRUE) # Save the information grm_time_sec <- (t1m$toc - t1m$tic) spm_time_sec <- (t2m$toc - t2m$tic) # bnm_time_sec <- (t3m$toc - t3m$tic) } R_time_sec <- (t4m$toc - t4m$tic) tm <- microbenchmark::microbenchmark( grbase = { grm <- gRbase::tabMarg(gr, vars2[[1]]); for (v in vars2[-1]) grm <- gRbase::tabMarg(grm, v) }, sparta = { spm <- sparta:::marg(sp, vars[1]) for (v in vars[-1]) spm <- sparta:::marg(spm, v) }, R = { Rm <- R_sparse_marg(make_form(vars2[[1]]), R_) for (v in vars2[-1]) Rm <- R_sparse_marg(make_form(vars2[[1]]), R_) }, times = 1L) grm_row <- cbind( parms, data.frame( time_s = tm$time[which(tm$expr == "grbase")], # time_s = round(grm_time_sec, d), size_mb = NA, pkg = "grbase", type = "marg" ) ) spm_row <- cbind( parms, data.frame( time_s = tm$time[which(tm$expr == "sparta")], # time_s = round(spm_time_sec, d), size_mb = NA, pkg = "sparta", type = "marg" ) ) # bnm_row <- cbind( # parms, # data.frame( # time_s = round(bn_time_sec, d), # size_mb = NA, # pkg = "bayesnetbp", # type = "marg" # ) # ) R_row <- cbind( parms, data.frame( time_s = tm$time[which(tm$expr == "R")] , # time_s = round(R_time_sec, d), size_mb = NA, pkg = "R", type = "marg" ) ) results <- results %>% dplyr::bind_rows(grm_row) %>% dplyr::bind_rows(spm_row) %>% dplyr::bind_rows(R_row) # %>% counter <- counter + 1L } } } } saveRDS(results, "results.rds") } else { results <- readRDS("results.rds") } ######################### ### Figure 5 ### ######################### ## scales ## labels for cell counts log10x <- function(x){ x[x>0] <- log10(x[x>0]) x } bins <- c(0, 10^(c(2, 4, 6)), Inf) ## works bins_label <- function(b){ bb <- log10x(b) bl <- character(length(b)-1) for(i in 2:length(bl)){ if(i > 2){ ten <- "10^"; zero <- "(" } else{ ten <- ""; zero <- "[" } bl[i-1] <- paste0("group('", zero ,"',", "list(", paste0(ten, bb[i-1]), paste0(", 10^", bb[i]),")", ",']')") } bl[length(bl)] <- paste0("group('(',", "list(", paste0(ten, bb[i]), ", infinity)", ",')')") bl } results_TT <- results %>% arrange(sparsity) %>% as_tibble() %>% mutate(pkg = ifelse(pkg == "R", "base R", pkg)) %>% mutate(Sparsity = paste0(round(sparsity*100), "%")) %>% mutate( pkg = ifelse(pkg == "bayesnetbp", "BayesNetBP", pkg), non_zero_cells = ncells*prod_sparsity ) join_by <- names(results_TT %>% select(-c(time_s, size_mb, pkg))) results_TT_relative <- results_TT %>% list(filter(., pkg == "grbase") %>% select(-pkg), filter(., pkg != "grbase")) %>% {.[-1]} %>% purrr::reduce(inner_join, by = join_by, suffix = c("_gRbase","")) %>% mutate( time_rel = time_s / time_s_gRbase, size_rel = size_mb / size_mb_gRbase ) sparse_bin <- c(-0.01, 0.02, 0.75, 1) sparse_bin_lab <- gsub(" ", "", c("0%", "(0%-75%]", "> 75%")) sparse_bin_lab <- sub("1\\.0", "1", gsub("0\\.", ".", sparse_bin_lab)) results_TT_relative <- results_TT_relative %>% mutate( ncells_log10_bin = cut(ncells, breaks = bins, labels = bins_label(bins)), Operation = ifelse(type == "marg", "Marginalization", "Multiplication"), ncells_log10_bin_lab = paste("No.~of~cells:", ncells_log10_bin), ncells_log10_bin_lab = factor(ncells_log10_bin_lab, levels = paste("No.~of~cells:", levels(ncells_log10_bin))), prod_sparsity_bin = cut(prod_sparsity, breaks = sparse_bin, include.lowest = FALSE, labels = sparse_bin_lab) ) results_TT_relative_ <- results_TT_relative %>% tidyr::gather(key = measure, value = relative, time_rel, size_rel) %>% filter(!(type == "marg" & measure == "size_rel")) %>% mutate( Operation = paste(Operation, ifelse(measure == "size_rel", "(size)", "(time)")), Operation = factor(Operation, levels = paste(rep(c("Multiplication", "Marginalization"), each = 2), rep(c("(size)", "(time)"), 2))) ) epsilon <- 1/10^6 ## avoid log(0) ## Keeping the same ranges for timing (across mult and marg) results_TT_relative_range <- results_TT_relative_ %>% filter(!(pkg == "base R" & relative < 0.001)) %>% group_by(measure) %>% reframe( Operation = Operation[1], relative = range(relative + epsilon, finite = TRUE), .groups = "drop" ) %>% ungroup() %>% bind_rows(mutate(., Operation = sub("Multiplication", "Marginalization", Operation))) %>% filter(Operation != "Marginalization (size)") %>% mutate(Operation = factor(Operation, levels = levels(results_TT_relative_$Operation))) ## START p3_rel_bin_facet <- results_TT_relative_ %>% filter(!(pkg == "base R" & relative < 0.001)) %>% ggplot(aes(x = prod_sparsity_bin, y = I(relative + epsilon), fill = pkg)) + geom_blank(data = results_TT_relative_range, inherit.aes = FALSE, aes(y = relative)) + geom_hline(yintercept = 1, lty = 2) + geom_boxplot() + theme_bw(base_size = 20) + facet_grid(Operation ~ ncells_log10_bin_lab, scales = "free_y", labeller = labeller(Operation = label_value, ncells_log10_bin_lab = label_parsed)) + ggh4x::scale_y_facet( PANEL %in% 1:9, trans = "log10", breaks = breaks_log(), labels = function(x) format(x, scientific = FALSE) ) + theme_bw(base_size = 20) + theme( legend.position = "top", axis.text.x = element_text(angle = 0, hjust = 0.5, vjust = 1)) + theme( # mads fix: axis.text = element_text(size = 20), axis.title = element_text(size = 25), legend.text = element_text(size = 25), strip.text.x = element_text(size = 20), strip.text.y = element_text(size = 20) ) + labs(y = "Performance relative to gRbase", x = "Sparsity of resulting table", fill = "Package:") + scale_fill_brewer(palette = "Set1") p3_rel_bin_facet