##############################################################################################
####### install the needed libraries #######
packages <- c("stagedtrees", "colorspace", "bnlearn", "igraph", "rpart",
              "randomForest", "nnet", "e1071")
package.check <- lapply(
  packages,
  FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      message("install package ", x)
      install.packages(x, dependencies = TRUE)
    }
  }
)
message("packages available")

###############################################
## create folders
dir.create("Tables", showWarnings = FALSE)
dir.create("Figures", showWarnings = FALSE)

# set graphical parameters as in the Rnw file
par(mai = c(0.8, 0.8, 0.8, 0.2))

###############################################
# SECTION 4 #
###############################################

### 4.1
message("4.1 Learning the stage structure from a dataset")
data("Titanic", package = "datasets")
str(Titanic)

library("stagedtrees")
order <- c("Class", "Sex", "Age", "Survived")
m.full <- full(Titanic, name_unobserved = "na", order = order)
m.indep <- indep(Titanic, name_unobserved = "na", order = order)
m.full
m.indep

library("colorspace")
pdf("Figures/figure3-left.pdf")
plot(m.full, col = \(s) qualitative_hcl(length(s), "Dynamic"))
dev.off()
pdf("Figures/figure3-right.pdf")
plot(m.indep)
dev.off()

mod1 <- stndnaming(stages_hc(m.indep))
mod2 <- stndnaming(stages_bj(m.full, thr = 0.1))
pdf("Figures/figure4-left.pdf")
plot(mod1, cex_label_nodes = 1.5, cex_nodes = 0, font = 2)
dev.off()
pdf("Figures/figure4-right.pdf")
plot(mod2, cex_label_nodes = 1.5, cex_nodes = 0, font = 2,
     col = \(s) qualitative_hcl(length(s), "Dynamic"))
dev.off()

mod3 <- stndnaming(stages_hc(mod2))
pdf("Figures/figure5-left.pdf")
plot(mod3, ignore = NULL, cex_label_nodes = 1.5, cex_nodes = 0, font = 2)
dev.off()
pdf("Figures/figure5-right.pdf")
compare_stages(mod1, mod3, method = "stages", plot = TRUE)
dev.off()

cbind(AIC(mod1, mod2, mod3), BIC = BIC(mod1, mod2, mod3)$BIC)

### 4.2
message("4.2 Bayesian networks as staged trees")
titanic.df <- as.data.frame(Titanic)
titanic.df <- titanic.df[rep(row.names(titanic.df), titanic.df$Freq), 1:4]

library("bnlearn")
mod.bn <- bnlearn::hc(titanic.df)
pdf("Figures/figure6-left.pdf")
plot(mod.bn)
dev.off()
mod.bn <- bn.fit(mod.bn, titanic.df)
bn.tree <- sevt_fit(as_sevt(mod.bn), data = titanic.df, lambda = 0)
pdf("Figures/figure6-right.pdf")
plot(bn.tree)
dev.off()

mod4 <- stages_hclust(bn.tree, k = 2)
pdf("Figures/figure7-left.pdf")
plot(mod4, col = \(s) c("red3", "blue3"))
dev.off()
library("igraph")
pdf("Figures/figure7-right.pdf")
plot(ceg(mod4), col = \(s) c("red3", "blue3"))
dev.off()
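## Optional check (not part of the original replication script): a minimal
## sketch comparing the BN-as-staged-tree model and the clustered-stage model
## of Section 4.2 by information criteria, using BIC() on fitted staged trees
## in the same way as for mod1-mod3 above.
BIC(bn.tree, mod4)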
### 4.3
message("4.3 Querying the model")
subtree.crew <- subtree(mod1, c(Class = "Crew"))
subtree.crew
pdf("Figures/figure8.pdf")
plot(subtree.crew)
dev.off()

mod1 <- stndnaming(mod1)
summary(mod1)
get_path(mod1, var = "Survived", stage = "1")
get_path(mod1, var = "Survived", stage = "3")
get_stage(mod1, path = c("Crew", "Female"))

prob(mod1, c(Survived = "Yes"))
prob(mod1, c(Survived = "Yes"), conditional_on = c(Age = "Adult"))
prob(mod1, c(Survived = "Yes"), conditional_on = c(Age = "Child"))
cond <- c(Age = "Child", Class = "1st", Sex = "Male")
prob(mod1, c(Survived = "Yes"), conditional_on = cond)
obs <- expand.grid(mod1$tree[4:1])[, 4:1]
cbind(obs, p = round(prob(mod1, obs), 6))
confint(mod1, "Age", method = "goodman")

pdf("Figures/figure9.pdf")
barplot(mod3, "Survived", legend.text = TRUE, horiz = TRUE,
        args.legend = list(x = 1), ylab = "Survived")
dev.off()

###############################################
# SECTION 5 #
###############################################
options(scipen = 999)  # print numbers without scientific notation

####### load the needed libraries #######
library("e1071")
library("nnet")
library("randomForest")
library("rpart")

####### load 10 datasets (see supplementary file v102i06-datasets.zip) #######
Asym <- read.csv("datasets/Asym.csv", colClasses = "factor")
Asym <- Asym[, c(NCOL(Asym), c(1:(NCOL(Asym) - 1)))]
chestSim500 <- read.csv("datasets/chestSim500.csv")
chestSim500 <- chestSim500[, c(NCOL(chestSim500), c(1:(NCOL(chestSim500) - 1)))]
FallEld <- read.csv("datasets/FallEld.csv")
FallEld <- FallEld[, c(NCOL(FallEld), c(1:(NCOL(FallEld) - 1)))]
monks1 <- read.csv("datasets/monks1.csv", colClasses = "factor")
monks1 <- monks1[, c(NCOL(monks1), c(1:(NCOL(monks1) - 1)))]
PhDArticles <- read.csv("datasets/PhDArticles.csv")
PhDArticles <- PhDArticles[, c(NCOL(PhDArticles), c(1:(NCOL(PhDArticles) - 1)))]
Pokemon <- read.csv("datasets/Pokemon.csv")
Pokemon <- Pokemon[, c(NCOL(Pokemon), c(1:(NCOL(Pokemon) - 1)))]
puffin <- read.csv("datasets/puffin.csv", colClasses = "factor")
puffin <- puffin[, c(NCOL(puffin), c(1:(NCOL(puffin) - 1)))]
reinis <- read.csv("datasets/reinis.csv")
reinis <- reinis[, c(NCOL(reinis), c(1:(NCOL(reinis) - 1)))]
selfy <- read.csv("datasets/selfy.csv")
selfy <- selfy[, c(NCOL(selfy), c(1:(NCOL(selfy) - 1)))]
Titanic <- read.csv("datasets/Titanic.csv")
Titanic <- Titanic[, c(NCOL(Titanic), c(1:(NCOL(Titanic) - 1)))]

datasets <- factor(c("Asym", "chestSim500", "FallEld", "monks1", "PhDArticles",
                     "Pokemon", "puffin", "reinis", "selfy", "Titanic"))

####### splits for each dataset to create train and test sets #######
nreps <- 10
k <- 5
splits_path <- "splits/"
dir.create(splits_path, showWarnings = FALSE)
for (nam in datasets) {
  path <- paste0(splits_path, nam, "/")
  dir.create(path, showWarnings = FALSE, recursive = TRUE)
  data <- read.csv(paste0("datasets/", nam, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  N <- NROW(data)
  ## replicability dataset-wise
  set.seed(2020)
  for (r in 1:nreps) {
    zero_count <- 1
    while (zero_count > 0) {
      # require at least 1 observation for each level of each variable
      # (marginally) in the train set
      zero_count <- 0
      id_test <- sample(1:N, N / k, replace = FALSE)
      marginal_counts <- rep(list(list()), NCOL(data))
      for (t in 1:NCOL(data)) {
        marginal_counts[[t]] <- table(data[-id_test, t])
        if (any(marginal_counts[[t]] == 0)) {
          zero_count <- zero_count + 1
        }
      }
    }
    saveRDS(id_test, file = paste0(path, r, "_id_test.rds"))
  }
}

####### statistics to compute #######
statistics <- c("time", "accuracy")
time <- function(res, true) {
  res$time
}
accuracy <- function(res, true) {
  sum(diag(table(res$predict, true))) / length(true)
}

####### algorithms to estimate #######
## stagedtrees
predict_st <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, class = colnames(test)[1]),
                 levels = levels(train[, 1]))
  df <- stagedtrees:::sevt_df(model)
  aic_value <- AIC(model)
  bic_value <- BIC(model)
  log_likelihood <- model$ll
  return(list(pred = pred, df = df, loglik = log_likelihood,
              aic = aic_value, bic = bic_value))
}
st_full <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  predict_st(model, train, test)
}
st_indep <- function(train, test, ...) {
  model <- indep(train, lambda = 1, join_unobserved = TRUE)
  predict_st(model, train, test)
}
st_hc_full <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_hc(model)
  predict_st(model, train, test)
}
st_hc_indep <- function(train, test, ...) {
  model <- indep(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_hc(model)
  predict_st(model, train, test)
}
st_fbhc <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_fbhc(model)
  predict_st(model, train, test)
}
st_bhcr <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  set.seed(2020)
  model <- stages_bhcr(model)
  predict_st(model, train, test)
}
st_bhc <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bhc(model)
  predict_st(model, train, test)
}
st_bj_kl_01 <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bj(model, distance = "kullback", thr = 0.01)
  predict_st(model, train, test)
}
st_bj_kl_05 <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bj(model, distance = "kullback", thr = 0.05)
  predict_st(model, train, test)
}
st_bj_kl_20 <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bj(model, distance = "kullback", thr = 0.20)
  predict_st(model, train, test)
}
st_naive <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_hclust(model, k = 2, distance = "totvar")
  predict_st(model, train, test)
}
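## Usage sketch (not part of the replication runs): each st_* wrapper expects
## train/test data frames whose first column is the class variable and returns
## the test-set predictions together with df, logLik, AIC and BIC of the
## fitted staged tree. The snippet below is illustrative only and is wrapped
## in `if (FALSE)` so it does not perturb the random seeds used later.
if (FALSE) {
  id <- sample(NROW(monks1), NROW(monks1) / 5)
  ex <- st_bhc(monks1[-id, ], monks1[id, ])
  mean(ex$pred == monks1[id, 1])  # test-set accuracy of one split
}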
st_bn_staged <- function(train, test, ...) {
  set.seed(2020)
  model_bn <- bnlearn::tabu(train)
  model <- bn.fit(model_bn, train)
  model <- sevt_fit(as_sevt(model), data = train, lambda = 1)
  model <- join_unobserved(model)
  model <- stages_bhc(model)
  predict_st(model, train, test)
}

## BN
predict_bnlearn <- function(model, train, test) {
  pred <- factor(predict(model, node = colnames(test)[1], data = test,
                         prob = FALSE),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
bnlearn_tabu <- function(train, test) {
  set.seed(2020)
  bn <- bnlearn::tabu(train)
  model <- bn.fit(bn, train)
  predict_bnlearn(model, train, test)
}
bnlearn_hc <- function(train, test) {
  set.seed(2020)
  bn <- bnlearn::hc(train)
  model <- bn.fit(bn, train)
  predict_bnlearn(model, train, test)
}

## logistic
predict_logistic <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test), levels = levels(train[, 1]))
  return(list(pred = pred))
}
nnet_logistic <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- nnet::multinom(formula, data = train, trace = 0)
  predict_logistic(model, train, test)
}

## neural network
predict_nnet <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "class"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
nnet_neural_network <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- nnet::nnet(formula, data = train, size = 10, decay = 0.001,
                      maxit = 5000, linout = FALSE, MaxNWts = 5000, trace = 0)
  predict_nnet(model, train, test)
}

## classification tree
predict_rpart <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "class"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
rpart_tree <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- rpart::rpart(formula, data = train, method = "class", cp = 0)
  predict_rpart(model, train, test)
}

## random forest
predict_rf <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "response"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
randomForest_rf <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- randomForest::randomForest(formula, data = train, nodesize = 1,
                                      ntree = 200, mtry = 3)
  predict_rf(model, train, test)
}

## naive bayes
predict_e1071 <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "class"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
e1071_naive <- function(train, test) {
  model <- e1071::naiveBayes(x = train[, -1], y = train[, 1])
  predict_e1071(model, train, test)
}

## classifiers
classifiers <- c("st_indep", "st_full", "st_hc_indep", "st_hc_full", "st_bhc",
                 "st_fbhc", "st_bhcr", "st_bj_kl_01", "st_bj_kl_05",
                 "st_bj_kl_20", "st_bn_staged", "st_naive", "bnlearn_hc",
                 "bnlearn_tabu", "nnet_logistic", "e1071_naive",
                 "nnet_neural_network", "rpart_tree", "randomForest_rf")

####### experiment to run #######
dir.create("results/", showWarnings = FALSE)
for (d in datasets) {
  message(d)
  timestamp()
  res_path <- paste0("results/", d, "/")
  dir.create(res_path, showWarnings = FALSE)
  data <- read.csv(paste0("datasets/", d, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  split_path <- paste0("splits/", d, "/")
  for (r in 1:nreps) {
    id_test <- readRDS(paste0(split_path, r, "_id_test.rds"))
    train <- data[-id_test, ]
    test <- data[id_test, ]
    for (c_name in classifiers) {
      if ((d != "selfy") & any(c_name == classifiers[c(1:4, 6:12)])) {
        next
      }
      c_fun <- get(c_name)
      time_elapsed <- system.time(predict <- c_fun(train, test))[3]
      filename <- paste0(res_path, c_name, "_", r, ".rds")
      if (c_name %in% classifiers[-c(1:12)]) {
        # df, logLik, AIC and BIC are computed only for the stagedtrees
        # algorithms; for the other algorithms they are NA.
        saveRDS(list(time = time_elapsed, predict = predict$pred, df = NA,
                     loglik = NA, aic = NA, bic = NA), file = filename)
      } else {
        saveRDS(list(time = time_elapsed, predict = predict$pred,
                     df = predict$df, loglik = predict$loglik,
                     aic = predict$aic, bic = predict$bic), file = filename)
      }
    }
  }
  if (d == levels(datasets)[length(levels(datasets))]) timestamp()
}

####### create TABLE with the statistics saved in the results folder for each dataset and classifier #######
TABLE <- array(
  data = NA,
  dim = c(
    length(statistics) + 4,
    length(datasets),
    length(classifiers),
    nreps
  ),
  dimnames = list(
    stat = c(statistics, "df", "logLik", "AIC", "BIC"),
    data = datasets,
    classifier = classifiers,
    rep = 1:nreps
  )
)
for (d in datasets) {
  res_path <- paste0("results/", d, "/")
  data <- read.csv(paste0("datasets/", d, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  split_path <- paste0("splits/", d, "/")
  for (r in 1:nreps) {
    id_test <- readRDS(paste0(split_path, r, "_id_test.rds"))
    true <- data[id_test, 1]
    for (c_name in classifiers) {
      filename <- paste0(res_path, c_name, "_", r, ".rds")
      if (file.exists(filename)) {
        res <- readRDS(filename)
        for (stat in statistics) {
          stat_fun <- get(stat)
          TABLE[stat, d, c_name, r] <- stat_fun(res, true)
        }
        TABLE[attr(TABLE, "dimnames")[[1]][-c(1:length(statistics))],
              d, c_name, r] <- c(res$df, res$loglik, res$aic, res$bic)
      }
    }
  }
}

####### compute averages over the 10 reps for each statistic and classifier on each dataset #######
AVG <- apply(TABLE, c(1, 2, 3), mean, na.rm = TRUE)
saveRDS(AVG, "AVG.rds")

####### create the paper's tables from the averaged results #######
table1 <- matrix(NA, nrow = length(datasets), ncol = 6)
count <- 0
for (nam in datasets) {
  count <- count + 1
  out <- out2 <- 1
  zeros <- 0
  data <- read.csv(paste0("datasets/", nam, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  m0 <- full(data)
  edges <- length(m0$tree[[1]])
  for (i in 1:NCOL(data)) {
    out <- out * length(levels(data[, i]))
    if (i != NCOL(data)) {
      zeros <- zeros + sum(m0$stages[[i]] == "UNOBSERVED")
      out2 <- out2 + length(m0$stages[[i]])
      edges <- edges + length(m0$stages[[i]]) * length(m0$tree[[i + 1]])
    }
  }
  table1[count, ] <- c(NROW(data), NCOL(data), out, out2, zeros, edges)
}
rownames(table1) <- datasets
colnames(table1) <- c("# observations", "# variables", "# root-to-leaf paths",
                      "# non-leaf nodes", "# 0 cells", "# edges")
write.csv(table1, file = "Tables/table1.csv")

name_classifiers <- c("Indep", "Full", "HC - Indep", "HC - Full", "BHC",
                      "Fast BHC", "Random BHC", "Kullback-Leibler - 0.01",
                      "Kullback-Leibler - 0.05", "Kullback-Leibler - 0.20",
                      "Refined BN", "HClust k = 2", "Bnlearn Hill-Climbing",
                      "Bnlearn Tabu", "Logistic Model",
                      "Naive Bayes Classifier", "Neural Network",
                      "Classification Tree", "Random Forest")
table4 <- t(AVG[c(3:6, 2, 1), 9, 1:12])
attr(table4, "dimnames") <- list(Model = name_classifiers[1:12],
                                 c("df", "logLik", "AIC", "BIC", "Accuracy",
                                   "Computational Time"))
write.csv(table4, file = "Tables/table4.csv")

table5 <- round(t(AVG[2, , c(5, 13:19)]), 4)
attr(table5, "dimnames") <- list(Model = c("Stagedtrees BHC",
                                           name_classifiers[13:19]),
                                 datasets)
write.csv(table5, file = "Tables/table5.csv")
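## Optional inspection (not required for the replication): a quick look at the
## averaged test-set accuracy of a single classifier across all datasets,
## here the BHC staged tree; "accuracy" and "st_bhc" are the statistic and
## classifier names defined above.
round(AVG["accuracy", , "st_bhc"], 4)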
###############################################
# SECTION 6 #
###############################################
data("PhDArticles", package = "stagedtrees")
str(PhDArticles)

bn <- bnlearn::hc(PhDArticles)
pdf("Figures/figure10-left.pdf")
plot(bn)
dev.off()
order <- c("Gender", "Kids", "Married", "Articles")
bn.as.tree <- as_sevt(bn.fit(bn, data = PhDArticles), order = order)
pdf("Figures/figure10-right.pdf")
plot(bn.as.tree)
dev.off()

phd.mod1 <- PhDArticles |> indep(order = order) |> stages_hc()
phd.mod2 <- PhDArticles |> full(order = order) |> stages_hc()
phd.mod1 <- stndnaming(phd.mod1)
pdf("Figures/figure11-left.pdf")
plot(phd.mod1, cex_label_nodes = 2, cex_nodes = 0, col = "stages")
dev.off()
phd.mod2 <- stndnaming(phd.mod2)
pdf("Figures/figure11-right.pdf")
plot(phd.mod2, cex_label_nodes = 2, cex_nodes = 0, col = "stages")
dev.off()

pdf("Figures/figure12-left.pdf")
compare_stages(phd.mod1, phd.mod2, plot = TRUE, method = "stages")
dev.off()
pdf("Figures/figure12-right.pdf")
barplot(phd.mod2, "Articles", legend.text = TRUE, xlab = "Articles")
dev.off()

lr_test(phd.mod1, phd.mod2)

order <- c("Prestige", "Mentor", order)
phd.all <- PhDArticles |> full(order = order) |> stages_bj(thr = 0.5) |>
  stndnaming()
pdf("Figures/figure13-left.pdf")
plot(phd.all, col = "stages")
dev.off()
pdf("Figures/figure13-right.pdf")
barplot(phd.all, "Articles", legend.text = TRUE, beside = TRUE,
        col = "stages", xlab = "Articles")
dev.off()

get_path(phd.all, "Articles", "3")
phd.order <- search_best(PhDArticles, alg = stages_bhc)
phd.order
BIC(phd.all, phd.order)