##############################################################################################
####### install the needed libraries #######
packages <- c("stagedtrees", "colorspace", "bnlearn", "igraph", "rpart",
              "randomForest", "nnet", "e1071")
package.check <- lapply(
  packages,
  FUN = function(x) {
    if (!require(x, character.only = TRUE)) {
      message("install package ", x)
      install.packages(x, dependencies = TRUE)
    }
  }
)
message("packages available")

###############################################
## create folders
dir.create("Tables", showWarnings = FALSE)
dir.create("Figures", showWarnings = FALSE)

# set graphical parameters as in the Rnw file
par(mai = c(0.8, 0.8, 0.8, 0.2))

###############################################
# SECTION 4 #
###############################################

### 4.1
message("4.1 Learning the stage structure from a dataset")
data("Titanic", package = "datasets")
str(Titanic)

library("stagedtrees")
order <- c("Class", "Sex", "Age", "Survived")
m.full <- full(Titanic, name_unobserved = "na", order = order)
m.indep <- indep(Titanic, name_unobserved = "na", order = order)
m.full
m.indep

library("colorspace")
pdf("Figures/figure3-left.pdf")
plot(m.full, col = \(s) qualitative_hcl(length(s), "Dynamic"))
dev.off()
pdf("Figures/figure3-right.pdf")
plot(m.indep)
dev.off()

mod1 <- stndnaming(stages_hc(m.indep))
mod2 <- stndnaming(stages_bj(m.full, thr = 0.1))
pdf("Figures/figure4-left.pdf")
plot(mod1, cex_label_nodes = 1.5, cex_nodes = 0, font = 2)
dev.off()
pdf("Figures/figure4-right.pdf")
plot(mod2, cex_label_nodes = 1.5, cex_nodes = 0, font = 2,
     col = \(s) qualitative_hcl(length(s), "Dynamic"))
dev.off()

mod3 <- stndnaming(stages_hc(mod2))
pdf("Figures/figure5-left.pdf")
plot(mod3, ignore = NULL, cex_label_nodes = 1.5, cex_nodes = 0, font = 2)
dev.off()
pdf("Figures/figure5-right.pdf")
compare_stages(mod1, mod3, method = "stages", plot = TRUE)
dev.off()

cbind(AIC(mod1, mod2, mod3), BIC = BIC(mod1, mod2, mod3)$BIC)

### 4.2
message("4.2 Bayesian networks as staged trees")
titanic.df <- as.data.frame(Titanic)
titanic.df <- titanic.df[rep(row.names(titanic.df), titanic.df$Freq), 1:4]

library("bnlearn")
mod.bn <- bnlearn::hc(titanic.df)
pdf("Figures/figure6-left.pdf")
plot(mod.bn)
dev.off()
mod.bn <- bn.fit(mod.bn, titanic.df)
bn.tree <- sevt_fit(as_sevt(mod.bn), data = titanic.df, lambda = 0)
pdf("Figures/figure6-right.pdf")
plot(bn.tree)
dev.off()

mod4 <- stages_hclust(bn.tree, k = 2)
pdf("Figures/figure7-left.pdf")
plot(mod4, col = \(s) c("red3", "blue3"))
dev.off()
library("igraph")
pdf("Figures/figure7-right.pdf")
plot(ceg(mod4), col = \(s) c("red3", "blue3"))
dev.off()
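## Optional check (not part of the original replication script): a minimal
## sketch comparing the BN-as-staged-tree model and the clustered-stage model
## of Section 4.2 by information criteria, using BIC() on fitted staged trees
## in the same way as for mod1-mod3 above.
BIC(bn.tree, mod4)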
### 4.3
message("4.3 Querying the model")
subtree.crew <- subtree(mod1, c(Class = "Crew"))
subtree.crew
pdf("Figures/figure8.pdf")
plot(subtree.crew)
dev.off()

mod1 <- stndnaming(mod1)
summary(mod1)
get_path(mod1, var = "Survived", stage = "1")
get_path(mod1, var = "Survived", stage = "3")
get_stage(mod1, path = c("Crew", "Female"))

prob(mod1, c(Survived = "Yes"))
prob(mod1, c(Survived = "Yes"), conditional_on = c(Age = "Adult"))
prob(mod1, c(Survived = "Yes"), conditional_on = c(Age = "Child"))
cond <- c(Age = "Child", Class = "1st", Sex = "Male")
prob(mod1, c(Survived = "Yes"), conditional_on = cond)
obs <- expand.grid(mod1$tree[4:1])[, 4:1]
cbind(obs, p = round(prob(mod1, obs), 6))
confint(mod1, "Age", method = "goodman")

pdf("Figures/figure9.pdf")
barplot(mod3, "Survived", legend.text = TRUE, horiz = TRUE,
        args.legend = list(x = 1), ylab = "Survived")
dev.off()

###############################################
# SECTION 5 #
###############################################
options(scipen = 999)  # print numbers without scientific notation

####### load the needed libraries #######
library("e1071")
library("nnet")
library("randomForest")
library("rpart")

####### load 10 datasets (see supplementary file v102i06-datasets.zip) #######
Asym <- read.csv("datasets/Asym.csv", colClasses = "factor")
Asym <- Asym[, c(NCOL(Asym), c(1:(NCOL(Asym) - 1)))]
chestSim500 <- read.csv("datasets/chestSim500.csv")
chestSim500 <- chestSim500[, c(NCOL(chestSim500), c(1:(NCOL(chestSim500) - 1)))]
FallEld <- read.csv("datasets/FallEld.csv")
FallEld <- FallEld[, c(NCOL(FallEld), c(1:(NCOL(FallEld) - 1)))]
monks1 <- read.csv("datasets/monks1.csv", colClasses = "factor")
monks1 <- monks1[, c(NCOL(monks1), c(1:(NCOL(monks1) - 1)))]
PhDArticles <- read.csv("datasets/PhDArticles.csv")
PhDArticles <- PhDArticles[, c(NCOL(PhDArticles), c(1:(NCOL(PhDArticles) - 1)))]
Pokemon <- read.csv("datasets/Pokemon.csv")
Pokemon <- Pokemon[, c(NCOL(Pokemon), c(1:(NCOL(Pokemon) - 1)))]
puffin <- read.csv("datasets/puffin.csv", colClasses = "factor")
puffin <- puffin[, c(NCOL(puffin), c(1:(NCOL(puffin) - 1)))]
reinis <- read.csv("datasets/reinis.csv")
reinis <- reinis[, c(NCOL(reinis), c(1:(NCOL(reinis) - 1)))]
selfy <- read.csv("datasets/selfy.csv")
selfy <- selfy[, c(NCOL(selfy), c(1:(NCOL(selfy) - 1)))]
Titanic <- read.csv("datasets/Titanic.csv")
Titanic <- Titanic[, c(NCOL(Titanic), c(1:(NCOL(Titanic) - 1)))]

datasets <- factor(c("Asym", "chestSim500", "FallEld", "monks1", "PhDArticles",
                     "Pokemon", "puffin", "reinis", "selfy", "Titanic"))

####### splits for each dataset to create train and test sets #######
nreps <- 10
k <- 5
splits_path <- "splits/"
dir.create(splits_path, showWarnings = FALSE)
for (nam in datasets) {
  path <- paste0(splits_path, nam, "/")
  dir.create(path, showWarnings = FALSE, recursive = TRUE)
  data <- read.csv(paste0("datasets/", nam, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  N <- NROW(data)
  ## replicability dataset-wise
  set.seed(2020)
  for (r in 1:nreps) {
    zero_count <- 1
    while (zero_count > 0) {
      # require at least 1 observation for each level of each variable
      # (marginally) in the train set
      zero_count <- 0
      id_test <- sample(1:N, N / k, replace = FALSE)
      marginal_counts <- rep(list(list()), NCOL(data))
      for (t in 1:NCOL(data)) {
        marginal_counts[[t]] <- table(data[-id_test, t])
        if (any(marginal_counts[[t]] == 0)) {
          zero_count <- zero_count + 1
        }
      }
    }
    saveRDS(id_test, file = paste0(path, r, "_id_test.rds"))
  }
}

####### statistics to compute #######
statistics <- c("time", "accuracy")
time <- function(res, true) {
  res$time
}
accuracy <- function(res, true) {
  sum(diag(table(res$predict, true))) / length(true)
}

####### algorithms to estimate #######
## stagedtrees
predict_st <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, class = colnames(test)[1]),
                 levels = levels(train[, 1]))
  df <- stagedtrees:::sevt_df(model)
  aic_value <- AIC(model)
  bic_value <- BIC(model)
  log_likelihood <- model$ll
  return(list(pred = pred, df = df, loglik = log_likelihood,
              aic = aic_value, bic = bic_value))
}
st_full <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  predict_st(model, train, test)
}
st_indep <- function(train, test, ...) {
  model <- indep(train, lambda = 1, join_unobserved = TRUE)
  predict_st(model, train, test)
}
st_hc_full <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_hc(model)
  predict_st(model, train, test)
}
st_hc_indep <- function(train, test, ...) {
  model <- indep(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_hc(model)
  predict_st(model, train, test)
}
st_fbhc <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_fbhc(model)
  predict_st(model, train, test)
}
st_bhcr <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  set.seed(2020)
  model <- stages_bhcr(model)
  predict_st(model, train, test)
}
st_bhc <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bhc(model)
  predict_st(model, train, test)
}
st_bj_kl_01 <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bj(model, distance = "kullback", thr = 0.01)
  predict_st(model, train, test)
}
st_bj_kl_05 <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bj(model, distance = "kullback", thr = 0.05)
  predict_st(model, train, test)
}
st_bj_kl_20 <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_bj(model, distance = "kullback", thr = 0.20)
  predict_st(model, train, test)
}
st_naive <- function(train, test, ...) {
  model <- full(train, lambda = 1, join_unobserved = TRUE)
  model <- stages_hclust(model, k = 2, distance = "totvar")
  predict_st(model, train, test)
}
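## Usage sketch (not part of the replication runs): each st_* wrapper expects
## train/test data frames whose first column is the class variable and returns
## the test-set predictions together with df, logLik, AIC and BIC of the
## fitted staged tree. The snippet below is illustrative only and is wrapped
## in `if (FALSE)` so it does not perturb the random seeds used later.
if (FALSE) {
  id <- sample(NROW(monks1), NROW(monks1) / 5)
  ex <- st_bhc(monks1[-id, ], monks1[id, ])
  mean(ex$pred == monks1[id, 1])  # test-set accuracy of one split
}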
st_bn_staged <- function(train, test, ...) {
  set.seed(2020)
  model_bn <- bnlearn::tabu(train)
  model <- bn.fit(model_bn, train)
  model <- sevt_fit(as_sevt(model), data = train, lambda = 1)
  model <- join_unobserved(model)
  model <- stages_bhc(model)
  predict_st(model, train, test)
}

## BN
predict_bnlearn <- function(model, train, test) {
  pred <- factor(predict(model, node = colnames(test)[1], data = test,
                         prob = FALSE),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
bnlearn_tabu <- function(train, test) {
  set.seed(2020)
  bn <- bnlearn::tabu(train)
  model <- bn.fit(bn, train)
  predict_bnlearn(model, train, test)
}
bnlearn_hc <- function(train, test) {
  set.seed(2020)
  bn <- bnlearn::hc(train)
  model <- bn.fit(bn, train)
  predict_bnlearn(model, train, test)
}

## logistic
predict_logistic <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test), levels = levels(train[, 1]))
  return(list(pred = pred))
}
nnet_logistic <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- nnet::multinom(formula, data = train, trace = 0)
  predict_logistic(model, train, test)
}

## neural network
predict_nnet <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "class"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
nnet_neural_network <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- nnet::nnet(formula, data = train, size = 10, decay = 0.001,
                      maxit = 5000, linout = FALSE, MaxNWts = 5000, trace = 0)
  predict_nnet(model, train, test)
}

## classification tree
predict_rpart <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "class"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
rpart_tree <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- rpart::rpart(formula, data = train, method = "class", cp = 0)
  predict_rpart(model, train, test)
}

## random forest
predict_rf <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "response"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
randomForest_rf <- function(train, test) {
  formula <- as.formula(paste(colnames(train)[1], "~",
                              paste(colnames(train)[-1], collapse = " + ")))
  model <- randomForest::randomForest(formula, data = train, nodesize = 1,
                                      ntree = 200, mtry = 3)
  predict_rf(model, train, test)
}

## naive bayes
predict_e1071 <- function(model, train, test) {
  pred <- factor(predict(model, newdata = test, type = "class"),
                 levels = levels(train[, 1]))
  return(list(pred = pred))
}
e1071_naive <- function(train, test) {
  model <- e1071::naiveBayes(x = train[, -1], y = train[, 1])
  predict_e1071(model, train, test)
}

## classifiers
classifiers <- c("st_indep", "st_full", "st_hc_indep", "st_hc_full", "st_bhc",
                 "st_fbhc", "st_bhcr", "st_bj_kl_01", "st_bj_kl_05",
                 "st_bj_kl_20", "st_bn_staged", "st_naive", "bnlearn_hc",
                 "bnlearn_tabu", "nnet_logistic", "e1071_naive",
                 "nnet_neural_network", "rpart_tree", "randomForest_rf")

####### experiment to run #######
dir.create("results/", showWarnings = FALSE)
for (d in datasets) {
  message(d)
  timestamp()
  res_path <- paste0("results/", d, "/")
  dir.create(res_path, showWarnings = FALSE)
  data <- read.csv(paste0("datasets/", d, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  split_path <- paste0("splits/", d, "/")
  for (r in 1:nreps) {
    id_test <- readRDS(paste0(split_path, r, "_id_test.rds"))
    train <- data[-id_test, ]
    test <- data[id_test, ]
    for (c_name in classifiers) {
      if ((d != "selfy") & any(c_name == classifiers[c(1:4, 6:12)])) {
        next
      }
      c_fun <- get(c_name)
      time_elapsed <- system.time(predict <- c_fun(train, test))[3]
      filename <- paste0(res_path, c_name, "_", r, ".rds")
      if (c_name %in% classifiers[-c(1:12)]) {
        # df, logLik, AIC and BIC are computed only for the stagedtrees
        # algorithms; for the other algorithms they are NA.
        saveRDS(list(time = time_elapsed, predict = predict$pred, df = NA,
                     loglik = NA, aic = NA, bic = NA), file = filename)
      } else {
        saveRDS(list(time = time_elapsed, predict = predict$pred,
                     df = predict$df, loglik = predict$loglik,
                     aic = predict$aic, bic = predict$bic), file = filename)
      }
    }
  }
  if (d == levels(datasets)[length(levels(datasets))]) timestamp()
}

####### create TABLE with the statistics saved in the results folder for each dataset and classifier #######
TABLE <- array(
  data = NA,
  dim = c(
    length(statistics) + 4,
    length(datasets),
    length(classifiers),
    nreps
  ),
  dimnames = list(
    stat = c(statistics, "df", "logLik", "AIC", "BIC"),
    data = datasets,
    classifier = classifiers,
    rep = 1:nreps
  )
)
for (d in datasets) {
  res_path <- paste0("results/", d, "/")
  data <- read.csv(paste0("datasets/", d, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  split_path <- paste0("splits/", d, "/")
  for (r in 1:nreps) {
    id_test <- readRDS(paste0(split_path, r, "_id_test.rds"))
    true <- data[id_test, 1]
    for (c_name in classifiers) {
      filename <- paste0(res_path, c_name, "_", r, ".rds")
      if (file.exists(filename)) {
        res <- readRDS(filename)
        for (stat in statistics) {
          stat_fun <- get(stat)
          TABLE[stat, d, c_name, r] <- stat_fun(res, true)
        }
        TABLE[attr(TABLE, "dimnames")[[1]][-c(1:length(statistics))],
              d, c_name, r] <- c(res$df, res$loglik, res$aic, res$bic)
      }
    }
  }
}

####### compute averages over the 10 reps for each statistic and classifier on each dataset #######
AVG <- apply(TABLE, c(1, 2, 3), mean, na.rm = TRUE)
saveRDS(AVG, "AVG.rds")

####### create the paper's tables from the averaged results #######
table1 <- matrix(NA, nrow = length(datasets), ncol = 6)
count <- 0
for (nam in datasets) {
  count <- count + 1
  out <- out2 <- 1
  zeros <- 0
  data <- read.csv(paste0("datasets/", nam, ".csv"), colClasses = "factor")
  data <- data[, c(NCOL(data), c(1:(NCOL(data) - 1)))]
  m0 <- full(data)
  edges <- length(m0$tree[[1]])
  for (i in 1:NCOL(data)) {
    out <- out * length(levels(data[, i]))
    if (i != NCOL(data)) {
      zeros <- zeros + sum(m0$stages[[i]] == "UNOBSERVED")
      out2 <- out2 + length(m0$stages[[i]])
      edges <- edges + length(m0$stages[[i]]) * length(m0$tree[[i + 1]])
    }
  }
  table1[count, ] <- c(NROW(data), NCOL(data), out, out2, zeros, edges)
}
rownames(table1) <- datasets
colnames(table1) <- c("# observations", "# variables", "# root-to-leaf paths",
                      "# non-leaf nodes", "# 0 cells", "# edges")
write.csv(table1, file = "Tables/table1.csv")

name_classifiers <- c("Indep", "Full", "HC - Indep", "HC - Full", "BHC",
                      "Fast BHC", "Random BHC", "Kullback-Leibler - 0.01",
                      "Kullback-Leibler - 0.05", "Kullback-Leibler - 0.20",
                      "Refined BN", "HClust k = 2", "Bnlearn Hill-Climbing",
                      "Bnlearn Tabu", "Logistic Model",
                      "Naive Bayes Classifier", "Neural Network",
                      "Classification Tree", "Random Forest")
table4 <- t(AVG[c(3:6, 2, 1), 9, 1:12])
attr(table4, "dimnames") <- list(Model = name_classifiers[1:12],
                                 c("df", "logLik", "AIC", "BIC", "Accuracy",
                                   "Computational Time"))
write.csv(table4, file = "Tables/table4.csv")

table5 <- round(t(AVG[2, , c(5, 13:19)]), 4)
attr(table5, "dimnames") <- list(Model = c("Stagedtrees BHC",
                                           name_classifiers[13:19]),
                                 datasets)
write.csv(table5, file = "Tables/table5.csv")
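## Optional inspection (not required for the replication): a quick look at the
## averaged test-set accuracy of a single classifier across all datasets,
## here the BHC staged tree; "accuracy" and "st_bhc" are the statistic and
## classifier names defined above.
round(AVG["accuracy", , "st_bhc"], 4)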
###############################################
# SECTION 6 #
###############################################
data("PhDArticles", package = "stagedtrees")
str(PhDArticles)

bn <- bnlearn::hc(PhDArticles)
pdf("Figures/figure10-left.pdf")
plot(bn)
dev.off()
order <- c("Gender", "Kids", "Married", "Articles")
bn.as.tree <- as_sevt(bn.fit(bn, data = PhDArticles), order = order)
pdf("Figures/figure10-right.pdf")
plot(bn.as.tree)
dev.off()

phd.mod1 <- PhDArticles |> indep(order = order) |> stages_hc()
phd.mod2 <- PhDArticles |> full(order = order) |> stages_hc()
phd.mod1 <- stndnaming(phd.mod1)
pdf("Figures/figure11-left.pdf")
plot(phd.mod1, cex_label_nodes = 2, cex_nodes = 0, col = "stages")
dev.off()
phd.mod2 <- stndnaming(phd.mod2)
pdf("Figures/figure11-right.pdf")
plot(phd.mod2, cex_label_nodes = 2, cex_nodes = 0, col = "stages")
dev.off()

pdf("Figures/figure12-left.pdf")
compare_stages(phd.mod1, phd.mod2, plot = TRUE, method = "stages")
dev.off()
pdf("Figures/figure12-right.pdf")
barplot(phd.mod2, "Articles", legend.text = TRUE, xlab = "Articles")
dev.off()

lr_test(phd.mod1, phd.mod2)

order <- c("Prestige", "Mentor", order)
phd.all <- PhDArticles |> full(order = order) |> stages_bj(thr = 0.5) |>
  stndnaming()
pdf("Figures/figure13-left.pdf")
plot(phd.all, col = "stages")
dev.off()
pdf("Figures/figure13-right.pdf")
barplot(phd.all, "Articles", legend.text = TRUE, beside = TRUE,
        col = "stages", xlab = "Articles")
dev.off()

get_path(phd.all, "Articles", "3")
phd.order <- search_best(PhDArticles, alg = stages_bhc)
phd.order
BIC(phd.all, phd.order)