## Replication script for ## ## "fairadapt: Causal Reasoning for Fair Data Pre-processing" ## ## Results presented in the manuscript were obtained using a machine ## running MacOS. Note that those parts which rely on package ranger ## may not be expected to be identical across platforms due to ## implementation of the RNG in C++. library("fairadapt") library("ggplot2") library("ggraph") library("ranger") library("microbenchmark") library("xtable") quick_build <- is_on_cran <- run_sim <- FALSE ## ---- basic----------------------------------------------------------------------- n_samp <- 500 data("uni_admission", package = "fairadapt") uni_dat <- uni_admission[seq_len(2 * n_samp), ] head(uni_dat) uni_trn <- head(uni_dat, n = n_samp) uni_tst <- tail(uni_dat, n = n_samp) uni_dim <- c( "gender", "edu", "test", "score") uni_adj <- matrix(c( 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0), ncol = length(uni_dim), dimnames = rep(list(uni_dim), 2), byrow = TRUE) set.seed(2022) basic <- fairadapt(score ~ ., train.data = uni_trn, test.data = uni_tst, adj.mat = uni_adj, prot.attr = "gender") basic ## ----fairadapt-summary------------------------------------------------------------ summary(basic) ## ----adapted-data-example--------------------------------------------------------- head(adaptedData(basic, train = FALSE)) ## ---- graph-model----------------------------------------------------------------- uni_graph <- graphModel(uni_adj) ## ---- graph-plot------------------------------------------------------------------ set.seed(11) ggraph(graphModel(uni_adj), "igraph", algorithm = "sugiyama") + geom_edge_link(arrow = arrow(length = unit(4, "mm"), angle = 15, type = "closed"), end_cap = circle(8, "mm"), color = "grey20") + geom_node_point(color = "grey80", size = 21) + geom_node_text(aes(x = x, y = y, label = name), size = 5) + theme_bw() + theme(plot.margin = margin(30, 30, 30, 30), panel.border = element_blank(), panel.background = element_blank(), axis.title = element_blank(), axis.text = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank()) + coord_cartesian(clip = "off") ## ----benchmark-quant-methods------------------------------------------------------ linebreak <- function(..., align = c("l", "c", "r"), linebreaker = "\n") { x <- c(...) ifelse( grepl(linebreaker, x, fixed = TRUE), paste0("\\makecell[", match.arg(align), "]{", gsub(linebreaker, "\\\\", x, fixed = TRUE), "}"), x ) } bm_cache <- system.file("extdata", "bm_quant.rds", package = "fairadapt") if (!run_sim && file.exists(bm_cache)) { bmark <- readRDS(bm_cache) } else { nsamps <- c(200L, 500L) bmark <- lapply( c(rangerQuants, mcqrnnQuants, linearQuants), function(quant.method) { lapply( nsamps, function(nsamp) { tim <- microbenchmark( fairadapt(score ~ ., train.data = uni_admission[seq_len(nsamp), ], test.data = uni_admission[seq.int(nsamp + 1 , 2 * nsamp), ], adj.mat = uni_adj, prot.attr = "gender", quant.method = quant.method), times = 5L )$time round(mean(tim) / 10^9, digits = 1L) } ) } ) bmark <- do.call(cbind, bmark) colnames(bmark) <- c("Random forest", "Neural network", "Linear regression") rownames(bmark) <- as.character(nsamps) saveRDS(bmark, file.path("..", "inst", "extdata", "bm_quant.rds")) } rownames(bmark) <- paste0("$T_{\\text{uni}}(", rownames(bmark), ")$") tbl <- data.frame( linebreak( "\\pkg{ranger}", "\\code{rangerQuants}", "$O(np\\log n)$", "$ntrees = 500$\n$mtry = \\sqrt{p}$" ), linebreak( "\\pkg{qrnn}", "\\code{mcqrnnQuants}", "$O(npn_{\\text{epochs}})$", "1 hidden layer\nfully connected\nfeed-forward\nnetwork" ), linebreak( "\\pkg{quantreg}", "\\code{linearQuants}", "$O(p^2n)$", "\\code{\"br\"} method of\nBarrodale and\nRoberts used for\nfitting" ) ) colnames(tbl) <- colnames(bmark) rownames(tbl) <- linebreak( "\\proglang{R} package", "\\texttt{quant.method}", "Complexity", "Default\nparameters" ) tbl <- rbind(tbl, bmark) capt <- paste( "Summary table of different quantile regression methods. $n$ is the number", "of samples, $p$ number of covariates, $n_{\\text{epochs}}$ number of", "training epochs for the neural network. $T_{\\text{uni}}(n)$ denotes the", "runtime of different methods on the university admission dataset, with $n$", "training and $n$ testing samples. The runtimes were obtained on a system", "with Intel Core i7-8750H CPU @ 2.2GHz running MacOS Big Sur 11.6.", "The version of \\proglang{R} was 4.2.0 \"Vigorous Calisthenics\" with \\pkg{quantreg}", "version 5.93, \\pkg{ranger} version 0.13.1, and \\pkg{mcqrnn} version 2.0.5." ) print( xtable(tbl, caption = capt, label = "tab:qmethods", align = rep("l", ncol(tbl) + 1L)), booktabs = TRUE, table.placement = "t", sanitize.text.function = identity, comment = FALSE ) ## ---- quantFit-------------------------------------------------------------------- set.seed(22) fit_qual <- fairadapt(score ~ ., train.data = uni_trn, adj.mat = uni_adj, prot.attr = "gender", eval.qfit = 3L) quantFit(fit_qual) ## ---- fairtwin-uni---------------------------------------------------------------- ft_basic <- fairTwins(basic, train.id = seq_len(n_samp)) head(ft_basic, n = 3) ## ---- compas---------------------------------------------------------------------- data("compas", package = "fairadapt") cmp_mat <- matrix(0, nrow = ncol(compas), ncol = ncol(compas), dimnames = list(names(compas), names(compas))) cmp_mat[c("race", "sex", "age"), c("juv_fel_count", "juv_misd_count", "juv_other_count", "priors_count", "c_charge_degree", "two_year_recid")] <- 1 cmp_mat[c("juv_fel_count", "juv_misd_count", "juv_other_count"), c("priors_count", "c_charge_degree", "two_year_recid")] <- 1 cmp_mat["priors_count", c("c_charge_degree", "two_year_recid")] <- 1 cmp_mat["c_charge_degree", "two_year_recid"] <- 1 head(compas) ## ---- compas-boot-slow------------------------------------------------------------ cmp_trn <- tail(compas, n = 6000L) cmp_tst <- head(compas, n = 1214L) n_itr <- 50L set.seed(2022) fa_boot_fin <- fairadaptBoot(two_year_recid ~ ., "race", cmp_mat, cmp_trn, cmp_tst, rand.mode = "finsamp", n.boot = n_itr) ## ---- compas-boot-quant----------------------------------------------------------- set.seed(2022) fa_boot_quant <- fairadaptBoot(two_year_recid ~ ., "race", cmp_mat, cmp_trn, cmp_tst, rand.mode = "quant", n.boot = n_itr) ## ---- compas-forest--------------------------------------------------------------- fit_rf <- function(x) { ranger(factor(two_year_recid) ~ ., cmp_trn[x, ], probability = TRUE) } extract_pred <- function(x) x$predictions[, 2L] set.seed(2022) cmp_rf <- lapply(fa_boot_fin$boot.ind, fit_rf) pred_fin <- Map(predict, cmp_rf, adaptedData(fa_boot_fin, train = FALSE)) pred_fin <- do.call(cbind, lapply(pred_fin, extract_pred)) pred_quant <- Map(predict, cmp_rf, adaptedData(fa_boot_quant, train = FALSE)) pred_quant <- do.call(cbind, lapply(pred_quant, extract_pred)) ## ---- decision-maker-1------------------------------------------------------------ jac_frm <- function(x, modes = "single") { jac <- function(a, b) { intersection <- length(intersect(a, b)) union <- length(a) + length(b) - intersection intersection / union } res <- lapply( seq(quantile(x[, 1L], 0.05), quantile(x[, 1L], 0.95), 0.01), function(tsh) { ret <- replicate(100L, { col <- sample(ncol(x), 2L) jac(which(x[, col[1L]] > tsh), which(x[, col[2L]] > tsh)) }) data.frame(tsh = tsh, y = mean(ret), sd = sd(ret), mode = modes) } ) do.call(rbind, res) } jac_df <- rbind(jac_frm(pred_fin, "Finite Sample"), jac_frm(pred_quant, "Quantiles")) ## ---- decision-maker-2------------------------------------------------------------ ord_ind <- function(x, modes = "single") { res <- replicate(5000L, { row <- sample(nrow(x), 2) ord <- mean(x[row[1], ] > x[row[2], ]) ord^2 + (1 - ord)^2 }) data.frame(res = res, mode = modes) } ord_df <- rbind(ord_ind(pred_fin, "Finite Sample"), ord_ind(pred_quant, "Quantiles")) ## ---- decision-maker-3------------------------------------------------------------ inv_frm <- function(x, modes = "single") { gt <- function(x) x[1L] > x[2L] res <- replicate(100L, { col <- sample(ncol(x), 2L) prm <- order(x[, col[2L]][order(x[, col[1L]])]) sum(combn(prm, 2L, gt)) / choose(length(prm), 2L) }) data.frame(res = res, mode = modes) } inv_df <- rbind(inv_frm(pred_fin, "Finite Sample"), inv_frm(pred_quant, "Quantiles")) ## ---- decision-maker-4------------------------------------------------------------ prb_frm <- function(x, modes = "single") { qnt <- apply(x, 1L, quantile, probs = c(0.05, 0.95)) data.frame(width = qnt[2L, ] - qnt[1L, ], mode = modes) } prb_df <- rbind(prb_frm(pred_fin, "Finite Sample"), prb_frm(pred_quant, "Quantiles")) ## ---- compas-dm------------------------------------------------------------------- p1 <- ggplot(jac_df, aes(x = tsh, y = y, color = mode)) + geom_point() + geom_line() + geom_ribbon(aes(ymin = y - sd, ymax = y + sd, fill = mode, color = mode), alpha = 0.3) + theme_bw() + xlab("Decision threshold") + ylab("Jaccard similarity") + theme( legend.position = "inside", legend.position.inside = c(0.3, 0.3), legend.box.background = element_rect() ) + scale_color_discrete(labels = c(" \"finsamp\" ", " \"quant\" "), name = "rand.mode") + scale_fill_discrete(labels = c(" \"finsamp\" ", " \"quant\" "), name = "rand.mode") p2 <- ggplot(ord_df, aes(x = res, color = mode)) + stat_ecdf() + theme_bw() + xlab("Probability of preserving ordering") + ylab("Cumulative proportion") + theme( legend.position = "inside", legend.position.inside = c(0.3, 0.7), legend.box.background = element_rect() ) + scale_color_discrete(labels = c(" \"finsamp\" ", " \"quant\" "), name = "rand.mode") p3 <- ggplot(inv_df, aes(x = res, fill = mode)) + geom_density(alpha = 0.3) + xlim(0, 0.25) + theme_bw() + xlab("Normalized inversion number") + ylab("Density") + theme( legend.position = "inside", legend.position.inside = c(0.3, 0.7), legend.box.background = element_rect() ) + scale_fill_discrete(labels = c(" \"finsamp\" ", " \"quant\" "), name = "rand.mode") p4 <- ggplot(prb_df, aes(x = width, color = mode)) + stat_ecdf() + theme_bw() + xlab("95% CI width") + ylab("Cumulative probability") + theme( legend.position = "inside", legend.position.inside = c(0.3, 0.7), legend.box.background = element_rect() ) + scale_x_reverse() + scale_color_discrete(labels = c(" \"finsamp\" ", " \"quant\" "), name = "rand.mode") cowplot::plot_grid( p1, p2, p3, p4, ncol = 2L, labels = LETTERS[1:4] ) ## ---- indiv-uncq------------------------------------------------------------------ ind_prb <- data.frame( prob = as.vector(t(pred_quant[seq_len(3), ])), individual = rep(c(1, 2, 3), each = fa_boot_quant$n.boot) ) ## ---- compas-indiv---------------------------------------------------------------- ggplot(ind_prb, aes(x = prob, group = individual, fill = factor(individual))) + geom_density(alpha = 0.2) + scale_fill_discrete(labels = paste0("#", seq_len(3)), name = "Individual") + xlab("Probability Estimate") + ylab("Density") + theme_bw() + xlim(0, 1) + theme( legend.position = "inside", legend.position.inside = c(0.7, 0.75), legend.box.background = element_rect() ) ## ---- load-census----------------------------------------------------------------- data("gov_census", package = "fairadapt") dem <- c("age", "race", "hispanic_origin", "citizenship", "nativity", "economic_region") fam <- c("marital", "family_size", "children") edu <- c("education_level", "english_level") occ <- c("hours_worked", "weeks_worked", "occupation", "industry") prt <- "sex" res <- "salary" ## ---- census-adj------------------------------------------------------------------ cols <- c(dem, fam, edu, occ, prt, res) gov_adj <- matrix(0, nrow = length(cols), ncol = length(cols), dimnames = rep(list(cols), 2)) gov_cfd <- gov_adj gov_adj[dem, c(fam, edu, occ, res)] <- 1 gov_adj[fam, c( edu, occ, res)] <- 1 gov_adj[edu, c( occ, res)] <- 1 gov_adj[occ, res ] <- 1 gov_adj[prt, c(fam, edu, occ, res)] <- 1 gov_cfd[prt, dem] <- 1 gov_cfd[dem, prt] <- 1 gov_grph <- graphModel(gov_adj, gov_cfd) ## ---- census-graph---------------------------------------------------------------- set.seed(11) gov_tmp <- graphModel( `dimnames<-`(gov_adj, lapply(dimnames(gov_adj), substr, 1L, 2L)), `dimnames<-`(gov_cfd, lapply(dimnames(gov_cfd), substr, 1L, 2L)) ) grp <- list(dem, fam, edu, occ, prt, res) grp <- `names<-`( rep(c("demographic", "familial", "educational", "occupational", "protected", "response"), lengths(grp)), substr(unlist(grp), 1L, 2L) ) igraph::V(gov_tmp)$color <- grp[names(igraph::V(gov_tmp))] gov_tmp <- igraph::delete_edges( gov_tmp, which(igraph::E(gov_tmp)$lty == "blank") ) lty_selector <- function(lty) { function(layout) { get_all <- get_edges() edges <- get_all(layout) res <- edges[edges$lty == lty, ] res } } ggraph(gov_tmp, "igraph", algorithm = "fr") + geom_edge_arc(data = lty_selector("dashed"), arrow = arrow(length = unit(4, "mm"), angle = 15, ends = "both", type = "closed"), start_cap = circle(6.5, "mm"), end_cap = circle(6.5, "mm"), strength = 0.25, color = "grey20", linetype = "dashed") + geom_edge_link(data = lty_selector("solid"), arrow = arrow(length = unit(4, "mm"), angle = 15, type = "closed"), end_cap = circle(6.5, "mm"), color = "grey20", linetype = "solid") + geom_node_point(aes(color = color), size = 15) + geom_node_text(aes(x = x, y = y, label = name), size = 5) + scale_color_discrete(name = "Grouping") + theme_bw() + theme(plot.margin = margin(30, 30, 30, 30), legend.position = "bottom", panel.background = element_blank(), axis.title = element_blank(), axis.text = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank(), panel.border = element_blank()) + coord_cartesian(clip = "off") ## ---- log-sub-slow---------------------------------------------------------------- n_samp <- 30000 n_pred <- 5 ## ---- census-adapt---------------------------------------------------------------- gov_census$salary <- log(gov_census$salary) gov_trn <- head(gov_census, n = n_samp) gov_prd <- tail(gov_census, n = n_pred) set.seed(22) gov_ada <- fairadapt(salary ~ ., train.data = gov_trn, adj.mat = gov_adj, cfd.mat = gov_cfd, prot.attr = prt) ## ---- census-vis------------------------------------------------------------------ p1 <- ggplot(gov_trn, aes(x = salary, fill = sex)) + geom_density(alpha = 0.4) + theme_bw() p2 <- autoplot(gov_ada, when = "after") + theme_bw() + ggtitle(NULL) legend_join <- cowplot::get_plot_component( p1 + guides(color = guide_legend(nrow = 1)) + theme(legend.position = "bottom"), "guide-box-bottom", return_all = TRUE ) panels <- cowplot::plot_grid( p1 + theme(legend.position = "none"), p2 + theme(legend.position = "none"), labels = LETTERS[1:2] ) cowplot::plot_grid(panels, legend_join, ncol = 1, rel_heights = c(1, .08)) ## ---- census-predict-------------------------------------------------------------- set.seed(2022) gov_prd_ada <- predict(gov_ada, newdata = gov_prd) gov_prd_ada[, c("sex", "age", "education_level", "salary")] ## ---- census-twins---------------------------------------------------------------- fairTwins(gov_ada, train.id = 1:5, cols = c("sex", "age", "salary")) ## ---- res-uni--------------------------------------------------------------------- res_basic <- fairadapt(score ~ ., train.data = uni_trn, test.data = uni_tst, adj.mat = uni_adj, prot.attr = "gender", res.vars = "test") summary(res_basic) ## ---- res-assign------------------------------------------------------------------ uni_res <- graphModel(uni_adj, res.vars = "test") ## ---- res-graph------------------------------------------------------------------- set.seed(11) ggraph(graphModel(uni_adj, res.vars = "test"), "igraph", algorithm = "sugiyama") + geom_edge_link(arrow = arrow(length = unit(4, "mm"), angle = 15, type = "closed"), end_cap = circle(8, "mm"), color = "grey20") + geom_node_point(aes(color = color), size = 21, show.legend = FALSE) + geom_node_text(aes(x = x, y = y, label = name), size = 5) + theme_bw() + theme(plot.margin = margin(30, 30, 30, 30), panel.border = element_blank(), panel.background = element_blank(), axis.title = element_blank(), axis.text = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank()) + coord_cartesian(clip = "off") ## ---- semi-markov-uni------------------------------------------------------------- uni_cfd <- matrix(0, nrow = nrow(uni_adj), ncol = ncol(uni_adj), dimnames = dimnames(uni_adj)) uni_cfd["test", "score"] <- 1 uni_cfd["score", "test"] <- 1 semi <- fairadapt(score ~ ., train.data = uni_trn, test.data = uni_tst, adj.mat = uni_adj, cfd.mat = uni_cfd, prot.attr = "gender") ## ---- semi-graph------------------------------------------------------------------ set.seed(17) ggraph(semi$graph, "igraph", algorithm = "fr") + geom_edge_arc(data = lty_selector("dashed"), arrow = arrow(length = unit(4, "mm"), angle = 15, ends = "both", type = "closed"), start_cap = circle(8, "mm"), end_cap = circle(8, "mm"), strength = 0.25, color = "grey20", linetype = "dashed") + geom_edge_link(data = lty_selector("solid"), arrow = arrow(length = unit(4, "mm"), angle = 15, type = "closed"), end_cap = circle(8, "mm"), color = "grey20", linetype = "solid") + geom_node_point(color = "grey80", size = 21) + geom_node_text(aes(x = x, y = y, label = name), size = 5) + theme_bw() + theme(plot.margin = margin(30, 30, 30, 30), panel.border = element_blank(), panel.background = element_blank(), axis.title = element_blank(), axis.text = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), panel.grid = element_blank()) + coord_cartesian(clip = "off") ## ----top-ord---------------------------------------------------------------------- set.seed(2022) top_ord <- fairadapt(score ~ ., train.data = uni_trn, test.data = uni_tst, top.ord = c("gender", "edu", "test", "score"), prot.attr = "gender") summary(top_ord) ## ----session-info----------------------------------------------------------------- sessionInfo()