--- title: "Efficiency Relative to Storage and Time" author: "Thierry Onkelinx" output: rmarkdown::html_vignette: fig_caption: yes vignette: > %\VignetteIndexEntry{Efficiency Relative to Storage and Time} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} %\VignetteDepends{git2r} %\VignetteDepends{microbenchmark} %\VignetteDepends{ggplot2} --- ```{r setup, include = FALSE} library(knitr) opts_chunk$set( fig.height = 4, fig.width = 6, collapse = TRUE, comment = "#>" ) library(ggplot2) inbo_colours <- c("#959B38", "#729BB7", "#E87837", "#BDDDD7", "#E4E517", "#843860", "#C04384", "#C2C444", "#685457") theme_inbo <- function(base_size = 12, base_family = "") { rect_bg <- "white" legend_bg <- "white" panel_bg <- "#F3F3F3" panel_grid <- "white" plot_bg <- "white" half_line <- base_size / 2 ggplot2::theme( line = ggplot2::element_line(colour = "black", size = 0.5, linetype = 1, lineend = "butt"), rect = ggplot2::element_rect(fill = rect_bg, colour = "black", size = 0.5, linetype = 1), text = ggplot2::element_text(family = base_family, face = "plain", colour = "#843860", size = base_size, hjust = 0.5, vjust = 0.5, angle = 0, lineheight = 0.9, margin = ggplot2::margin(), debug = FALSE), axis.line = ggplot2::element_blank(), axis.line.x = ggplot2::element_blank(), axis.line.y = ggplot2::element_blank(), axis.text = ggplot2::element_text(size = ggplot2::rel(0.8)), axis.text.x = ggplot2::element_text( margin = ggplot2::margin(t = 0.8 * half_line / 2), vjust = 1 ), axis.text.x.top = NULL, axis.text.y = ggplot2::element_text( margin = ggplot2::margin(r = 0.8 * half_line / 2), hjust = 1 ), axis.text.y.right = NULL, axis.ticks = ggplot2::element_line(), axis.ticks.length = ggplot2::unit(0.15, "cm"), axis.title = ggplot2::element_text(colour = "black"), axis.title.x = ggplot2::element_text( margin = ggplot2::margin(t = 0.8 * half_line, b = 0.8 * half_line / 2) ), axis.title.x.top = NULL, axis.title.y = ggplot2::element_text( margin = ggplot2::margin(r = 0.8 * half_line, l = 0.8 * half_line / 2), angle = 90 ), axis.title.y.right = NULL, legend.background = ggplot2::element_rect(colour = NA, fill = legend_bg), legend.key = ggplot2::element_rect(fill = panel_bg, colour = NA), legend.key.size = ggplot2::unit(1.2, "lines"), legend.key.height = NULL, legend.key.width = NULL, legend.margin = NULL, legend.spacing = ggplot2::unit(0.2, "cm"), legend.spacing.x = NULL, legend.spacing.y = NULL, legend.text = ggplot2::element_text(size = ggplot2::rel(0.8)), legend.text.align = NULL, legend.title = ggplot2::element_text( size = ggplot2::rel(0.8), face = "bold", hjust = 0, colour = "black" ), legend.title.align = NULL, legend.position = "right", legend.direction = NULL, legend.justification = "center", legend.box = NULL, legend.box.margin = ggplot2::margin( t = half_line, r = half_line, b = half_line, l = half_line ), legend.box.background = ggplot2::element_rect( colour = NA, fill = legend_bg ), legend.box.spacing = ggplot2::unit(0.2, "cm"), panel.background = ggplot2::element_rect(fill = panel_bg, colour = NA), panel.border = ggplot2::element_blank(), panel.grid = ggplot2::element_line(colour = panel_grid), panel.grid.minor = ggplot2::element_line(colour = panel_grid, size = 0.25), panel.spacing = ggplot2::unit(half_line, "pt"), panel.spacing.x = NULL, panel.spacing.y = NULL, panel.ontop = FALSE, strip.background = ggplot2::element_rect(fill = "#8E9DA7", colour = NA), strip.text = ggplot2::element_text( size = ggplot2::rel(0.8), colour = "#F3F3F3" ), strip.text.x = ggplot2::element_text( margin = ggplot2::margin(t = half_line, b = half_line) ), strip.text.y = ggplot2::element_text( margin = ggplot2::margin(r = half_line, l = half_line), angle = -90 ), strip.switch.pad.grid = ggplot2::unit(0.1, "cm"), strip.switch.pad.wrap = ggplot2::unit(0.1, "cm"), strip.placement = "outside", plot.background = ggplot2::element_rect(colour = NA, fill = plot_bg), plot.title = ggplot2::element_text( size = ggplot2::rel(1.2), margin = ggplot2::margin(0, 0, half_line, 0) ), plot.subtitle = ggplot2::element_text( size = ggplot2::rel(1), margin = ggplot2::margin(0, 0, half_line, 0) ), plot.caption = ggplot2::element_text( size = ggplot2::rel(0.6), margin = ggplot2::margin(0, 0, half_line, 0) ), plot.margin = ggplot2::margin( t = half_line, r = half_line, b = half_line, l = half_line ), plot.tag = ggplot2::element_text( size = ggplot2::rel(1.2), hjust = 0.5, vjust = 0.5 ), plot.tag.position = "topleft", complete = TRUE ) } theme_set(theme_inbo()) update_geom_defaults("line", list(colour = "#356196")) update_geom_defaults("hline", list(colour = "#356196")) update_geom_defaults("boxplot", list(colour = "#356196")) ``` ## Introduction This vignette compares storage and retrieval of data by `git2rdata` with other standard R functionality. We consider `write.table()` and `read.table()` for data stored in a plain text format. `saveRDS()` and `readRDS()` use a compressed binary format. To get some meaningful results, we will use the `nassCDS` dataset from the [DAAG](https://www.rdocumentation.org/packages/DAAG/versions/1.22/topics/nassCDS) package. We'll avoid the dependency on the package by directly downloading the data. ```{r download_data, eval = system.file("efficiency", "airbag.rds", package = "git2rdata") == ""} airbag <- read.csv( "https://vincentarelbundock.github.io/Rdatasets/csv/DAAG/nassCDS.csv" ) airbag$dead <- airbag$dead == "dead" airbag$airbag <- airbag$airbag == "airbag" airbag$seatbelt <- airbag$seatbelt == "belted" airbag$dvcat <- as.ordered(airbag$dvcat) ``` ```{r load_data, echo = FALSE} if (system.file("efficiency", "airbag.rds", package = "git2rdata") == "") { saveRDS(airbag, file.path("..", "inst", "efficiency", "airbag.rds")) } else { airbag <- readRDS( system.file("efficiency", "airbag.rds", package = "git2rdata") ) } ``` ```{r data_structure} str(airbag) ``` ## Data Storage ### On a File System We start by writing the dataset as is with `write.table()`, `saveRDS()`, `write_vc()` and `write_vc()` without storage optimization. Note that `write_vc()` uses optimization by default. Since `write_vc()` creates two files for each data set, we take their combined file size into account. ```{r set_tmp_dir} library(git2rdata) root <- tempfile("git2rdata-efficient") dir.create(root) ``` ```{r file_size} write.table(airbag, file.path(root, "base_R.tsv"), sep = "\t") base_size <- file.size(file.path(root, "base_R.tsv")) saveRDS(airbag, file.path(root, "base_R.rds")) rds_size <- file.size(file.path(root, "base_R.rds")) fn <- write_vc(airbag, "airbag_optimize", root, sorting = "X") optim_size <- sum(file.size(file.path(root, fn))) fn <- write_vc(airbag, "airbag_verbose", root, sorting = "X", optimize = FALSE) verbose_size <- sum(file.size(file.path(root, fn))) ``` Since the data is highly compressible, `saveRDS()` yields the smallest file at the cost of having a binary file format. Both `write_vc()` formats yield smaller files than `write.table()`. Partly because `write_vc()` doesn't store row names and doesn't use quotes unless needed. The difference between the optimized and verbose version of `write_vc()` is, in this case, solely due to the way `write_vc()` stores factors in the data (`tsv`) file. The optimized version stores the indices of the factor whereas the verbose version stores the levels. For example: `airbag$dvcat` has 5 levels with short labels (on average 5 character), storing the index requires 1 character. This results in more compact files. ```{r table_file_size, echo = FALSE} kable( data.frame( method = c("saveRDS()", "write_vc(), optimized", "write_vc(), verbose", "write.table()"), file_size = c(rds_size, optim_size, verbose_size, base_size) / 2 ^ 10, relative = c(rds_size, optim_size, verbose_size, base_size) / base_size ), caption = "Resulting file sizes (in kB) and file sizes relative to the size of write.table().", digits = 2 ) ``` The reduction in file size when storing in factors depends on the length of the labels, the number of levels and the number of observations. The figure below illustrates the strong gain as soon as the level labels contain more than two characters. The gain is less pronounced when the factor has a large number of levels. The optimization fails in extreme cases with short factor labels and a high number of levels. ```{r factor_label_length, echo = FALSE, fig.cap = "Effect of the label length on the efficiency of storing factor optimized, assuming 1000 observations", warning = FALSE} ratio <- function(label_length = 1:20, n_levels = 9, n_obs = 1000) { meta_length <- 63 + (5 + label_length + floor(log10(n_levels))) * n_levels optimized <- n_obs * mean(ceiling(log10(seq_len(n_levels) + 1))) verbose <- n_obs * label_length ifelse( 62 ^ label_length >= n_levels, (optimized + meta_length) / (verbose + meta_length), NA ) } lengths <- 1:50 f_ratio <- rbind( data.frame( label_length = lengths, levels = 1000, ratio = ratio(lengths, 1000, 1000) ), data.frame( label_length = lengths, levels = 100, ratio = ratio(lengths, 100, 1000) ), data.frame( label_length = lengths, levels = 10, ratio = ratio(lengths, 10, 1000) ), data.frame( label_length = lengths, levels = 3, ratio = ratio(lengths, 3, 1000) ) ) f_ratio$levels <- factor(f_ratio$levels, levels = c(1000, 100, 10, 3)) ggplot(f_ratio, aes(x = label_length, y = ratio, colour = levels)) + geom_hline(yintercept = 1, linetype = 2) + geom_line() + scale_x_continuous("label length (characters)") + scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "), breaks = seq(0, 1.25, by = 0.25)) + scale_colour_manual("number of \nlevels", values = inbo_colours) ``` The effect of the number of observations is mainly due to the overhead of storing the metadata. The importance of this overhead increases when the number of observations is small. ```{r factor_observations, echo = FALSE, fig.cap = "Effect of the number of observations on the efficiency of storing factor optimized assuming labels with 10 characters"} n_obs <- 10 ^ seq(log10(1), log10(10000), length = 41) f_ratio <- rbind( data.frame( observations = n_obs, levels = 3, ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 3) ), data.frame( observations = n_obs, levels = 10, ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 10) ), data.frame( observations = n_obs, levels = 100, ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 100) ), data.frame( observations = n_obs, levels = 1000, ratio = sapply(n_obs, ratio, label_length = 10, n_levels = 1000) ) ) f_ratio$levels <- factor(f_ratio$levels, levels = c(1000, 100, 10, 3)) ggplot(f_ratio, aes(x = observations, y = ratio, colour = levels)) + geom_hline(yintercept = 1, linetype = 2) + geom_line() + scale_x_log10() + scale_y_continuous(paste("optimized bytes", "verbose bytes", sep = " / "), breaks = seq(0, 1.25, by = 0.25)) + scale_colour_manual("number of \nlevels", values = inbo_colours) ``` ### In Git Repositories Here we will simulate how much space the data requires to store the history in a git repository. We will create a git repository for each method and store different subsets of the same data. Each commit contains a new version of the data. Each version is a random sample containing 90% of the observations of the `airbag` data. Two consecutive versions of the subset will have about 90% of the observations in common. After writing each version, we commit the file, perform garbage collection (`git gc`) on the git repository and then calculate the size of the git history (`git count-objects -v`). ```{r git_size, eval = system.file("efficiency", "git_size.rds", package = "git2rdata") == ""} library(git2r) tmp_repo <- function() { root <- tempfile("git2rdata-efficient-git") dir.create(root) repo <- git2r::init(root) git2r::config(repo, user.name = "me", user.email = "me@me.com") return(repo) } commit_and_size <- function(repo, filename) { add(repo, filename) commit(repo, "test", session = TRUE) git_size <- system( sprintf("cd %s\ngit gc\ngit count-objects -v", dirname(repo$path)), intern = TRUE ) git_size <- git_size[grep("size-pack", git_size)] as.integer(gsub(".*: (.*)", "\\1", git_size)) } repo_wt <- tmp_repo() repo_wts <- tmp_repo() repo_rds <- tmp_repo() repo_wvco <- tmp_repo() repo_wvcv <- tmp_repo() repo_size <- replicate( 100, { observed_subset <- rbinom(nrow(airbag), size = 1, prob = 0.9) == 1 this <- airbag[ sample(which(observed_subset)), sample(ncol(airbag)) ] this_sorted <- airbag[observed_subset, ] fn_wt <- file.path(workdir(repo_wt), "base_R.tsv") write.table(this, fn_wt, sep = "\t") fn_wts <- file.path(workdir(repo_wts), "base_R.tsv") write.table(this_sorted, fn_wts, sep = "\t") fn_rds <- file.path(workdir(repo_rds), "base_R.rds") saveRDS(this, fn_rds) fn_wvco <- write_vc(this, "airbag_optimize", repo_wvco, sorting = "X") fn_wvcv <- write_vc( this, "airbag_verbose", repo_wvcv, sorting = "X", optimize = FALSE ) c( write.table = commit_and_size(repo_wt, fn_wt), write.table.sorted = commit_and_size(repo_wts, fn_wts), saveRDS = commit_and_size(repo_rds, fn_rds), write_vc.optimized = commit_and_size(repo_wvco, fn_wvco), write_vc.verbose = commit_and_size(repo_wvcv, fn_wvcv) ) }) ``` ```{r store_git_size, echo = FALSE} if (system.file("efficiency", "git_size.rds", package = "git2rdata") == "") { saveRDS(repo_size, file.path("..", "inst", "efficiency", "git_size.rds")) } else { repo_size <- readRDS( system.file("efficiency", "git_size.rds", package = "git2rdata") ) } ``` Each version of the data has on purpose a random order of observations and variables. This is what would happen in a worst case scenario as it would generate the largest possible diff. We also test `write.table()` with a stable ordering of the observations and variables. The randomised `write.table()` yields the largest git repository, converging to about `r sprintf("%.1f", repo_size["write.table", 100] / repo_size["write.table.sorted", 100])` times the size of a git repository based on the sorted `write.table()`. `saveRDS()` yields a `r sprintf("%.0f%%", 100 - 100 * repo_size["saveRDS", 100] / repo_size["write.table", 100])` reduction in repository size compared to the randomised `write.table()`, but still is `r sprintf("%.1f", repo_size["saveRDS", 100] / repo_size["write.table.sorted", 100])` times larger than the sorted `write.table()`. Note that the gain of storing binary files in a git repository is much smaller than the gain in individual file size because git compresses its history. The optimized `write_vc()` starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 1] / repo_size["write.table.sorted", 1])` and converges toward `r sprintf("%.0f%%", 100 * repo_size["write_vc.optimized", 100] / repo_size["write.table.sorted", 100])`, the verbose version starts at `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 1] / repo_size["write.table.sorted", 1])` and converges towards `r sprintf("%.0f%%", 100 * repo_size["write_vc.verbose", 100] / repo_size["write.table.sorted", 100])`. Storage size is a lot smaller when using `write_vc()` with optimization. The verbose option of `write_vc()` has little the gain in storage size. Another advantage is that `write_vc()` stores metadata. ```{r plot_git_size, echo = FALSE, fig.cap = "Size of the git history using the different storage methods."} rs <- lapply( rownames(repo_size), function(x) { if (x == "saveRDS") { fun <- "saveRDS" optimized <- "yes" } else if (x == "write_vc.optimized") { fun <- "write_vc" optimized <- "yes" } else if (x == "write_vc.verbose") { fun <- "write_vc" optimized <- "no" } else if (x == "write.table") { fun <- "write.table" optimized <- "no" } else if (x == "write.table.sorted") { fun <- "write.table" optimized <- "yes" } data.frame(commit = seq_along(repo_size[x, ]), size = repo_size[x, ], rel_size = repo_size[x, ] / repo_size["write.table.sorted", ], fun = fun, optimized = optimized, stringsAsFactors = FALSE) } ) rs <- do.call(rbind, rs) rs$optimized <- factor(rs$optimized, levels = c("yes", "no")) ggplot( rs, aes(x = commit, y = size / 2^10, colour = fun, linetype = optimized) ) + geom_line() + scale_y_continuous("repo size (in MiB)") + scale_colour_manual("function", values = inbo_colours) ``` ```{r plot_rel_git_size, echo = FALSE, fig.cap = "Relative size of the git repository when compared to write.table()."} ggplot(rs, aes(x = commit, y = rel_size, colour = fun, linetype = optimized)) + geom_line() + scale_y_continuous("size relative to sorted write.table()", breaks = 0:10) + scale_colour_manual("function", values = inbo_colours) ``` ## Timings The code below runs a microbenchmark on the four methods. A microbenchmark runs the code a hundred times and yields a distribution of timings for each expression. ### Writing Data ```{r get_file_timings, eval = system.file("efficiency", "file_timings.rds", package = "git2rdata") == ""} library(microbenchmark) mb <- microbenchmark( write.table = write.table(airbag, file.path(root, "base_R.tsv"), sep = "\t"), saveRDS = saveRDS(airbag, file.path(root, "base_R.rds")), write_vc.optim = write_vc(airbag, "airbag_optimize", root, sorting = "X"), write_vc.verbose = write_vc(airbag, "airbag_verbose", root, sorting = "X", optimize = FALSE) ) mb$time <- mb$time / 1e6 ``` ```{r store_file_timings, echo = FALSE} if ( system.file("efficiency", "file_timings.rds", package = "git2rdata") == "" ) { saveRDS(mb, file.path("..", "inst", "efficiency", "file_timings.rds")) } else { mb <- readRDS( system.file("efficiency", "file_timings.rds", package = "git2rdata") ) } ``` ```{r median_write, echo = FALSE} median_time <- aggregate(time ~ expr, data = mb, FUN = median) write_ratio <- 100 * median_time$time / median_time$time[median_time$expr == "write.table"] names(write_ratio) <- median_time$expr ``` `write_vc()` takes `r paste(sprintf("%.0f%%", -100 + write_ratio[grep("write_vc", names(write_ratio))]), collapse = " to ")` more time than `write.table()` because it needs to prepare the metadata and sort the observations and variables. When overwriting existing data, `write_vc()` checks the new data against the existing metadata. `saveRDS()` requires `r sprintf("%.0f%%", write_ratio["saveRDS"])` of the time that `write.table()` needs. ```{r plot_file_timings, echo = FALSE, fig.cap = "Boxplot of the write timings for the different methods."} mb$expr <- reorder(mb$expr, mb$time, FUN = median) levels(mb$expr) <- gsub("write_vc\\.", "write_vc\n", levels(mb$expr)) ggplot(mb, aes(x = expr, y = time)) + geom_boxplot() + scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) + theme(axis.title.x = ggplot2::element_blank()) ``` ### Reading Data ```{r get_read_timings, eval = system.file("efficiency", "read_timings.rds", package = "git2rdata") == ""} mb <- microbenchmark( read.table = read.table(file.path(root, "base_R.tsv"), header = TRUE, sep = "\t"), readRDS = readRDS(file.path(root, "base_R.rds")), read_vc.optim = read_vc("airbag_optimize", root), read_vc.verbose = read_vc("airbag_verbose", root) ) mb$time <- mb$time / 1e6 ``` ```{r store_read_timings, echo = FALSE} if ( system.file("efficiency", "read_timings.rds", package = "git2rdata") == "" ) { saveRDS(mb, file.path("..", "inst", "efficiency", "read_timings.rds")) } else { mb <- readRDS( system.file("efficiency", "read_timings.rds", package = "git2rdata") ) } ``` ```{r median_read, echo = FALSE} median_time <- aggregate(time ~ expr, data = mb, FUN = median) read_ratio <- 100 * median_time$time / median_time$time[median_time$expr == "read.table"] names(read_ratio) <- median_time$expr ``` The timings on reading the data is another story. Reading the binary format takes about `r sprintf("%.0f%%", read_ratio["readRDS"])` of the time needed to read the standard plain text format using `read.table()`. `read_vc()` takes about `r sprintf("%.0f%%", read_ratio["read_vc.optim"])` (optimized) and `r sprintf("%.0f%%", read_ratio["read_vc.verbose"])` (verbose) of the time needed by `read.table()`, which at first seems strange because `read_vc()` calls `read.table()` to read the files and has some extra work to convert the variables to the correct data type. The main difference is that `read_vc()` knows the required data type _a priori_ and passes this info to `read.table()`. Otherwise, `read.table()` has to guess the correct data type from the file. ```{r plot_read_timings, echo = FALSE, fig.cap = "Boxplots for the read timings for the different methods."} mb$expr <- factor( mb$expr, levels = c("readRDS", "read.table", "read_vc.optim", "read_vc.verbose") ) levels(mb$expr) <- gsub("read_vc\\.", "read_vc\n", levels(mb$expr)) ggplot(mb, aes(x = expr, y = time)) + geom_boxplot() + scale_y_continuous("Time (in milliseconds)", limits = c(0, NA)) + theme(axis.title.x = ggplot2::element_blank()) ```