add <- function(x,y) { return(x+y) }
Can test this manually:
result <- add(1,2) result == 3 [1] TRUE
add <- function(x,y) { return(x+y+1) }
Test FAILS:
result <- add(1,2) result == 3 [1] FALSE
testthat
Packageinstall.packages("testthat")
testthat
Unit Testlibrary(testthat) test_that("add() correctly adds things", { expect_equal(add(1,2), 3) expect_equal(add(5,6), 11) } ) Test passed
test_that
Anatomytest_that("add() correctly adds things", { expect_equal(add(1,2), 3) expect_equal(add(5,6), 11) } )
add() correctly adds things
){}
written using expect_X
functions from the testthat packagetest_that
Failurestest_that("add() correctly adds things", { expect_equal(add(1,2), 3) expect_equal(add(5,6), 10) # incorrect expected value } ) -- Failure (Line 3): add() correctly adds things ------------------------------- add(5, 6) not equal to 10. 1/1 mismatches [1] 11 - 10 == 1 Error: Test failed
test_functions.R
test_file()
add <- function(x,y) { return(x+y) } testthat::test_file("test_functions.R") == Testing test_functions.R ======================================================= [ FAIL 0 | WARN 0 | SKIP 0 | PASS 2 ] Done!
testthat
tests calling functions that are not defined will fail like tests that find incorrect outputtest_that("mul() correctly multiplies things",{ expect_equal(mul(1,2), 2) }) -- Error (Line 1): new function ------------------------------------------------ Error in `mul(1, 2)`: could not find function "mul" Backtrace: 1. testthat::expect_equal(mul(1, 2), 2) 2. testthat::quasi_label(enquo(object), label, arg = "object") 3. rlang::eval_bare(expr, quo_get_env(quo)) Error: Test failed
Some use cases make running scripts in RStudio ineffective
Must turn an R script into a tool
source()
function, to execute all code in a file in an R interpreter$ cat simple_script.R print('hello moon') print(1+2) print(str(c(1,2,3,4))) $ R > source("simple_script.R") [1] "hello moon" [1] 3 num [1:4] 1 2 3 4
Rscript
Rscript
command that can be run on the command line$ Rscript Usage: /path/to/Rscript [--options] [-e expr [-e expr2 ...] | file] [args] --options accepted are --help Print usage and exit --version Print version and exit --verbose Print information on progress --default-packages=list Where 'list' is a comma-separated set of package names, or 'NULL' or options to R, in addition to --no-echo --no-restore, such as --save Do save workspace at the end of the session --no-environ Don't read the site and user environment files --no-site-file Don't read the site-wide Rprofile --no-init-file Don't read the user R profile --restore Do restore previously saved objects at startup --vanilla Combine --no-save, --no-restore, --no-site-file --no-init-file and --no-environ
Rscript
Example$ cat simple_script.R print('hello moon') print(1+2) print(str(c(1,2,3,4))) $ Rscript simple_script.R [1] "hello moon" [1] 3 num [1:4] 1 2 3 4 $
Rscript
SpecificsRscript
is a convenience function that runs the following R
command:$ Rscript simple_script.R # is equivalent to: $ R --no-echo --no-restore --file=simple_script.R
commandArgs()
Rscript
command$ Rscript simple_script.R abc [1] "hello moon" [1] 3 num [1:4] 1 2 3 4
commandArgs()
commandArgs()
function to access command line arguments in the shellargs <- commandArgs(trailingOnly=TRUE)
commandArgs()
args
variable:$ cat echo_args.R print(commandArgs(trailingOnly=TRUE)) $ Rscript echo_args.R abc [1] "abc" $ Rscript echo_args.R abc 123 [1] "abc" "123" $ Rscript echo_args.R # no args character(0)
trailingOnly=FALSE
commandArgs()
includes the Rscript
command itself and any additional arguments:$ Rscript -e "commandArgs()" abc 123 [1] "/usr/bin/R" [2] "--no-echo" [3] "--no-restore" [4] "-e" [5] "commandArgs()" [6] "--args" [7] "abc" [8] "123"
trailingOnly=TRUE
The trailingOnly=TRUE
argument returns only the arguments provided at the end of the command, after the Rscript
portion:
$ Rscript -e "commandArgs(trailingOnly=TRUE)" abc 123 [1] "abc" "123"
args <- commandArgs(trailingOnly=TRUE) if(length(args) != 1) { # cat() writes characters to the screen cat("Usage: simple_script.R <csv file>\n") cat("Provide exactly one argument that is a CSV filename\n") quit(save="no", status=1) } fn <- args[1] library(tidyverse) read_csv(fn)
$ cat data.csv gene,sampleA,sampleB,sampleC g1,1,35,20 g2,32,56,99 g3,392,583,444 g4,39853,16288,66928
$ Rscript inspect_csv.R data.csv ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ── ✔ ggplot2 3.3.5 ✔ purrr 0.3.4 ✔ tibble 3.1.6 ✔ dplyr 1.0.7 ✔ tidyr 1.2.0 ✔ stringr 1.4.0 ✔ readr 2.1.2 ✔ forcats 0.5.1 ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── ✖ dplyr::filter() masks stats::filter() ✖ dplyr::lag() masks stats::lag() Rows: 4 Columns: 4 ── Column specification ──────────────────────────────────────────────────────── Delimiter: "," chr (1): gene dbl (3): sampleA, sampleB, sampleC ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message . # A tibble: 4 × 4 gene sampleA sampleB sampleC <chr> <dbl> <dbl> <dbl> 1 g1 1 35 20 2 g2 32 56 99 3 g3 392 583 444 4 g4 39853 16288 66928
commandArgs()
always produces a vector of stringshead.R
args <- commandArgs(trailingOnly=TRUE) if(length(args) != 2) { cat("Usage: head.R <filename> <N>\n") cat("Provide exactly two arguments: a CSV filename and an integer N\n") quit(save="no", status=1) } # read in arguments fn <- args[1] n <- as.integer(args[2])
head.R
# the csv output will include a header row, so reduce n by 1 n <- n-1 # suppressMessages() prevents messages like library loading text from being printed to the screen suppressMessages( { library(tidyverse, quietly=TRUE) read_csv(fn) %>% slice_head(n=n) %>% write_csv(stdout()) } )
head.R
$ Rscript head.R data.csv 3 gene,sampleA,sampleB,sampleC g1,1,35,20 g2,32,56,99
argparser
argparser
package helps build CLI argument interfaceslibrary(argparser, quietly=TRUE) parser <- arg_parser("R implementation of GNU coreutils head command") parser <- add_argument(parser, "filename", help="file to print lines from") parser <- add_argument(parser, "-n", help="number of lines to print", type="numeric", default=10) parser <- parse_args(parser, c("-n", 3, "data.csv")) print(paste("printing from file:", parser$filename)) print(paste("printing top n:", parser$n))
head.R + argparse
library(argparser, quietly=TRUE) parser <- arg_parser("R implementation of GNU coreutils head command") parser <- add_argument( parser, arg="filename", help="file to print lines from" ) parser <- add_argument( parser, arg="--top", help="number of lines to print", type="numeric", default=10, short='-n' )
head.R + argparse
args <- parse_args(parser) fn <- args$filename # the csv output will include a header row, so reduce n by 1 n <- args$top-1 suppressMessages( { library(tidyverse, quietly=TRUE) read_csv(fn) %>% slice_head(n=n) %>% write_csv(stdout()) } )
head.R + argparse
$ Rscript head.R -n 3 data.csv gene,sampleA,sampleB,sampleC g1,1,35,20 g2,32,56,99
head.R + argparse
$ Rscript head.R -h usage: head.R [--] [--help] [--opts OPTS] [--top TOP] filename R implementation of GNU coreutils head command positional arguments: filename file to print lines from flags: -h, --help show this help message and exit optional arguments: -x, --opts RDS file containing argument values -n, --top number of lines to print [default: 10]
bwa
and STAR
exploit this inherently parallel structure of read alignment problemqsub
and each job uses 16 cores = \(100 * 16 =\) 1600x speedup!apply
Is Pleasingly Parallelapply
, lapply
, vapply
etc computations are pleasantly parallelapply
iterates over the input collection and executes the function on each of themapply
function can be parallelized in Rparallel
packageparallel
package]mclapply()
, or m
ultic
ore apply
is multicore implementation of apply
mclapply
accepts all of the same arguments as lapply
and a few additional starting with mc.
mc.cores
, which specifies the number of cores to use when executing tasks in parallelmclapply
is not available on Windows, only Mac and linuxmclapply
Examplelibrary(parallel) args <- commandArgs(trailingOnly=TRUE) cores <- as.integer(args[1]) ret <- mclapply( 1:10, function(x) { print(paste(Sys.time(),'process',x)) Sys.sleep(3) }, mc.cores=cores )
mclapply
Example - 1 core$ Rscript mclapply.R 1 [1] "2022-03-23 20:33:09 process 1" [1] "2022-03-23 20:33:12 process 2" [1] "2022-03-23 20:33:15 process 3" [1] "2022-03-23 20:33:18 process 4" [1] "2022-03-23 20:33:21 process 5" [1] "2022-03-23 20:33:24 process 6" [1] "2022-03-23 20:33:27 process 7" [1] "2022-03-23 20:33:30 process 8" [1] "2022-03-23 20:33:33 process 9" [1] "2022-03-23 20:33:36 process 10"
mclapply
Example - 3 cores$ Rscript mclapply.R 3 [1] "2022-03-23 20:29:56 process 1" [1] "2022-03-23 20:29:56 process 2" [1] "2022-03-23 20:29:56 process 3" [1] "2022-03-23 20:29:59 process 4" [1] "2022-03-23 20:29:59 process 5" [1] "2022-03-23 20:29:59 process 6" [1] "2022-03-23 20:30:02 process 7" [1] "2022-03-23 20:30:02 process 8" [1] "2022-03-23 20:30:02 process 9" [1] "2022-03-23 20:30:05 process 10"
mclapply
Example - 10 cores- Rscript mclapply.R 10 [1] "2022-03-23 20:34:59 process 1" [1] "2022-03-23 20:34:59 process 2" [1] "2022-03-23 20:34:59 process 3" [1] "2022-03-23 20:34:59 process 4" [1] "2022-03-23 20:34:59 process 5" [1] "2022-03-23 20:34:59 process 6" [1] "2022-03-23 20:34:59 process 7" [1] "2022-03-23 20:34:59 process 8" [1] "2022-03-23 20:34:59 process 9" [1] "2022-03-23 20:34:59 process 10"
library(ggplot2) ggplot2::diamonds
## # A tibble: 53,940 × 10 ## carat cut color clarity depth table price x y z ## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> ## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 ## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 ## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 ## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 ## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 ## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 ## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 ## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 ## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 ## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 ## # ℹ 53,930 more rows
summary()
function:diamonds <- ggplot2::diamonds summary(diamonds$carat) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 0.20 0.40 0.70 0.80 1.04 5.01 summary(diamonds$cut) #> Fair Good Very Good Premium Ideal #> 1610 4906 12082 13791 21551
class Read(object): def __init__(self, header, seq, qual): self.header = header self.seq = seq self.qual = qual def to_fastq(self): return '@{}\n{}\n+\n{}'.format( self.header, self.seq, self.qual ) read1 = Read("read1", "ACTG", "FFFF") read1.seq # access sequence read1.fastq() # output fastq format
new_Read <- function(header, seq, qual=NULL) { structure( list( header=header, seq=seq, qual=qual ), class="Read" ) } read1 <- new_Read("read1", "ACGT", "FFFF") fastq.Read <- function(x) { UseMethod("fastq") paste0("@",self$header,"\n",self$seq,"+\n",self$qual,"\n") } fastq(read1)
Read <- R6Class("Read", list( header = NULL, seq = NULL, qual = NULL, initialize = function(header, seq, qual=NULL) { self$header <- header self$seq <- seq self$qual <- qual }, fastq = function() { invisible(paste0("@",self$header,"\n",self$seq,"+\n",self$qual,"\n")) } } } )) read1 <- Read$new("read1","ACTG","FFFF") read1$fastq()
Packages are the fundamental unit of reproducible code in R. Most of the time, we as practitioners simply use packages that have been provided by others. However, sometimes, when we have developed a methodology that others might find useful, it may be advantageous to write our own R packages and share them with the community. Writing an R package involves creating a directory with a specific structure and adding code, documentation, and package metadata in a way that allows it to be distributed.
Hadley Wickham and Jenny Bryan have written an excellent book on how to write R packages. If ever you need to write your own R package, this should be the only place you need to go.