add <- function(x,y) {
return(x+y)
}
Can test this manually:
result <- add(1,2) result == 3 [1] TRUE
add <- function(x,y) {
return(x+y+1)
}
Test FAILS:
result <- add(1,2) result == 3 [1] FALSE
testthat Packageinstall.packages("testthat")testthat Unit Testlibrary(testthat)
test_that("add() correctly adds things", {
expect_equal(add(1,2), 3)
expect_equal(add(5,6), 11)
}
)
Test passed
test_that Anatomytest_that("add() correctly adds things", {
expect_equal(add(1,2), 3)
expect_equal(add(5,6), 11)
}
)
add() correctly adds things){} written using expect_X functions from the testthat packagetest_that Failurestest_that("add() correctly adds things", {
expect_equal(add(1,2), 3)
expect_equal(add(5,6), 10) # incorrect expected value
}
)
-- Failure (Line 3): add() correctly adds things -------------------------------
add(5, 6) not equal to 10.
1/1 mismatches
[1] 11 - 10 == 1
Error: Test failed
test_functions.Rtest_file()add <- function(x,y) {
return(x+y)
}
testthat::test_file("test_functions.R")
== Testing test_functions.R =======================================================
[ FAIL 0 | WARN 0 | SKIP 0 | PASS 2 ] Done!
testthat tests calling functions that are not defined will fail like tests that find incorrect outputtest_that("mul() correctly multiplies things",{
expect_equal(mul(1,2), 2)
})
-- Error (Line 1): new function ------------------------------------------------
Error in `mul(1, 2)`: could not find function "mul"
Backtrace:
1. testthat::expect_equal(mul(1, 2), 2)
2. testthat::quasi_label(enquo(object), label, arg = "object")
3. rlang::eval_bare(expr, quo_get_env(quo))
Error: Test failed
Some use cases make running scripts in RStudio ineffective
Must turn an R script into a tool
source() function, to execute all code in a file in an R interpreter$ cat simple_script.R
print('hello moon')
print(1+2)
print(str(c(1,2,3,4)))
$ R
> source("simple_script.R")
[1] "hello moon"
[1] 3
num [1:4] 1 2 3 4
RscriptRscript command that can be run on the command line$ Rscript
Usage: /path/to/Rscript [--options] [-e expr [-e expr2 ...] | file] [args]
--options accepted are
--help Print usage and exit
--version Print version and exit
--verbose Print information on progress
--default-packages=list
Where 'list' is a comma-separated set
of package names, or 'NULL'
or options to R, in addition to --no-echo --no-restore, such as
--save Do save workspace at the end of the session
--no-environ Don't read the site and user environment files
--no-site-file Don't read the site-wide Rprofile
--no-init-file Don't read the user R profile
--restore Do restore previously saved objects at startup
--vanilla Combine --no-save, --no-restore, --no-site-file
--no-init-file and --no-environ
Rscript Example$ cat simple_script.R
print('hello moon')
print(1+2)
print(str(c(1,2,3,4)))
$ Rscript simple_script.R
[1] "hello moon"
[1] 3
num [1:4] 1 2 3 4
$
Rscript SpecificsRscript is a convenience function that runs the following R command:$ Rscript simple_script.R # is equivalent to: $ R --no-echo --no-restore --file=simple_script.R
commandArgs()Rscript command$ Rscript simple_script.R abc [1] "hello moon" [1] 3 num [1:4] 1 2 3 4
commandArgs()commandArgs() function to access command line arguments in the shellargs <- commandArgs(trailingOnly=TRUE)
commandArgs()args variable:$ cat echo_args.R print(commandArgs(trailingOnly=TRUE)) $ Rscript echo_args.R abc [1] "abc" $ Rscript echo_args.R abc 123 [1] "abc" "123" $ Rscript echo_args.R # no args character(0)
trailingOnly=FALSEcommandArgs() includes the Rscript command itself and any additional arguments:$ Rscript -e "commandArgs()" abc 123 [1] "/usr/bin/R" [2] "--no-echo" [3] "--no-restore" [4] "-e" [5] "commandArgs()" [6] "--args" [7] "abc" [8] "123"
trailingOnly=TRUEThe trailingOnly=TRUE argument returns only the arguments provided at the end of the command, after the Rscript portion:
$ Rscript -e "commandArgs(trailingOnly=TRUE)" abc 123 [1] "abc" "123"
args <- commandArgs(trailingOnly=TRUE)
if(length(args) != 1) {
# cat() writes characters to the screen
cat("Usage: simple_script.R <csv file>\n")
cat("Provide exactly one argument that is a CSV filename\n")
quit(save="no", status=1)
}
fn <- args[1]
library(tidyverse)
read_csv(fn)
$ cat data.csv gene,sampleA,sampleB,sampleC g1,1,35,20 g2,32,56,99 g3,392,583,444 g4,39853,16288,66928
$ Rscript inspect_csv.R data.csv ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ── ✔ ggplot2 3.3.5 ✔ purrr 0.3.4 ✔ tibble 3.1.6 ✔ dplyr 1.0.7 ✔ tidyr 1.2.0 ✔ stringr 1.4.0 ✔ readr 2.1.2 ✔ forcats 0.5.1 ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── ✖ dplyr::filter() masks stats::filter() ✖ dplyr::lag() masks stats::lag() Rows: 4 Columns: 4 ── Column specification ──────────────────────────────────────────────────────── Delimiter: "," chr (1): gene dbl (3): sampleA, sampleB, sampleC ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message . # A tibble: 4 × 4 gene sampleA sampleB sampleC <chr> <dbl> <dbl> <dbl> 1 g1 1 35 20 2 g2 32 56 99 3 g3 392 583 444 4 g4 39853 16288 66928
commandArgs() always produces a vector of stringshead.Rargs <- commandArgs(trailingOnly=TRUE)
if(length(args) != 2) {
cat("Usage: head.R <filename> <N>\n")
cat("Provide exactly two arguments: a CSV filename and an integer N\n")
quit(save="no", status=1)
}
# read in arguments
fn <- args[1]
n <- as.integer(args[2])
head.R# the csv output will include a header row, so reduce n by 1
n <- n-1
# suppressMessages() prevents messages like library loading text from being printed to the screen
suppressMessages(
{
library(tidyverse, quietly=TRUE)
read_csv(fn) %>%
slice_head(n=n) %>%
write_csv(stdout())
}
)
head.R$ Rscript head.R data.csv 3 gene,sampleA,sampleB,sampleC g1,1,35,20 g2,32,56,99
argparserargparser package helps build CLI argument interfaceslibrary(argparser, quietly=TRUE)
parser <- arg_parser("R implementation of GNU coreutils head command")
parser <- add_argument(parser, "filename", help="file to print lines from")
parser <- add_argument(parser, "-n", help="number of lines to print", type="numeric", default=10)
parser <- parse_args(parser, c("-n", 3, "data.csv"))
print(paste("printing from file:", parser$filename))
print(paste("printing top n:", parser$n))
head.R + argparselibrary(argparser, quietly=TRUE)
parser <- arg_parser("R implementation of GNU coreutils head command")
parser <- add_argument(
parser,
arg="filename",
help="file to print lines from"
)
parser <- add_argument(
parser,
arg="--top",
help="number of lines to print",
type="numeric",
default=10,
short='-n'
)
head.R + argparseargs <- parse_args(parser)
fn <- args$filename
# the csv output will include a header row, so reduce n by 1
n <- args$top-1
suppressMessages(
{
library(tidyverse, quietly=TRUE)
read_csv(fn) %>%
slice_head(n=n) %>%
write_csv(stdout())
}
)
head.R + argparse$ Rscript head.R -n 3 data.csv gene,sampleA,sampleB,sampleC g1,1,35,20 g2,32,56,99
head.R + argparse$ Rscript head.R -h usage: head.R [--] [--help] [--opts OPTS] [--top TOP] filename R implementation of GNU coreutils head command positional arguments: filename file to print lines from flags: -h, --help show this help message and exit optional arguments: -x, --opts RDS file containing argument values -n, --top number of lines to print [default: 10]
bwa and STAR exploit this inherently parallel structure of read alignment problemqsub and each job uses 16 cores = \(100 * 16 =\) 1600x speedup!apply Is Pleasingly Parallelapply, lapply, vapply etc computations are pleasantly parallelapply iterates over the input collection and executes the function on each of themapply function can be parallelized in Rparallel packageparallel package]mclapply(), or multicore apply is multicore implementation of applymclapply accepts all of the same arguments as lapply and a few additional starting with mc.mc.cores, which specifies the number of cores to use when executing tasks in parallelmclapply is not available on Windows, only Mac and linuxmclapply Examplelibrary(parallel)
args <- commandArgs(trailingOnly=TRUE)
cores <- as.integer(args[1])
ret <- mclapply(
1:10,
function(x) {
print(paste(Sys.time(),'process',x))
Sys.sleep(3)
},
mc.cores=cores
)
mclapply Example - 1 core$ Rscript mclapply.R 1 [1] "2022-03-23 20:33:09 process 1" [1] "2022-03-23 20:33:12 process 2" [1] "2022-03-23 20:33:15 process 3" [1] "2022-03-23 20:33:18 process 4" [1] "2022-03-23 20:33:21 process 5" [1] "2022-03-23 20:33:24 process 6" [1] "2022-03-23 20:33:27 process 7" [1] "2022-03-23 20:33:30 process 8" [1] "2022-03-23 20:33:33 process 9" [1] "2022-03-23 20:33:36 process 10"
mclapply Example - 3 cores$ Rscript mclapply.R 3 [1] "2022-03-23 20:29:56 process 1" [1] "2022-03-23 20:29:56 process 2" [1] "2022-03-23 20:29:56 process 3" [1] "2022-03-23 20:29:59 process 4" [1] "2022-03-23 20:29:59 process 5" [1] "2022-03-23 20:29:59 process 6" [1] "2022-03-23 20:30:02 process 7" [1] "2022-03-23 20:30:02 process 8" [1] "2022-03-23 20:30:02 process 9" [1] "2022-03-23 20:30:05 process 10"
mclapply Example - 10 cores- Rscript mclapply.R 10 [1] "2022-03-23 20:34:59 process 1" [1] "2022-03-23 20:34:59 process 2" [1] "2022-03-23 20:34:59 process 3" [1] "2022-03-23 20:34:59 process 4" [1] "2022-03-23 20:34:59 process 5" [1] "2022-03-23 20:34:59 process 6" [1] "2022-03-23 20:34:59 process 7" [1] "2022-03-23 20:34:59 process 8" [1] "2022-03-23 20:34:59 process 9" [1] "2022-03-23 20:34:59 process 10"
library(ggplot2) ggplot2::diamonds
## # A tibble: 53,940 × 10 ## carat cut color clarity depth table price x y z ## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl> ## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43 ## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31 ## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31 ## 4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63 ## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75 ## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48 ## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47 ## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53 ## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49 ## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39 ## # ℹ 53,930 more rows
summary() function:diamonds <- ggplot2::diamonds summary(diamonds$carat) #> Min. 1st Qu. Median Mean 3rd Qu. Max. #> 0.20 0.40 0.70 0.80 1.04 5.01 summary(diamonds$cut) #> Fair Good Very Good Premium Ideal #> 1610 4906 12082 13791 21551
class Read(object):
def __init__(self, header, seq, qual):
self.header = header
self.seq = seq
self.qual = qual
def to_fastq(self):
return '@{}\n{}\n+\n{}'.format(
self.header,
self.seq,
self.qual
)
read1 = Read("read1", "ACTG", "FFFF")
read1.seq # access sequence
read1.fastq() # output fastq format
new_Read <- function(header, seq, qual=NULL) {
structure(
list(
header=header,
seq=seq,
qual=qual
),
class="Read"
)
}
read1 <- new_Read("read1", "ACGT", "FFFF")
fastq.Read <- function(x) {
UseMethod("fastq")
paste0("@",self$header,"\n",self$seq,"+\n",self$qual,"\n")
}
fastq(read1)
Read <- R6Class("Read", list(
header = NULL,
seq = NULL,
qual = NULL,
initialize = function(header, seq, qual=NULL) {
self$header <- header
self$seq <- seq
self$qual <- qual
},
fastq = function() {
invisible(paste0("@",self$header,"\n",self$seq,"+\n",self$qual,"\n"))
}
}
}
))
read1 <- Read$new("read1","ACTG","FFFF")
read1$fastq()
Packages are the fundamental unit of reproducible code in R. Most of the time, we as practitioners simply use packages that have been provided by others. However, sometimes, when we have developed a methodology that others might find useful, it may be advantageous to write our own R packages and share them with the community. Writing an R package involves creating a directory with a specific structure and adding code, documentation, and package metadata in a way that allows it to be distributed.
Hadley Wickham and Jenny Bryan have written an excellent book on how to write R packages. If ever you need to write your own R package, this should be the only place you need to go.