Any DNA molecules may be subjected to sequencing via this method
Many different types of experiments are possible:
Each of these experiments create the same type of data
Each must be interpreted appropriately based on experiment
@SEQ_ID GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT + !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
@
- header, unique read name per flowcellGAT...
- the nucleotide sequence+
- second header, usually blank!''...
- the phred quality scores for each base in the readThe read counts that align to each locus of a genome form a distribution
Count data have two important properties:
count data are not normally distributed
Techniques that assume data are normally distributed (like linear regression) are not appropriate for count data
Must account for this in one of two ways:
csv
or tsv
(tab separated) formatcounts <- read_tsv("verse_counts.tsv")
counts
## # A tibble: 55,416 × 9 ## gene P0_1 P0_2 P4_1 P4_2 P7_1 P7_2 Ad_1 Ad_2 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000102693.2 0 0 0 0 0 0 0 0 ## 2 ENSMUSG00000064842.3 0 0 0 0 0 0 0 0 ## 3 ENSMUSG00000051951.6 19 24 17 17 17 12 7 5 ## 4 ENSMUSG00000102851.2 0 0 0 0 1 0 0 0 ## 5 ENSMUSG00000103377.2 1 0 0 1 0 1 1 0 ## 6 ENSMUSG00000104017.2 0 3 0 0 0 1 0 0 ## 7 ENSMUSG00000103025.2 0 0 0 0 0 0 0 0 ## 8 ENSMUSG00000089699.2 0 0 0 0 0 0 0 0 ## 9 ENSMUSG00000103201.2 0 0 0 0 0 0 0 1 ## 10 ENSMUSG00000103147.2 0 0 0 0 0 0 0 0 ## # ℹ 55,406 more rows
There are typically three steps when analyzing a RNASeq count matrix:
nonzero_genes <- rowSums(counts[-1])!=0 filtered_counts <- counts[nonzero_genes,] slice_head(filtered_counts, n=5)
## # A tibble: 5 × 9 ## gene P0_1 P0_2 P4_1 P4_2 P7_1 P7_2 Ad_1 Ad_2 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000051951.6 19 24 17 17 17 12 7 5 ## 2 ENSMUSG00000102851.2 0 0 0 0 1 0 0 0 ## 3 ENSMUSG00000103377.2 1 0 0 1 0 1 1 0 ## 4 ENSMUSG00000104017.2 0 3 0 0 0 1 0 0 ## 5 ENSMUSG00000103201.2 0 0 0 0 0 0 0 1
nonzero_genes <- rowSums(counts[-1])>=2 filtered_counts <- counts[nonzero_genes,] slice_head(filtered_counts, n=5)
## # A tibble: 5 × 9 ## gene P0_1 P0_2 P4_1 P4_2 P7_1 P7_2 Ad_1 Ad_2 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000051951.6 19 24 17 17 17 12 7 5 ## 2 ENSMUSG00000103377.2 1 0 0 1 0 1 1 0 ## 3 ENSMUSG00000104017.2 0 3 0 0 0 1 0 0 ## 4 ENSMUSG00000102331.2 5 8 2 6 6 11 4 3 ## 5 ENSMUSG00000025900.14 4 3 6 7 7 3 36 35
dplyr::select(filtered_counts, Ad_1) %>% filter(Ad_1 > 0) %>% # avoid taking log10(0) mutate(`log10(counts)`=log10(Ad_1)) %>% ggplot(aes(x=`log10(counts+1)`)) + geom_histogram(bins=50) + labs(title='log10(counts) distribution for Adult mouse')
\[ cpm_{s,i} = \frac{c_{s,i}}{l_s} * 10^6 \]
\[ cpm_{s,i} = \frac{c_{s,i}}{l_s} * 10^6 \]
library(DESeq2) # DESeq2 requires a counts matrix # column data (sample information), and a formula # the counts matrix *must be raw counts* count_mat <- as.matrix(filtered_counts[-1]) row.names(count_mat) <- filtered_counts$gene dds <- DESeqDataSetFromMatrix( countData=count_mat, colData=tibble(sample_name=colnames(filtered_counts[-1])), design=~1 # no formula needed, ~1 produces a trivial design matrix )
# compute normalization factors dds <- estimateSizeFactors(dds) # extract the normalized counts deseq_norm_counts <- as_tibble(counts(dds,normalized=TRUE)) %>% mutate(gene=filtered_counts$gene) %>% relocate(gene) # relocate changes column order
The DESeq2 normalization procedure has two important considerations:
The CPM normalization procedure does not borrow information across all samples, and therefore is not subject to these considerations.
log()
rlog()
function that performs a regularized logarithmic transformation that avoids these biases# the dds is the DESeq2 object from earlier rld <- rlog(dds)
model.matrix()
functionad_metadata
## # A tibble: 6 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## 3 A3 82 AD 40287 45365 17655 127131 2 ## 4 C1 80 Control 62684 93739 131595 124075 3 ## 5 C2 77 Control 63598 69838 7189 35597 3 ## 6 C3 72 Control 52951 19444 54673 17221 2
~ condition
argument is an an R formulamodel.matrix(~ condition, data=ad_metadata)
model.matrix(~ condition, data=ad_metadata)
## (Intercept) conditionAD ## 1 1 1 ## 2 1 1 ## 3 1 1 ## 4 1 0 ## 5 1 0 ## 6 1 0 ## attr(,"assign") ## [1] 0 1 ## attr(,"contrasts") ## attr(,"contrasts")$condition ## [1] "contr.treatment"
The general format of a formula is as follows:
# portions in [] are optional [<outcome variable>] ~ <predictor 1> [+ <predictor 2>]... # examples # model Gene 3 expression as a function of disease status `Gene 3` ~ condition # model the amount of tau pathology as a function of abeta pathology, # adjusting for age at death tau ~ age_at_death + abeta # can create a model without an outcome variable ~ age_at_death + condition
age_at_death
as a covariate in the model:model.matrix(~ age_at_death + condition, data=ad_metadata)
model.matrix(~ age_at_death + condition, data=ad_metadata)
## (Intercept) age_at_death conditionAD ## 1 1 73 1 ## 2 1 82 1 ## 3 1 82 1 ## 4 1 80 0 ## 5 1 77 0 ## 6 1 72 0 ## attr(,"assign") ## [1] 0 1 2 ## attr(,"contrasts") ## attr(,"contrasts")$condition ## [1] "contr.treatment"
li
near m
odels of mi
croarraysExpressionSet
or SummarizedExperiment
containerad_se <- SummarizedExperiment( assays=list(intensities=intensities), colData=ad_metadata, rowData=rownames(intensities) ) # design matrix for AD vs control, adjusting for age at death ad_vs_control_model <- model.matrix( ~ age_at_death + condition, data=ad_metadata )
# now run limma # first fit all genes with the model fit <- limma::lmFit( assays(se)$intensities, ad_vs_control_model ) # now better control fit errors with the empirical Bayes method fit <- limma::eBayes(fit)
limma::topTable()
function:# the coefficient name conditionAD is the column name in the design matrix topTable(fit, coef="conditionAD", adjust="BH", number=5)
## logFC AveExpr t P.Value adj.P.Val B ## 206354_at -2.7507429 5.669691 -5.349105 3.436185e-05 0.9992944 -2.387433 ## 234942_s_at 0.9906708 8.397425 5.206286 4.726372e-05 0.9992944 -2.447456 ## 243618_s_at -0.6879431 7.148455 -4.864695 1.020980e-04 0.9992944 -2.598600 ## 223703_at 0.8693657 6.154392 4.745033 1.340200e-04 0.9992944 -2.654092 ## 244391_at -0.5251423 5.532712 -4.691482 1.514215e-04 0.9992944 -2.679354
\[ f(k;\lambda) = \mathtt{Pr}(X=k) = \frac{\lambda^k e^{-\lambda}}{k!} \]
\[ \lambda = \mathtt{E}(X) = \mathtt{Var}(X) \]
\[ f(k; r,p) = \mathtt{Pr}(X=k) = \binom{k+r-1}{k} (1-p)^k p^r \]
\[ p = \frac{\mu}{\sigma^2}, r = \frac{\mu^2}{\sigma^2 - \mu} \]
\[ D = \frac{\sigma^2}{\mu} \]
As mentioned briefly above, DESeq2 requires three pieces of information to perform differential expression:
library(DESeq2) # the counts matrix *must be raw counts* count_mat <- as.matrix(filtered_counts[-1]) row.names(count_mat) <- filtered_counts$gene # create a sample matrix from sample names sample_info <- tibblesample_name=colnames(filtered_counts[-1])) %>% separate(sample_name, c("timepoint","replicate"),sep="_",remove=FALSE ) %>% mutate( timepoint=factor(timepoint,levels=c("Ad","P0","P4","P7")) ) design <- formula(~ timepoint)
sample_info
## # A tibble: 8 × 3 ## sample_name timepoint replicate ## <chr> <fct> <chr> ## 1 P0_1 P0 1 ## 2 P0_2 P0 2 ## 3 P4_1 P4 1 ## 4 P4_2 P4 2 ## 5 P7_1 P7 1 ## 6 P7_2 P7 2 ## 7 Ad_1 Ad 1 ## 8 Ad_2 Ad 2
dds <- DESeqDataSetFromMatrix( countData=count_mat, colData=sample_info, design=design ) dds <- DESeq(dds) resultsNames(dds)
## [1] "Intercept" "timepoint_P0_vs_Ad" "timepoint_P4_vs_Ad" ## [4] "timepoint_P7_vs_Ad"
res <- results(dds, name="timepoint_P0_vs_Ad") p0_vs_Ad_de <- as_tibble(res) %>% mutate(gene=rownames(res)) %>% relocate(gene) %>% arrange(pvalue) head(p0_vs_Ad_de, 5)
## # A tibble: 5 × 7 ## gene baseMean log2FoldChange lfcSE stat pvalue padj ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000000031.17 19624. 6.75 0.187 36.2 2.79e-286 5.43e-282 ## 2 ENSMUSG00000026418.17 9671. 9.84 0.283 34.7 2.91e-264 2.83e-260 ## 3 ENSMUSG00000051855.16 3462. 5.35 0.155 34.5 4.27e-261 2.76e-257 ## 4 ENSMUSG00000027750.17 9482. 4.05 0.123 33.1 1.23e-239 5.98e-236 ## 5 ENSMUSG00000042828.13 3906. -5.32 0.165 -32.2 3.98e-227 1.55e-223
# number of genes significant at FDR < 0.05 filter(p0_vs_Ad_de,padj<0.05)
## # A tibble: 7,467 × 7 ## gene baseMean log2FoldChange lfcSE stat pvalue padj ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000000031.17 19624. 6.75 0.187 36.2 2.79e-286 5.43e-282 ## 2 ENSMUSG00000026418.17 9671. 9.84 0.283 34.7 2.91e-264 2.83e-260 ## 3 ENSMUSG00000051855.16 3462. 5.35 0.155 34.5 4.27e-261 2.76e-257 ## 4 ENSMUSG00000027750.17 9482. 4.05 0.123 33.1 1.23e-239 5.98e-236 ## 5 ENSMUSG00000042828.13 3906. -5.32 0.165 -32.2 3.98e-227 1.55e-223 ## 6 ENSMUSG00000046598.16 1154. -6.08 0.197 -30.9 5.77e-210 1.87e-206 ## 7 ENSMUSG00000019817.19 1952. 5.15 0.170 30.3 1.14e-201 3.18e-198 ## 8 ENSMUSG00000021622.4 17146. -4.57 0.155 -29.5 5.56e-191 1.35e-187 ## 9 ENSMUSG00000002500.16 2121. -6.20 0.216 -28.7 2.36e-181 5.10e-178 ## 10 ENSMUSG00000024598.10 1885. 5.90 0.208 28.4 1.46e-177 2.83e-174 ## # ℹ 7,457 more rows
mutate( p0_vs_Ad_de, `-log10(adjusted p)`=-log10(padj), `FDR<0.05`=padj<0.05 ) %>% ggplot(aes(x=log2FoldChange,y=`-log10(adjusted p)`,color=`FDR<0.05`)) + geom_point()
## Warning: Removed 9551 rows containing missing values or values outside the scale range ## (`geom_point()`).
design <- data.frame(swirl = c("swirl.1", "swirl.2", "swirl.3", "swirl.4"), condition = c(1, -1, 1, -1)) dge <- DGEList(counts=counts) keep <- filterByExpr(dge, design) dge <- dge[keep,,keep.lib.sizes=FALSE]
topTable()
as in microarray limma results:# limma trend logCPM <- cpm(dge, log=TRUE, prior.count=3) fit <- lmFit(logCPM, design) fit <- eBayes(fit, trend=TRUE) topTable(fit, coef=ncol(design))