Data modeling goal:
Models:
The scientific model and statistical model are related but independent choices we make
Consider three imaginary genes measured in Parkinson’s Disease (PD) patients blood:
set.seed(1337) gene_exp <- tibble( sample_name = c(str_c("P",1:100), str_c("C",1:100)), `Disease Status` = factor(c(rep("PD",100),rep("Control",100))), `Gene 1` = c(rnorm(100,255,10), rnorm(100,250,10)), `Gene 2` = c(rnorm(100,520,100), rnorm(100,600,60)), `Gene 3` = c(rnorm(100,500,40), rnorm(100,330,40)) )
gene_exp
## # A tibble: 200 × 5 ## sample_name `Disease Status` `Gene 1` `Gene 2` `Gene 3` ## <chr> <fct> <dbl> <dbl> <dbl> ## 1 P1 PD 257. 636. 504. ## 2 P2 PD 241. 556. 484. ## 3 P3 PD 252. 426. 476. ## 4 P4 PD 271. 405. 458. ## 5 P5 PD 248. 482. 520. ## 6 P6 PD 275. 521. 460. ## 7 P7 PD 264. 415. 568. ## 8 P8 PD 276. 450. 495. ## 9 P9 PD 274. 490. 496. ## 10 P10 PD 251. 584. 573. ## # ℹ 190 more rows
pivot_longer( gene_exp, c(`Gene 1`, `Gene 2`, `Gene 3`), names_to="Gene", values_to="Expression" ) %>% ggplot( aes(x=`Disease Status`,y=Expression,fill=`Disease Status`) ) + facet_wrap(vars(Gene)) + geom_violin()
heatmap()
function:heatmap()
function creates a clustered heatmap where the rows and columns have been hierarchically clustered# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
heatmap()
Functionalityheatmap( marker_matrix, Rowv=NA, # don't cluster rows Colv=NA, # don't cluster columns scale="none", # don't scale rows )
heatmap()
Drawbackheatmap()
function has the major drawback that no color key is provided!heatmap.2()
in gplots
package has a similar interfacelibrary(gplots) heatmap.2(marker_matrix)
heatmap.2()
Example\[d(p, q) = \sqrt{(p - q)^2}\]
Below we will cluster the synthetic dataset introduced above in R:
ggplot(well_clustered_data, aes(x=f1,y=f2)) + geom_point() + labs(title="Unclustered")
# compute all pairwise distances using euclidean distance # as the distance metric euc_dist <- dist(dplyr::select(well_clustered_data,-ID)) # produce a clustering of the data using the hclust for # hierarchical clustering hc <- hclust(euc_dist, method="ave") # add ID as labels to the clustering object hc$labels <- well_clustered_data$ID
str(hc)
## List of 7 ## $ merge : int [1:59, 1:2] -50 -33 -48 -46 -44 -21 -11 -4 -36 -3 ... ## $ height : num [1:59] 0.00429 0.08695 0.23536 0.24789 0.25588 ... ## $ order : int [1:60] 8 9 7 6 19 18 11 16 4 14 ... ## $ labels : chr [1:60] "A1" "A2" "A3" "A4" ... ## $ method : chr "average" ## $ call : language hclust(d = euc_dist, method = "ave") ## $ dist.method: chr "euclidean" ## - attr(*, "class")= chr "hclust"
hclust()
outputhclust()
return object describes the clustering as a treelibrary(ggdendro) ggdendrogram(hc)
cutree
to divide the tree into three groups using k=3
:labels <- cutree(hc,k=3) labels
## A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A11 A12 A13 A14 A15 A16 A17 A18 A19 A20 ## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ## B1 B2 B3 B4 B5 B6 B7 B8 B9 B10 B11 B12 B13 B14 B15 B16 B17 B18 B19 B20 ## 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ## C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 ## 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
# we turn our labels into a tibble so we can join them with the original tibble well_clustered_data %>% left_join( tibble( ID=names(labels), label=as.factor(labels) ) ) %>% ggplot(aes(x=f1,y=f2,color=label)) + geom_point() + labs(title="Clustered")
## Joining with `by = join_by(ID)`
stats::prcomp()
function performs PCA in R# intensities contains microarray expression data for ~54k probesets x 20 # samples # transpose expression values so samples are rows expr_mat <- intensities %>% pivot_longer(-c(probeset_id),names_to="sample") %>% pivot_wider(names_from=probeset_id) # PCA expects all features (i.e. probesets) to be mean-centered, # convert to dataframe so we can use rownames expr_mat_centered <- as.data.frame( lapply(dplyr::select(expr_mat,-c(sample)),function(x) x-mean(x)) ) rownames(expr_mat_centered) <- expr_mat$sample
# prcomp performs PCA pca <- prcomp( expr_mat_centered, center=FALSE, # already centered scale=TRUE # prcomp scales each feature to have unit variance )
# the str() function prints the structure of its argument str(pca)
## List of 5 ## $ sdev : num [1:20] 101.5 81 71.2 61.9 57.9 ... ## $ rotation: num [1:54675, 1:20] 0.00219 0.00173 -0.00313 0.00465 0.0034 ... ## ..- attr(*, "dimnames")=List of 2 ## .. ..$ : chr [1:54675] "X1007_s_at" "X1053_at" "X117_at" "X121_at" ... ## .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ... ## $ center : logi FALSE ## $ scale : Named num [1:54675] 0.379 0.46 0.628 0.272 0.196 ... ## ..- attr(*, "names")= chr [1:54675] "X1007_s_at" "X1053_at" "X117_at" "X121_at" ... ## $ x : num [1:20, 1:20] -67.94 186.14 6.07 -70.72 9.58 ... ## ..- attr(*, "dimnames")=List of 2 ## .. ..$ : chr [1:20] "A1" "A2" "A3" "A4" ... ## .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ... ## - attr(*, "class")= chr "prcomp"
The result of prcomp()
is a list with five members:
sdev
- the standard deviation (i.e. the square root of the variance) for each componentrotation
- a matrix where the principal components are in columnsx
- the projections of the original datacenter
- if center=TRUE
was passed, a vector of feature meansscale
- if scale=TRUE
was passed, a vector of the feature variancesprcomp()
result doesn’t provide this fractionsdev
variable returned by prcomp()
may be used to first calculate the variance explained by each component by squaring it, then dividing by the sumlibrary(patchwork) pca_var <- tibble( PC=factor(str_c("PC",1:20),str_c("PC",1:20)), Variance=pca$sdev**2, `% Explained Variance`=Variance/sum(Variance)*100, `Cumulative % Explained Variance`=cumsum(`% Explained Variance`) )
exp_g <- pca_var %>% ggplot(aes(x=PC,y=`% Explained Variance`,group=1)) + geom_point() + geom_line() + theme(axis.text.x=element_text(angle=90,hjust=0,vjust=0.5)) cum_g <- pca_var %>% ggplot(aes(x=PC,y=`Cumulative % Explained Variance`,group=1)) + geom_point() + geom_line() + ylim(0,100) + # set y limits to [0,100] theme(axis.text.x=element_text(angle=90,hjust=0,vjust=0.5)) exp_g | cum_g
as_tibble(pca$x) %>% ggplot(aes(x=PC1,y=PC2)) + geom_point()
ggpairs() function
in the GGally packagelibrary(GGally) as_tibble(pca$x) %>% dplyr::select(PC1:PC6) %>% ggpairs()
library(ggbeeswarm) as_tibble(pca$x) %>% pivot_longer(everything(),names_to="PC",values_to="projection") %>% mutate(PC=fct_relevel(PC,str_c("PC",1:20))) %>% ggplot(aes(x=PC,y=projection)) + geom_beeswarm() + labs(title="PCA Projection Plot")
as_tibble(pca$x) %>% mutate(type=stringr::str_sub(rownames(pca$x),1,1)) %>% ggplot(aes(x=PC1,y=PC2,color=type)) + geom_point()
library(GGally) as_tibble(pca$x) %>% mutate(type=stringr::str_sub(rownames(pca$x),1,1)) %>% dplyr::select(c(type,PC1:PC6)) %>% ggpairs(columns=1:6,mapping=aes(fill=type))
ggplot(gene_exp, aes(x=`Gene 1`)) + geom_histogram(bins=30,fill="#a93c13")
ggplot(gene_exp, aes(x=`Gene 1`)) + geom_histogram(bins=30,fill="#a93c13") + geom_vline(xintercept=mean(gene_exp$`Gene 1`))
library(patchwork) well_behaved_data <- tibble(data = rnorm(1000)) # introduce some outliers data_w_outliers <- tibble(data = c(rnorm(800), rep(5, 200))) g_no_outlier <- ggplot(well_behaved_data, aes(x = data)) + geom_histogram(fill = "#56CBF9", color = "grey", bins = 30) + geom_vline(xintercept = mean(well_behaved_data$data)) + ggtitle("Mean example, no outliers") g_outlier <- ggplot(data_w_outliers, aes(x = data)) + geom_histogram(fill = "#7FBEEB", color = "grey", bins = 30) + geom_vline(xintercept = mean(data_w_outliers$data)) + ggtitle("Mean example, big outliers") g_no_outlier | g_outlier
g_no_outlier <- ggplot(well_behaved_data, aes(x = data)) + geom_histogram(fill = "#AFBED1", color = "grey", bins = 30) + geom_vline(xintercept = median(well_behaved_data$data)) + ggtitle("Median example") g_outlier <- ggplot(data_w_outliers, aes(x = data)) + geom_histogram(fill = "#7FBEEB", color = "grey", bins = 30) + geom_vline(xintercept = median(data_w_outliers$data)) + ggtitle("Median example, big outliers") g_no_outlier | g_outlier
sd()
g1_mean <- mean(gene_exp$`Gene 1`) g1_sd <- sd(gene_exp$`Gene 1`) ggplot(gene_exp, aes(x=`Gene 1`)) + geom_histogram(bins=30,fill="#a93c13") + geom_vline(xintercept=g1_mean) + geom_segment(x=g1_mean-g1_sd, y=10, xend=g1_mean+g1_sd, yend=10)
data <- tibble(data = c(rnorm(1000, sd=1.75))) ggplot(data, aes(x = data)) + geom_histogram(fill = "#EAC5D8", color = "white", bins = 30) + geom_vline(xintercept = seq(-3,3,1) * sd(data$data)) + xlim(c(-6, 6)) + ggtitle("Standard deviations aplenty", paste("SD:", sd(data$data)))
g1_mean <- mean(gene_exp$`Gene 1`) g1_sd <- sd(gene_exp$`Gene 1`) ggplot(gene_exp, aes(x=`Gene 1`)) + geom_histogram( aes(y=after_stat(density)), bins=30, fill="#a93c13" ) + stat_function(fun=dnorm, args = list(mean=g1_mean, sd=g1_sd), linewidth=2 )
\[ Gene\;1 \sim \mathcal{N}(254, 11) \]
g_norm <- ggplot(tibble(data = rnorm(5000)), aes(x = data)) + geom_histogram(fill = "#D0FCB3", bins = 50, color = "gray") + ggtitle("Normal distribution", "rnorm(n = 1000)") g_unif <- ggplot(tibble(data = runif(5000)), aes(x = data)) + geom_histogram(fill = "#271F30", bins = 50, color = "white") + ggtitle("Uniform distribution", "runif(n = 1000)") g_logistic <- ggplot(tibble(data = rlogis(5000)), aes(x = data)) + geom_histogram(fill = "#9BC59D", bins = 50, color = "black") + ggtitle("Logistic distribution", "rlogis(n = 1000)") g_exp <- ggplot(tibble(data = rexp(5000, rate = 1)), aes(x = data)) + geom_histogram(fill = "#6C5A49", bins = 50, color = "white") + ggtitle("Exponential distribution", "rexp(n = 1000, rate = 1)") (g_norm | g_unif) / (g_logistic | g_exp)
We chose to model our gene 1 expression data using a normaly distribution parameterized by the arithmetic mean and standard deviation
\[ Y_i = \beta_0 + \beta_1 \phi_1 ( X_{i1} ) + \beta_2 \phi_2 ( X_{i2} ) + \ldots + \beta_p \phi_p ( X_{ip} ) + \epsilon_i \]
library(ggbeeswarm) ggplot(gene_exp, aes(x=`Disease Status`, y=`Gene 3`, color=`Disease Status`)) + geom_beeswarm()
exp_summ <- pivot_longer( gene_exp, c(`Gene 3`) ) %>% group_by(`Disease Status`) %>% summarize(mean=mean(value),sd=sd(value)) pd_mean <- filter(exp_summ, `Disease Status` == "PD")$mean c_mean <- filter(exp_summ, `Disease Status` == "Control")$mean ggplot(gene_exp, aes(x=`Gene 3`, fill=`Disease Status`)) + geom_histogram(bins=20, alpha=0.6,position="identity") + annotate("segment", x=c_mean, xend=pd_mean, y=20, yend=20, arrow=arrow(ends="both", angle=90)) + annotate("text", x=mean(c(c_mean,pd_mean)), y=21, hjust=0.5, label="How different?")
pd_mean - c_mean
## [1] 164.0942
In other words, this point estimate suggests that on average Parkinson’s patients have 164.1 greater expression than Controls.
ggplot(gene_exp, aes(x=`Disease Status`, y=`Gene 3`, color=`Disease Status`)) + geom_beeswarm() + annotate("segment", x=0, xend=3, y=2*c_mean-pd_mean, yend=2*pd_mean-c_mean)
Y ~ X
, or Gene 3 ~ Disease Status
fit <- lm(`Gene 3` ~ `Disease Status`, data=gene_exp) fit
## ## Call: ## lm(formula = `Gene 3` ~ `Disease Status`, data = gene_exp) ## ## Coefficients: ## (Intercept) `Disease Status`PD ## 334.6 164.1
lm()
Fitsummary(fit)
## ## Call: ## lm(formula = `Gene 3` ~ `Disease Status`, data = gene_exp) ## ## Residuals: ## Min 1Q Median 3Q Max ## -124.303 -25.758 -2.434 30.518 119.348 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 334.578 4.414 75.80 <2e-16 *** ## `Disease Status`PD 164.094 6.243 26.29 <2e-16 *** ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 44.14 on 198 degrees of freedom ## Multiple R-squared: 0.7773, Adjusted R-squared: 0.7761 ## F-statistic: 691 on 1 and 198 DF, p-value: < 2.2e-16
# naive solution fit1 <- lm(`Gene 1` ~ `Disease Status`, data=gene_exp) fit2 <- lm(`Gene 2` ~ `Disease Status`, data=gene_exp) fit3 <- lm(`Gene 3` ~ `Disease Status`, data=gene_exp)
gene_stats <- lapply( c("Gene 1", "Gene 2", "Gene 3"), function(gene) { model <- paste0('`',gene,'`',' ~ `Disease Status`') fit <- lm(model, data=gene_exp) v <- c(gene, coefficients(fit), summary(fit)$coefficients[2,4] ) names(v) <- c("Gene", "Intercept", "PD", "pvalue") return(v) } ) %>% bind_rows() # turn the list into a tibble # compute FDR from nominal p-values gene_stats$padj <- p.adjust(gene_stats$pvalue,method="fdr")
gene_stats ## A tibble: 3 × 5 # Gene Intercept PD pvalue padj # <chr> <chr> <chr> <chr> <dbl> #1 Gene 1 250.410 6.959 3.921e-06 3.92e- 6 #2 Gene 2 597.763 -93.629 2.373e-13 3.56e-13 #3 Gene 3 334.577 164.094 1.727e-66 5.18e-66
\[Y = g^{-1}(\beta_0 + \beta_1 \phi_1 ( X_{i1} ) + \beta_2 \phi_2 ( X_{i2} ) + \ldots + \beta_p \phi_p ( X_{ip} ))\]
Any DNA molecules may be subjected to sequencing via this method
Many different types of experiments are possible:
Each of these experiments create the same type of data
Each must be interpreted appropriately based on experiment
@SEQ_ID GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT + !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
@
- header, unique read name per flowcellGAT...
- the nucleotide sequence+
- second header, usually blank!''...
- the phred quality scores for each base in the readThe read counts that align to each locus of a genome form a distribution
Count data have two important properties:
count data are not normally distributed
Techniques that assume data are normally distributed (like linear regression) are not appropriate for count data
Must account for this in one of two ways:
csv
or tsv
(tab separated) formatcounts <- read_tsv("verse_counts.tsv")
counts
## # A tibble: 55,416 × 9 ## gene P0_1 P0_2 P4_1 P4_2 P7_1 P7_2 Ad_1 Ad_2 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000102693.2 0 0 0 0 0 0 0 0 ## 2 ENSMUSG00000064842.3 0 0 0 0 0 0 0 0 ## 3 ENSMUSG00000051951.6 19 24 17 17 17 12 7 5 ## 4 ENSMUSG00000102851.2 0 0 0 0 1 0 0 0 ## 5 ENSMUSG00000103377.2 1 0 0 1 0 1 1 0 ## 6 ENSMUSG00000104017.2 0 3 0 0 0 1 0 0 ## 7 ENSMUSG00000103025.2 0 0 0 0 0 0 0 0 ## 8 ENSMUSG00000089699.2 0 0 0 0 0 0 0 0 ## 9 ENSMUSG00000103201.2 0 0 0 0 0 0 0 1 ## 10 ENSMUSG00000103147.2 0 0 0 0 0 0 0 0 ## # ℹ 55,406 more rows
There are typically three steps when analyzing a RNASeq count matrix:
nonzero_genes <- rowSums(counts[-1])!=0 filtered_counts <- counts[nonzero_genes,] slice_head(filtered_counts, n=5)
## # A tibble: 5 × 9 ## gene P0_1 P0_2 P4_1 P4_2 P7_1 P7_2 Ad_1 Ad_2 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000051951.6 19 24 17 17 17 12 7 5 ## 2 ENSMUSG00000102851.2 0 0 0 0 1 0 0 0 ## 3 ENSMUSG00000103377.2 1 0 0 1 0 1 1 0 ## 4 ENSMUSG00000104017.2 0 3 0 0 0 1 0 0 ## 5 ENSMUSG00000103201.2 0 0 0 0 0 0 0 1
nonzero_genes <- rowSums(counts[-1])>=2 filtered_counts <- counts[nonzero_genes,] slice_head(filtered_counts, n=5)
## # A tibble: 5 × 9 ## gene P0_1 P0_2 P4_1 P4_2 P7_1 P7_2 Ad_1 Ad_2 ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000051951.6 19 24 17 17 17 12 7 5 ## 2 ENSMUSG00000103377.2 1 0 0 1 0 1 1 0 ## 3 ENSMUSG00000104017.2 0 3 0 0 0 1 0 0 ## 4 ENSMUSG00000102331.2 5 8 2 6 6 11 4 3 ## 5 ENSMUSG00000025900.14 4 3 6 7 7 3 36 35
dplyr::select(filtered_counts, Ad_1) %>% filter(Ad_1 > 0) %>% # avoid taking log10(0) mutate(`log10(counts)`=log10(Ad_1)) %>% ggplot(aes(x=`log10(counts+1)`)) + geom_histogram(bins=50) + labs(title='log10(counts) distribution for Adult mouse')
\[ cpm_{s,i} = \frac{c_{s,i}}{l_s} * 10^6 \]
\[ cpm_{s,i} = \frac{c_{s,i}}{l_s} * 10^6 \] * \(c_{s,i} =\) raw count of gene \(i\) in sample \(s\) * \(l_s =\) library size for sample \(s\) * In principle, the proportion of read counts for each gene will be comparable between samples
library(DESeq2) # DESeq2 requires a counts matrix # column data (sample information), and a formula # the counts matrix *must be raw counts* count_mat <- as.matrix(filtered_counts[-1]) row.names(count_mat) <- filtered_counts$gene dds <- DESeqDataSetFromMatrix( countData=count_mat, colData=tibble(sample_name=colnames(filtered_counts[-1])), design=~1 # no formula needed, ~1 produces a trivial design matrix )
# compute normalization factors dds <- estimateSizeFactors(dds) # extract the normalized counts deseq_norm_counts <- as_tibble(counts(dds,normalized=TRUE)) %>% mutate(gene=filtered_counts$gene) %>% relocate(gene) # relocate changes column order
The DESeq2 normalization procedure has two important considerations:
The CPM normalization procedure does not borrow information across all samples, and therefore is not subject to these considerations.
log()
rlog()
function that performs a regularized logarithmic transformation that avoids these biases# the dds is the DESeq2 object from earlier rld <- rlog(dds)
# Differential Expression Analysis
model.matrix()
functionad_metadata
## # A tibble: 6 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## 3 A3 82 AD 40287 45365 17655 127131 2 ## 4 C1 80 Control 62684 93739 131595 124075 3 ## 5 C2 77 Control 63598 69838 7189 35597 3 ## 6 C3 72 Control 52951 19444 54673 17221 2
~ condition
argument is an an R formulamodel.matrix(~ condition, data=ad_metadata)
model.matrix(~ condition, data=ad_metadata)
## (Intercept) conditionAD ## 1 1 1 ## 2 1 1 ## 3 1 1 ## 4 1 0 ## 5 1 0 ## 6 1 0 ## attr(,"assign") ## [1] 0 1 ## attr(,"contrasts") ## attr(,"contrasts")$condition ## [1] "contr.treatment"
The general format of a formula is as follows:
# portions in [] are optional [<outcome variable>] ~ <predictor 1> [+ <predictor 2>]... # examples # model Gene 3 expression as a function of disease status `Gene 3` ~ condition # model the amount of tau pathology as a function of abeta pathology, # adjusting for age at death tau ~ age_at_death + abeta # can create a model without an outcome variable ~ age_at_death + condition
age_at_death
as a covariate in the model:model.matrix(~ age_at_death + condition, data=ad_metadata)
model.matrix(~ age_at_death + condition, data=ad_metadata)
## (Intercept) age_at_death conditionAD ## 1 1 73 1 ## 2 1 82 1 ## 3 1 82 1 ## 4 1 80 0 ## 5 1 77 0 ## 6 1 72 0 ## attr(,"assign") ## [1] 0 1 2 ## attr(,"contrasts") ## attr(,"contrasts")$condition ## [1] "contr.treatment"
li
near m
odels of mi
croarraysExpressionSet
or SummarizedExperiment
containerad_se <- SummarizedExperiment( assays=list(intensities=intensities), colData=ad_metadata, rowData=rownames(intensities) ) # design matrix for AD vs control, adjusting for age at death ad_vs_control_model <- model.matrix( ~ age_at_death + condition, data=ad_metadata )
# now run limma # first fit all genes with the model fit <- limma::lmFit( assays(se)$intensities, ad_vs_control_model ) # now better control fit errors with the empirical Bayes method fit <- limma::eBayes(fit)
limma::topTable()
function:# the coefficient name conditionAD is the column name in the design matrix topTable(fit, coef="conditionAD", adjust="BH", number=5)
## logFC AveExpr t P.Value adj.P.Val B ## 206354_at -2.7507429 5.669691 -5.349105 3.436185e-05 0.9992944 -2.387433 ## 234942_s_at 0.9906708 8.397425 5.206286 4.726372e-05 0.9992944 -2.447456 ## 243618_s_at -0.6879431 7.148455 -4.864695 1.020980e-04 0.9992944 -2.598600 ## 223703_at 0.8693657 6.154392 4.745033 1.340200e-04 0.9992944 -2.654092 ## 244391_at -0.5251423 5.532712 -4.691482 1.514215e-04 0.9992944 -2.679354
\[ f(k;\lambda) = \mathtt{Pr}(X=k) = \frac{\lambda^k e^{-\lambda}}{k!} \]
\[ \lambda = \mathtt{E}(X) = \mathtt{Var}(X) \]
\[ f(k; r,p) = \mathtt{Pr}(X=k) = \binom{k+r-1}{k} (1-p)^k p^r \]
\[ p = \frac{\mu}{\sigma^2}, r = \frac{\mu^2}{\sigma^2 - \mu} \]
\[ D = \frac{\sigma^2}{\mu} \]
As mentioned briefly above, DESeq2 requires three pieces of information to perform differential expression:
library(DESeq2) # the counts matrix *must be raw counts* count_mat <- as.matrix(filtered_counts[-1]) row.names(count_mat) <- filtered_counts$gene # create a sample matrix from sample names sample_info <- tibblesample_name=colnames(filtered_counts[-1])) %>% separate(sample_name, c("timepoint","replicate"),sep="_",remove=FALSE ) %>% mutate( timepoint=factor(timepoint,levels=c("Ad","P0","P4","P7")) ) design <- formula(~ timepoint)
sample_info
## # A tibble: 8 × 3 ## sample_name timepoint replicate ## <chr> <fct> <chr> ## 1 P0_1 P0 1 ## 2 P0_2 P0 2 ## 3 P4_1 P4 1 ## 4 P4_2 P4 2 ## 5 P7_1 P7 1 ## 6 P7_2 P7 2 ## 7 Ad_1 Ad 1 ## 8 Ad_2 Ad 2
dds <- DESeqDataSetFromMatrix( countData=count_mat, colData=sample_info, design=design ) dds <- DESeq(dds) resultsNames(dds)
## [1] "Intercept" "timepoint_P0_vs_Ad" "timepoint_P4_vs_Ad" ## [4] "timepoint_P7_vs_Ad"
res <- results(dds, name="timepoint_P0_vs_Ad") p0_vs_Ad_de <- as_tibble(res) %>% mutate(gene=rownames(res)) %>% relocate(gene) %>% arrange(pvalue) head(p0_vs_Ad_de, 5)
## # A tibble: 5 × 7 ## gene baseMean log2FoldChange lfcSE stat pvalue padj ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000000031.17 19624. 6.75 0.187 36.2 2.79e-286 5.43e-282 ## 2 ENSMUSG00000026418.17 9671. 9.84 0.283 34.7 2.91e-264 2.83e-260 ## 3 ENSMUSG00000051855.16 3462. 5.35 0.155 34.5 4.27e-261 2.76e-257 ## 4 ENSMUSG00000027750.17 9482. 4.05 0.123 33.1 1.23e-239 5.98e-236 ## 5 ENSMUSG00000042828.13 3906. -5.32 0.165 -32.2 3.98e-227 1.55e-223
# number of genes significant at FDR < 0.05 filter(p0_vs_Ad_de,padj<0.05)
## # A tibble: 7,467 × 7 ## gene baseMean log2FoldChange lfcSE stat pvalue padj ## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 ENSMUSG00000000031.17 19624. 6.75 0.187 36.2 2.79e-286 5.43e-282 ## 2 ENSMUSG00000026418.17 9671. 9.84 0.283 34.7 2.91e-264 2.83e-260 ## 3 ENSMUSG00000051855.16 3462. 5.35 0.155 34.5 4.27e-261 2.76e-257 ## 4 ENSMUSG00000027750.17 9482. 4.05 0.123 33.1 1.23e-239 5.98e-236 ## 5 ENSMUSG00000042828.13 3906. -5.32 0.165 -32.2 3.98e-227 1.55e-223 ## 6 ENSMUSG00000046598.16 1154. -6.08 0.197 -30.9 5.77e-210 1.87e-206 ## 7 ENSMUSG00000019817.19 1952. 5.15 0.170 30.3 1.14e-201 3.18e-198 ## 8 ENSMUSG00000021622.4 17146. -4.57 0.155 -29.5 5.56e-191 1.35e-187 ## 9 ENSMUSG00000002500.16 2121. -6.20 0.216 -28.7 2.36e-181 5.10e-178 ## 10 ENSMUSG00000024598.10 1885. 5.90 0.208 28.4 1.46e-177 2.83e-174 ## # ℹ 7,457 more rows
mutate( p0_vs_Ad_de, `-log10(adjusted p)`=-log10(padj), `FDR<0.05`=padj<0.05 ) %>% ggplot(aes(x=log2FoldChange,y=`-log10(adjusted p)`,color=`FDR<0.05`)) + geom_point()
## Warning: Removed 9551 rows containing missing values or values outside the scale range ## (`geom_point()`).
design <- data.frame(swirl = c("swirl.1", "swirl.2", "swirl.3", "swirl.4"), condition = c(1, -1, 1, -1)) dge <- DGEList(counts=counts) keep <- filterByExpr(dge, design) dge <- dge[keep,,keep.lib.sizes=FALSE]
topTable()
as in microarray limma results:# limma trend logCPM <- cpm(dge, log=TRUE, prior.count=3) fit <- lmFit(logCPM, design) fit <- eBayes(fit, trend=TRUE) topTable(fit, coef=ncol(design))
genecards.org
* https://www.genecards.org/cgi-bin/carddisp.pl?gene=APOE&keywords=APOE
GO:NNNNNNN
, e.g. GO:0019319
.gmt
- Gene Set File Format.gmt
- Gene Matrix Transposelibrary('GSEABase') hallmarks_gmt <- getGmt(con='h.all.v7.5.1.symbols.gmt') hallmarks_gmt ## GeneSetCollection ## names: HALLMARK_TNFA_SIGNALING_VIA_NFKB, HALLMARK_HYPOXIA, ..., HALLMARK_PANCREAS_BETA_CELLS (50 total) ## unique identifiers: JUNB, CXCL2, ..., SRP14 (4383 total) ## types in collection: ## geneIdType: NullIdentifier (1 total) ## collectionType: NullCollection (1 total)
GO:0019319
)contingency_table <- matrix(c(13, 987, 23, 8977), 2, 2) fisher_results <- fisher.test(contingency_table, alternative='greater') fisher_results ## ## Fisher's Exact Test for Count Data ## ## data: contingency_table ## p-value = 2.382e-05 ## alternative hypothesis: true odds ratio is greater than 1 ## 95 percent confidence interval: ## 2.685749 Inf ## sample estimates: ## odds ratio ## 5.139308