The goal of data modeling is to find a concise mathematical description of a dataset that captures the patterns we care about. Building such a description involves two kinds of models: the scientific model, which expresses our hypothesis about the process that generated the data, and the statistical model, which describes the data mathematically. The scientific model and statistical model are related but independent choices we make.
Consider three imaginary genes measured in the blood of Parkinson's Disease (PD) patients and controls:
```r
library(tidyverse) # tibble, str_c, ggplot2, etc.

set.seed(1337)
gene_exp <- tibble(
  sample_name = c(str_c("P", 1:100), str_c("C", 1:100)),
  `Disease Status` = factor(c(rep("PD", 100), rep("Control", 100))),
  `Gene 1` = c(rnorm(100, 255, 10), rnorm(100, 250, 10)),
  `Gene 2` = c(rnorm(100, 520, 100), rnorm(100, 600, 60)),
  `Gene 3` = c(rnorm(100, 500, 40), rnorm(100, 330, 40))
)
```
```r
gene_exp
```
```
## # A tibble: 200 × 5
##    sample_name `Disease Status` `Gene 1` `Gene 2` `Gene 3`
##    <chr>       <fct>               <dbl>    <dbl>    <dbl>
##  1 P1          PD                   257.     636.     504.
##  2 P2          PD                   241.     556.     484.
##  3 P3          PD                   252.     426.     476.
##  4 P4          PD                   271.     405.     458.
##  5 P5          PD                   248.     482.     520.
##  6 P6          PD                   275.     521.     460.
##  7 P7          PD                   264.     415.     568.
##  8 P8          PD                   276.     450.     495.
##  9 P9          PD                   274.     490.     496.
## 10 P10         PD                   251.     584.     573.
## # ℹ 190 more rows
```
```r
pivot_longer(
    gene_exp,
    c(`Gene 1`, `Gene 2`, `Gene 3`),
    names_to="Gene",
    values_to="Expression"
  ) %>%
  ggplot(aes(x=`Disease Status`, y=Expression, fill=`Disease Status`)) +
  facet_wrap(vars(Gene)) +
  geom_violin()
```
```r
ggplot(gene_exp, aes(x=`Gene 1`)) +
  geom_histogram(bins=30, fill="#a93c13")
```
```r
ggplot(gene_exp, aes(x=`Gene 1`)) +
  geom_histogram(bins=30, fill="#a93c13") +
  geom_vline(xintercept=mean(gene_exp$`Gene 1`))
```
```r
library(patchwork)

well_behaved_data <- tibble(data = rnorm(1000))
# introduce some outliers
data_w_outliers <- tibble(data = c(rnorm(800), rep(5, 200)))

g_no_outlier <- ggplot(well_behaved_data, aes(x = data)) +
  geom_histogram(fill = "#56CBF9", color = "grey", bins = 30) +
  geom_vline(xintercept = mean(well_behaved_data$data)) +
  ggtitle("Mean example, no outliers")

g_outlier <- ggplot(data_w_outliers, aes(x = data)) +
  geom_histogram(fill = "#7FBEEB", color = "grey", bins = 30) +
  geom_vline(xintercept = mean(data_w_outliers$data)) +
  ggtitle("Mean example, big outliers")

g_no_outlier | g_outlier
```
```r
g_no_outlier <- ggplot(well_behaved_data, aes(x = data)) +
  geom_histogram(fill = "#AFBED1", color = "grey", bins = 30) +
  geom_vline(xintercept = median(well_behaved_data$data)) +
  ggtitle("Median example")

g_outlier <- ggplot(data_w_outliers, aes(x = data)) +
  geom_histogram(fill = "#7FBEEB", color = "grey", bins = 30) +
  geom_vline(xintercept = median(data_w_outliers$data)) +
  ggtitle("Median example, big outliers")

g_no_outlier | g_outlier
```
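The difference shows up numerically as well; a minimal sketch comparing the two statistics on the outlier-laden data from above:

```r
# the mean is pulled toward the outliers, while the median is
# barely affected
mean(data_w_outliers$data)   # ~1, inflated by the 200 values at 5
median(data_w_outliers$data) # ~0.3, close to the bulk of the data
```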
The standard deviation measures how widely the data spread around their center and is computed in R with the `sd()` function:
```r
g1_mean <- mean(gene_exp$`Gene 1`)
g1_sd <- sd(gene_exp$`Gene 1`)

ggplot(gene_exp, aes(x=`Gene 1`)) +
  geom_histogram(bins=30, fill="#a93c13") +
  geom_vline(xintercept=g1_mean) +
  geom_segment(x=g1_mean-g1_sd, y=10, xend=g1_mean+g1_sd, yend=10)
```
```r
data <- tibble(data = rnorm(1000, sd=1.75))

ggplot(data, aes(x = data)) +
  geom_histogram(fill = "#EAC5D8", color = "white", bins = 30) +
  geom_vline(xintercept = seq(-3,3,1) * sd(data$data)) +
  xlim(c(-6, 6)) +
  ggtitle("Standard deviations aplenty", paste("SD:", sd(data$data)))
```
```r
g1_mean <- mean(gene_exp$`Gene 1`)
g1_sd <- sd(gene_exp$`Gene 1`)

ggplot(gene_exp, aes(x=`Gene 1`)) +
  geom_histogram(aes(y=after_stat(density)), bins=30, fill="#a93c13") +
  stat_function(
    fun=dnorm,
    args=list(mean=g1_mean, sd=g1_sd),
    linewidth=2
  )
```
\[ Gene\;1 \sim \mathcal{N}(254, 11) \]
```r
g_norm <- ggplot(tibble(data = rnorm(5000)), aes(x = data)) +
  geom_histogram(fill = "#D0FCB3", bins = 50, color = "gray") +
  ggtitle("Normal distribution", "rnorm(n = 5000)")

g_unif <- ggplot(tibble(data = runif(5000)), aes(x = data)) +
  geom_histogram(fill = "#271F30", bins = 50, color = "white") +
  ggtitle("Uniform distribution", "runif(n = 5000)")

g_logistic <- ggplot(tibble(data = rlogis(5000)), aes(x = data)) +
  geom_histogram(fill = "#9BC59D", bins = 50, color = "black") +
  ggtitle("Logistic distribution", "rlogis(n = 5000)")

g_exp <- ggplot(tibble(data = rexp(5000, rate = 1)), aes(x = data)) +
  geom_histogram(fill = "#6C5A49", bins = 50, color = "white") +
  ggtitle("Exponential distribution", "rexp(n = 5000, rate = 1)")

(g_norm | g_unif) / (g_logistic | g_exp)
```
We chose to model our Gene 1 expression data using a normal distribution parameterized by the arithmetic mean and standard deviation.
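If this choice of model is reasonable, samples drawn from the fitted distribution should resemble the observed data. A minimal sketch of such a check, reusing `g1_mean` and `g1_sd` computed above:

```r
# draw as many samples from the fitted normal distribution as there
# are observations, then compare summary statistics to the real data
simulated <- rnorm(nrow(gene_exp), mean=g1_mean, sd=g1_sd)
summary(simulated)
summary(gene_exp$`Gene 1`)
```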
\[ Y_i = \beta_0 + \beta_1 \phi_1 ( X_{i1} ) + \beta_2 \phi_2 ( X_{i2} ) + \ldots + \beta_p \phi_p ( X_{ip} ) + \epsilon_i \]
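In the simplest case, each basis function \(\phi_j\) is the identity, which reduces the general form above to ordinary multiple linear regression:

\[ Y_i = \beta_0 + \beta_1 X_{i1} + \beta_2 X_{i2} + \ldots + \beta_p X_{ip} + \epsilon_i \]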
```r
library(ggbeeswarm)

ggplot(gene_exp, aes(x=`Disease Status`, y=`Gene 3`, color=`Disease Status`)) +
  geom_beeswarm()
```
```r
exp_summ <- pivot_longer(gene_exp, c(`Gene 3`)) %>%
  group_by(`Disease Status`) %>%
  summarize(mean=mean(value), sd=sd(value))

pd_mean <- filter(exp_summ, `Disease Status` == "PD")$mean
c_mean <- filter(exp_summ, `Disease Status` == "Control")$mean

ggplot(gene_exp, aes(x=`Gene 3`, fill=`Disease Status`)) +
  geom_histogram(bins=20, alpha=0.6, position="identity") +
  annotate("segment", x=c_mean, xend=pd_mean, y=20, yend=20,
           arrow=arrow(ends="both", angle=90)) +
  annotate("text", x=mean(c(c_mean, pd_mean)), y=21, hjust=0.5,
           label="How different?")
```
```r
pd_mean - c_mean
```
```
## [1] 164.0942
```
In other words, this point estimate suggests that, on average, Parkinson's patients have 164.1 units greater Gene 3 expression than Controls.
```r
ggplot(gene_exp, aes(x=`Disease Status`, y=`Gene 3`, color=`Disease Status`)) +
  geom_beeswarm() +
  annotate("segment", x=0, xend=3, y=2*c_mean-pd_mean, yend=2*pd_mean-c_mean)
```
In R, linear models are specified using formula syntax, with the response to the left of a `~` as in `Y ~ X`, or in our case `Gene 3 ~ Disease Status`:
```r
fit <- lm(`Gene 3` ~ `Disease Status`, data=gene_exp)
fit
```
```
## 
## Call:
## lm(formula = `Gene 3` ~ `Disease Status`, data = gene_exp)
## 
## Coefficients:
##        (Intercept)  `Disease Status`PD  
##              334.6               164.1
```
The `lm()` function fits the linear model and reports the estimated coefficients: the intercept (334.6) is the mean expression of the Control group, and the `Disease Status`PD coefficient (164.1) is the difference in group means we computed above. The `summary()` function prints a more complete description of the fit:

```r
summary(fit)
```
```
## 
## Call:
## lm(formula = `Gene 3` ~ `Disease Status`, data = gene_exp)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -124.303  -25.758   -2.434   30.518  119.348 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         334.578      4.414   75.80   <2e-16 ***
## `Disease Status`PD  164.094      6.243   26.29   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.14 on 198 degrees of freedom
## Multiple R-squared:  0.7773, Adjusted R-squared:  0.7761 
## F-statistic:   691 on 1 and 198 DF,  p-value: < 2.2e-16
```
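The statistics in this table can also be accessed programmatically from the matrix returned by `summary(fit)$coefficients`; for example, the p-value of the PD coefficient sits in row 2, column 4, a pattern we use below:

```r
# row 2 is the `Disease Status`PD term, column 4 is Pr(>|t|)
summary(fit)$coefficients[2, 4]
```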
```r
# naive solution
fit1 <- lm(`Gene 1` ~ `Disease Status`, data=gene_exp)
fit2 <- lm(`Gene 2` ~ `Disease Status`, data=gene_exp)
fit3 <- lm(`Gene 3` ~ `Disease Status`, data=gene_exp)
```
```r
gene_stats <- lapply(
    c("Gene 1", "Gene 2", "Gene 3"),
    function(gene) {
      model <- paste0('`', gene, '`', ' ~ `Disease Status`')
      fit <- lm(model, data=gene_exp)
      v <- c(
        gene,
        coefficients(fit),
        summary(fit)$coefficients[2, 4]
      )
      names(v) <- c("Gene", "Intercept", "PD", "pvalue")
      return(v)
    }
  ) %>%
  bind_rows() # turn the list into a tibble

# compute FDR from nominal p-values
gene_stats$padj <- p.adjust(gene_stats$pvalue, method="fdr")
```
```r
gene_stats
```

```
## # A tibble: 3 × 5
##   Gene   Intercept PD      pvalue        padj
##   <chr>  <chr>     <chr>   <chr>        <dbl>
## 1 Gene 1 250.410   6.959   3.921e-06 3.92e- 6
## 2 Gene 2 597.763   -93.629 2.373e-13 3.56e-13
## 3 Gene 3 334.577   164.094 1.727e-66 5.18e-66
```
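From here, a typical next step is to filter to the genes that pass a significance threshold; a minimal sketch, assuming the conventional FDR cutoff of 0.05:

```r
# keep only genes with an FDR-adjusted p-value below 0.05
dplyr::filter(gene_stats, padj < 0.05)
```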
\[ Y_i = g^{-1}(\beta_0 + \beta_1 \phi_1 ( X_{i1} ) + \beta_2 \phi_2 ( X_{i2} ) + \ldots + \beta_p \phi_p ( X_{ip} )) \]
```r
tibble(
    x = seq(-4, 4, by=0.1),
    `Probability Density` = dnorm(x, 0, 1)
  ) %>%
  ggplot(aes(x=x, y=`Probability Density`)) +
  geom_line() +
  labs(title="Probability Density Function for a Normal Distribution")
```
\[ P(X = x|\mu,\sigma) = \frac{1}{\sigma\sqrt{2\pi}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
```r
tibble(
    x = seq(-4, 4, by=0.1),
    PDF = dnorm(x, 0, 1),
    CDF = pnorm(x, 0, 1)
  ) %>%
  ggplot() +
  geom_line(aes(x=x, y=PDF, color="PDF")) +
  geom_line(aes(x=x, y=CDF, color="CDF"), linetype="dashed")
```
R provides four functions for each of the distributions it supports, formed by adding one of the prefixes `d`, `p`, `q`, and `r` to the distribution's base name, e.g. `norm` for the normal distribution:

- `dnorm(x, mean=0, sd=1)` - PDF of the normal distribution
- `pnorm(q, mean=0, sd=1)` - CDF of the normal distribution
- `qnorm(p, mean=0, sd=1)` - inverse CDF; accepts quantiles between 0 and 1 and returns the value of the distribution for those quantiles
- `rnorm(n, mean=0, sd=1)` - generate `n` samples from a normal distribution

Distribution | Probability Density Function |
---|---|
Normal | `dnorm(x, mean, sd)` |
t Distribution | `dt(x, df)` |
Poisson | `dpois(x, lambda)` |
Binomial | `dbinom(x, size, prob)` |
Negative Binomial | `dnbinom(x, size, prob, mu)` |
Exponential | `dexp(x, rate)` |
\(\chi^2\) | `dchisq(x, df)` |
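A quick way to build intuition for the `p*` and `q*` functions is to note that they are inverses of one another:

```r
# pnorm() maps a value to its cumulative probability;
# qnorm() maps that probability back to the value
pnorm(1.96)        # ~0.975
qnorm(pnorm(1.96)) # 1.96
```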
```r
library(statip) # NB: the Bernoulli distribution is not in base R

# dbern(x, prob, log = FALSE)
# pbern(q, prob, lower.tail = TRUE, log.p = FALSE)
# qbern(p, prob, lower.tail = TRUE, log.p = FALSE)
# rbern(n, prob)
rbern(10, 0.5)
```
```
## [1] 1 1 1 0 0 1 1 1 0 1
```
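As with the distributions below, the sample mean of many draws approximates the distribution's expected value, which for a Bernoulli distribution is simply `prob`:

```r
# expected value of Bernoulli(p) is p = 0.5
mean(rbern(1000, 0.5))
```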
\[ Pr(X = x|n,p) = {n \choose x} p^x (1-p) ^{(n-x)} \]
```r
# dbinom(x, size, prob, log = FALSE)
# pbinom(q, size, prob, lower.tail = TRUE, log.p = FALSE)
# qbinom(p, size, prob, lower.tail = TRUE, log.p = FALSE)
# rbinom(n, size, prob)
rbinom(10, 10, 0.5)
```
```
## [1] 3 8 7 5 6 4 6 6 5 7
```
```r
mean(rbinom(1000, 10, 0.5))
```
```
## [1] 4.999
```
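We can also check the PMF formula above directly against `dbinom()`, e.g. for the probability of exactly 5 successes in 10 trials with \(p=0.5\):

```r
# PMF computed manually from the formula...
choose(10, 5) * 0.5^5 * (1 - 0.5)^(10 - 5) # 0.2460938
# ...matches dbinom()
dbinom(5, 10, 0.5)
```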
\[ Pr(X = x|p) = (1-p)^x p \]
```r
# dgeom(x, prob, log = FALSE)
# pgeom(q, prob, lower.tail = TRUE, log.p = FALSE)
# qgeom(p, prob, lower.tail = TRUE, log.p = FALSE)
# rgeom(n, prob)
rgeom(10, 0.5)
```
```
## [1] 2 0 3 0 2 0 2 2 0 6
```
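Here too, the sample mean of many draws should approach the theoretical mean of the geometric distribution, \((1-p)/p\), which is 1 for \(p=0.5\):

```r
# mean of the geometric distribution (failures before the first
# success) is (1-p)/p = 1 when p = 0.5
mean(rgeom(1000, 0.5))
```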
\[ Pr(X=k|\lambda) = \frac {\lambda^k e^{-\lambda}} {k!} \]
```r
# dpois(x, lambda, log = FALSE)
# ppois(q, lambda, lower.tail = TRUE, log.p = FALSE)
# qpois(p, lambda, lower.tail = TRUE, log.p = FALSE)
# rpois(n, lambda)
rpois(10, 5)
```
```
## [1] 4 5 8 6 4 6 5 3 7 2
```
```r
mean(rpois(1000, 5))
```
```
## [1] 4.917
```
\[ Pr(X = x|r,p) = \binom{x+r-1}{r-1} p^r {(1-p)}^x \]
```r
# dnbinom(x, size, prob, mu, log = FALSE)
# pnbinom(q, size, prob, mu, lower.tail = TRUE, log.p = FALSE)
# qnbinom(p, size, prob, mu, lower.tail = TRUE, log.p = FALSE)
# rnbinom(n, size, prob, mu)
rnbinom(10, 10, 0.5)
```
```
## [1] 19 7 9 12 8 8 12 13 12 12
```
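The sample mean check works here as well; the theoretical mean of the negative binomial is \(r(1-p)/p\), or 10 for `size=10, prob=0.5`:

```r
# mean of NB(size=10, prob=0.5) is size*(1-prob)/prob = 10
mean(rnbinom(1000, 10, 0.5))
```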
\[ P(X=x|a,b) = \begin{cases} \frac{1}{b-a} & \text{for}\; a \le x \le b, \\ 0 & \text{otherwise} \end{cases} \]
```r
# dunif(x, min = 0, max = 1, log = FALSE)
# punif(q, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
# qunif(p, min = 0, max = 1, lower.tail = TRUE, log.p = FALSE)
# runif(n, min = 0, max = 1)
# e.g. runif(10, min=0, max=10) would sample between 0 and 10
runif(10)
```
```
##  [1] 0.62197160 0.20632568 0.13773154 0.94534494 0.24755702 0.33996150
##  [7] 0.28370202 0.06990996 0.65094488 0.63905524
```
```r
mean(runif(1000))
```
```
## [1] 0.5032486
```
\[ P(X = x|\mu,\sigma) = \frac{1}{\sigma\sqrt{2\pi}}e^{-\frac{(x-\mu)^2}{2\sigma^2}} \]
```r
# dnorm(x, mean = 0, sd = 1, log = FALSE)
# pnorm(q, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# qnorm(p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
# rnorm(n, mean = 0, sd = 1)
rnorm(10, mean=10, sd=10)
```
```
##  [1] 22.264245 22.609307 13.508107  5.892955 -5.542218 18.052817 19.157577
##  [8] 20.213437  3.250644 11.789966
```
```r
mean(rnorm(1000, mean=10, sd=10))
```
```
## [1] 10.04227
```
\[ P(X=x|k) = \begin{cases} \dfrac{ x^{\frac{k}{2}-1}e^{-\frac{x}{2}} }{ 2^{\frac{k}{2}}\,\Gamma\left ( \frac{k}{2} \right ) }, & x>0;\\ 0, & \text{otherwise} \end{cases} \]
```r
# dchisq(x, df, ncp = 0, log = FALSE)
# pchisq(q, df, ncp = 0, lower.tail = TRUE, log.p = FALSE)
# qchisq(p, df, ncp = 0, lower.tail = TRUE, log.p = FALSE)
# rchisq(n, df, ncp = 0)
rchisq(10, 10)
```
```
##  [1]  7.873837 12.733304  3.567957 11.308481  5.557218 10.716813  5.477301
##  [8]  7.458366  8.126337 16.742170
```
```r
mean(rchisq(1000, 10))
```
```
## [1] 9.816199
```
The `p*` functions above are theoretical distribution functions. The empirical cumulative distribution of an actual dataset can be computed with the `ecdf()` function:

```r
ecdf(d$Value)
```
```
## Empirical CDF 
## Call: ecdf(d$Value)
##  x[1:3000] = -6.1284, -6.0381, -5.8883,  ..., 25.555, 26.728
```
```r
plot(ecdf(d$Value))
```
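Note that `ecdf()` returns a function, which can be called on new values to obtain empirical cumulative probabilities:

```r
# ecdf() returns a function; calling it with a value gives the
# fraction of observations less than or equal to that value
F_hat <- ecdf(d$Value)
F_hat(0) # empirical P(Value <= 0)
```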
The `stats::prcomp()` function performs PCA in R:

```r
# intensities contains microarray expression data for ~54k probesets x 20
# samples

# transpose expression values so samples are rows
expr_mat <- intensities %>%
  pivot_longer(-c(probeset_id), names_to="sample") %>%
  pivot_wider(names_from=probeset_id)

# PCA expects all features (i.e. probesets) to be mean-centered;
# convert to dataframe so we can use rownames
expr_mat_centered <- as.data.frame(
  lapply(dplyr::select(expr_mat, -c(sample)), function(x) x - mean(x))
)
rownames(expr_mat_centered) <- expr_mat$sample
```
```r
# prcomp performs PCA
pca <- prcomp(
  expr_mat_centered,
  center=FALSE, # already centered
  scale=TRUE    # prcomp scales each feature to have unit variance
)
```
```r
# the str() function prints the structure of its argument
str(pca)
```
```
## List of 5
##  $ sdev    : num [1:20] 101.5 81 71.2 61.9 57.9 ...
##  $ rotation: num [1:54675, 1:20] 0.00219 0.00173 -0.00313 0.00465 0.0034 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:54675] "X1007_s_at" "X1053_at" "X117_at" "X121_at" ...
##   .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : logi FALSE
##  $ scale   : Named num [1:54675] 0.379 0.46 0.628 0.272 0.196 ...
##   ..- attr(*, "names")= chr [1:54675] "X1007_s_at" "X1053_at" "X117_at" "X121_at" ...
##  $ x       : num [1:20, 1:20] -67.94 186.14 6.07 -70.72 9.58 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:20] "A1" "A2" "A3" "A4" ...
##   .. ..$ : chr [1:20] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
```
The result of `prcomp()` is a list with five members:

- `sdev` - the standard deviation (i.e. the square root of the variance) for each component
- `rotation` - a matrix where the principal components are in columns
- `x` - the projections of the original data
- `center` - if `center=TRUE` was passed, a vector of feature means
- `scale` - if `scale=TRUE` was passed, a vector of the feature variances

Each principal component explains a fraction of the overall variance in the dataset. The `prcomp()` result doesn't provide this fraction directly, but the `sdev` variable returned by `prcomp()` may be used to calculate it: square each standard deviation to obtain the variance explained by each component, then divide by the sum.

```r
library(patchwork)

pca_var <- tibble(
  PC=factor(str_c("PC", 1:20), str_c("PC", 1:20)),
  Variance=pca$sdev**2,
  `% Explained Variance`=Variance/sum(Variance)*100,
  `Cumulative % Explained Variance`=cumsum(`% Explained Variance`)
)
```
```r
exp_g <- pca_var %>%
  ggplot(aes(x=PC, y=`% Explained Variance`, group=1)) +
  geom_point() +
  geom_line() +
  theme(axis.text.x=element_text(angle=90, hjust=0, vjust=0.5))

cum_g <- pca_var %>%
  ggplot(aes(x=PC, y=`Cumulative % Explained Variance`, group=1)) +
  geom_point() +
  geom_line() +
  ylim(0, 100) + # set y limits to [0,100]
  theme(axis.text.x=element_text(angle=90, hjust=0, vjust=0.5))

exp_g | cum_g
```
```r
as_tibble(pca$x) %>%
  ggplot(aes(x=PC1, y=PC2)) +
  geom_point()
```
Pairs of principal components can be compared in a single plot with the `ggpairs()` function in the GGally package:

```r
library(GGally)

as_tibble(pca$x) %>%
  dplyr::select(PC1:PC6) %>%
  ggpairs()
```
```r
as_tibble(pca$x) %>%
  pivot_longer(everything(), names_to="PC", values_to="projection") %>%
  mutate(PC=fct_relevel(PC, str_c("PC", 1:20))) %>%
  ggplot(aes(x=PC, y=projection)) +
  geom_beeswarm() +
  labs(title="PCA Projection Plot")
```
```r
as_tibble(pca$x) %>%
  mutate(type=stringr::str_sub(rownames(pca$x), 1, 1)) %>%
  ggplot(aes(x=PC1, y=PC2, color=type)) +
  geom_point()
```
```r
library(GGally)

as_tibble(pca$x) %>%
  mutate(type=stringr::str_sub(rownames(pca$x), 1, 1)) %>%
  dplyr::select(c(type, PC1:PC6)) %>%
  ggpairs(columns=1:6, mapping=aes(fill=type))
```
\[ d(p, q) = \sqrt{\sum_{i=1}^{n}(p_i - q_i)^2} \]
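As a minimal illustration, the formula computed by hand agrees with R's `dist()` function, which we use below:

```r
# euclidean distance between two points, computed manually and with dist()
p <- c(1, 2)
q <- c(4, 6)
sqrt(sum((p - q)^2)) # 5
dist(rbind(p, q))    # same result
```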
Below we will cluster the synthetic dataset introduced above in R:
```r
ggplot(well_clustered_data, aes(x=f1, y=f2)) +
  geom_point() +
  labs(title="Unclustered")
```
```r
# compute all pairwise distances using euclidean distance
# as the distance metric
euc_dist <- dist(dplyr::select(well_clustered_data, -ID))

# produce a clustering of the data using hclust for
# hierarchical clustering
hc <- hclust(euc_dist, method="ave")

# add ID as labels to the clustering object
hc$labels <- well_clustered_data$ID
```
```r
str(hc)
```
```
## List of 7
##  $ merge      : int [1:59, 1:2] -50 -33 -48 -46 -44 -21 -11 -4 -36 -3 ...
##  $ height     : num [1:59] 0.00429 0.08695 0.23536 0.24789 0.25588 ...
##  $ order      : int [1:60] 8 9 7 6 19 18 11 16 4 14 ...
##  $ labels     : chr [1:60] "A1" "A2" "A3" "A4" ...
##  $ method     : chr "average"
##  $ call       : language hclust(d = euc_dist, method = "ave")
##  $ dist.method: chr "euclidean"
##  - attr(*, "class")= chr "hclust"
```
The `hclust()` return object describes the clustering as a tree, which can be visualized with a dendrogram using the ggdendro package:

```r
library(ggdendro)

ggdendrogram(hc)
```
We can use `cutree()` to divide the tree into three groups using `k=3`:

```r
labels <- cutree(hc, k=3)
labels
```
```
##  A1  A2  A3  A4  A5  A6  A7  A8  A9 A10 A11 A12 A13 A14 A15 A16 A17 A18 A19 A20 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  B1  B2  B3  B4  B5  B6  B7  B8  B9 B10 B11 B12 B13 B14 B15 B16 B17 B18 B19 B20 
##   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
##  C1  C2  C3  C4  C5  C6  C7  C8  C9 C10 C11 C12 C13 C14 C15 C16 C17 C18 C19 C20 
##   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3
```
```r
# we turn our labels into a tibble so we can join them with the original tibble
well_clustered_data %>%
  left_join(
    tibble(
      ID=names(labels),
      label=as.factor(labels)
    )
  ) %>%
  ggplot(aes(x=f1, y=f2, color=label)) +
  geom_point() +
  labs(title="Clustered")
```
```
## Joining with `by = join_by(ID)`
```