Many ways to measure the abundance of RNA transcripts
SummarizedExperiment container standard way to load and work with gene expression data in Bioconductor
Stores the following information:
assays
- one or more measurement assays (e.g. gene expression) in the form of a feature by sample matrixcolData
- metadata associated with the samples (i.e. columns) of the assaysrowData
- metadata associated with the features (i.e. rows) of the assaysexptData
- additional metadata about the experiment itself, like protocol, project name, etcSummarizedExperiment
s# microarray expression dataset intensities intensities <- readr::read_delim( "example_intensity_data.csv",delim=" " ) # the first column of intensities tibble is the probesetId, # extract to pass as rowData rowData <- intensities["probeset_id"] # remove probeset IDs from tibble and turn into a R dataframe # so that we can assign rownames since tibbles don't support # row names intensities <- as.data.frame( select(intensities, -probeset_id) ) rownames(intensities) <- rowData$probeset_id
SummarizedExperiment
cont’d# these column data are made up, but you would have a # sample metadata file to use colData <- tibble( sample_name=colnames(intensities), condition=sample(c("case","control"),ncol(intensities),replace=TRUE) ) se <- SummarizedExperiment( assays=list(intensities=intensities), colData=colData, rowData=rowData )
SummarizedExperiment
cont’dse ## class: SummarizedExperiment ## dim: 54675 35 ## metadata(0): ## assays(1): intensities ## rownames(54675): 1007_s_at 1053_at ... AFFX-TrpnX-5_at AFFX-TrpnX-M_at ## rowData names(1): probeset_id ## colnames(35): GSM972389 GSM972390 ... GSM972512 GSM972521 ## colData names(2): sample_name condition
Four steps involved in analyzing gene expression microarray data:
affy
Bioconductor package
affy::ReadAffy
function# read all CEL files in a single directory affy_batch <- affy::ReadAffy(celfile.path="directory/of/CELfiles") # or individual files in different directories cel_filenames <- c( list.files( path="CELfile_dir1", full.names=TRUE, pattern=".CEL$" ), list.files( path="CELfile_dir2", full.names=TRUE, pattern=".CEL$" ) ) affy_batch <- affy::ReadAffy(filenames=cel_filenames)
affy
package provides functions to perform probe summarization and normalizationaffy::rma
function implements RMA, performs summarization and normalization of multiple arrays simultaneouslydata()
functionlibrary(affydata) data(Dilution)
library(affy) library(affydata) data(Dilution) # normalize the Dilution microarray expression values # note Dilution is an AffyBatch object eset_rma <- affy::rma(Dilution,verbose=FALSE)
ExpressionSet
vs SummarizedExperiment
ExpressionSet
is a container that contains high throughput assay dataSummarizedExperiment
affy
) return ExpressionSet
objects, not SummarizedExperiment
sOperation | ExpressionSet | SummarizedExperiment |
---|---|---|
Get expression values | exprs(eset) |
assays(se) |
Get column data | phenoData(eset) |
colData(se) |
Get row data | annotation(eset) |
rowData(se) |
ggplot2
ggplot2
Fundamentalsggplot2
ExampleA simple example dataset:
## # A tibble: 20 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## ... ## 10 A10 69 AD 48357 27260 47024 78507 2 ## 11 C1 80 Control 62684 93739 131595 124075 3 ## 12 C2 77 Control 63598 69838 7189 35597 3 ## ... ## 20 C10 73 Control 15781 16592 10271 100858 1
ggplot2
Exampleage_at_death
tau
ggplot( data=ad_metadata, mapping = aes( x = age_at_death, y = tau ) ) + geom_point()
ggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot2
Plot Componentsggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot()
- function creates a plotdata=
- pass a tibble with the datamapping=aes(...)
- Define an aesthetics mapping connecting data to plot propertiesgeom_point(...)
- Specify geometry as points where marks will be made at pairs of x,y coordinatesIs this the whole story?
There are both AD and Control subjects in this dataset!
How does condition
relate to this relationship we see?
Layer on an additional aesthetic of color:
ggplot( data=ad_metadata, mapping = aes( x = age_at_death, y = tau, color=condition # color each point ) ) + geom_point()
ggplot(data=ad_metadata, mapping=aes( x=age_at_death, y=tau, color=condition )) + geom_point()
Differences in distributions of variables can be important
Examine the distribution of age_at_death
for AD and Control samples with violin geometry with geom_violin()
:
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
patchwork
library:library(patchwork) age_boxplot <- ggplot( data=ad_metadata, mapping = aes(x=condition, y=age_at_death) ) + geom_boxplot() tau_boxplot <- ggplot( data=ad_metadata, mapping=aes(x=condition, y=tau) ) + geom_boxplot() age_boxplot | tau_boxplot # this puts the plots side by side
age_boxplot | tau_boxplot # this puts the plots side by side
ggplot
Mechanicsggplot
Mechanicsggplot
has two key concepts that give it great flexibility: layers and scalesggplot
LayersEach layer is a set of data connected to a geometry and an aesthetic
Each geom_X()
function adds a layer to a plot
The plot has three layers:
ggplot(data=ad_metadata, mapping=aes(x=age_at_death)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=abeta, color='red')) + geom_point(mapping=aes(y=iba1, color='cyan'))
ggplot
Layersggplot
Scalesggplot
ScalesHow many layers? How many scales?
ggplot
Incompatible Scalesggplot(data=ad_metadata, mapping=aes(x=braak_stage)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=age_at_death, color='red'))
ggplot
Incompatible Scalesggplot(ad_metadata, mapping = aes( x=ID, y=tau) ) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau)) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes( x=ID, y=tau, fill=condition) ) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
mutate(ad_metadata, tau_centered=(tau - mean(tau))) %>% ggplot(mapping = aes( x=ID, y=tau_centered, fill=condition) ) + geom_bar(stat="identity")
mutate(ad_metadata, tau_centered=(tau - mean(tau))) %>% ggplot(mapping = aes(x=ID, y=tau_centered, fill=condition)) + geom_bar(stat="identity")
geom_point
and geom_segment
layers:ggplot(ad_metadata) + geom_point(mapping=aes(x=ID, y=tau)) + geom_segment(mapping=aes(x=ID, xend=ID, y=0, yend=tau))
ggplot(ad_metadata) + geom_point(mapping=aes(x=ID, y=tau)) + geom_segment(mapping=aes(x=ID, xend=ID, y=0, yend=tau))
pivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to='Marker', values_to='Intensity' ) %>% ggplot(aes(x=ID,y=Intensity,group=Marker,fill=Marker)) + geom_area()
Stacked area plots require three pieces of data:
pivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to='Marker', values_to='Intensity' ) %>% group_by(ID) %>% # divide each intensity values by sum markers mutate( `Relative Intensity`=Intensity/sum(Intensity) ) %>% ungroup() %>% # ungroup restores the tibble to original number of rows ggplot(aes(x=ID,y=`Relative Intensity`,group=Marker,fill=Marker)) + geom_area()
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=10)
tibble(x=rnorm(1000)) %>% ggplot() + geom_histogram(aes(x=x))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tibble( x=c(rnorm(1000),rnorm(1000,mean=4)), type=c(rep('A',1000),rep('B',1000)) ) %>% ggplot(aes(x=x,fill=type)) + geom_histogram(bins=30, alpha=0.6, position="identity")
Similar to histogram, except instead of binning the values draws a smoothly interpolated line that approximates the distribution
Density plot is always normalized so the integral under the curve is approximately 1
ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa")
library(patchwork) hist_g <- ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=30) density_g <- ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa") hist_g | density_g
library(patchwork) normal_samples <- tibble( x=c(rnorm(1000),rnorm(1000,mean=4)), type=c(rep('A',1000),rep('B',1000)) ) hist_g <- ggplot(normal_samples) + geom_histogram( mapping=aes(x=x,fill=type), alpha=0.6, position="identity", bins=30 ) density_g <- ggplot(normal_samples) + geom_density( mapping=aes(x=x,fill=type), alpha=0.6, position="identity" ) hist_g | density_g
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
normal_samples <- tibble( x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)), type=c(rep('A',2000),rep('B',1000)) ) ggplot(normal_samples, aes(x=x,fill=type,alpha=0.6)) + geom_density()
ggplot(normal_samples, aes(x=type,y=x,fill=type)) + geom_boxplot()
library(patchwork) g <- ggplot(normal_samples, aes(x=type,y=x,fill=type)) boxplot_g <- g + geom_boxplot() violin_g <- g + geom_violin() boxplot_g | violin_g
ggplot(ad_metadata) + geom_violin(aes(x=condition,y=tau,fill=condition))
library(ggbeeswarm) ggplot(ad_metadata) + geom_beeswarm( aes(x=condition,y=age_at_death,color=condition), cex=2, size=2 )
normal_samples <- tibble( x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)), type=c(rep('A',2000),rep('B',1000)) ) ggplot(normal_samples, aes(x=type,y=x,color=type)) + geom_beeswarm()
ggplot(normal_samples, aes(x=type,y=x,color=type)) + geom_beeswarm()
normal_samples <- tibble( x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)), type=c(rep('A',200),rep('B',100)), category=sample(c('healthy','disease'),300,replace=TRUE) ) ggplot(normal_samples, aes(x=type,y=x,color=category)) + geom_beeswarm()
library(ggridges) tibble( x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)), type=c(rep('A',200),rep('B',100)), ) %>% ggplot(aes(y=type,x=x,fill=type)) + geom_density_ridges()
## Picking joint bandwidth of 0.892
tibble( x=rnorm(10000,mean=runif(10,1,10),sd=runif(2,1,4)), type=rep(c("A","B","C","D","E","F","G","H","I","J"),1000) ) %>% ggplot(aes(y=type,x=x,fill=type)) + geom_density_ridges(alpha=0.6,position="identity")
## Picking joint bandwidth of 0.499