Many ways to measure the abundance of RNA transcripts
SummarizedExperiment container standard way to load and work with gene expression data in Bioconductor
Stores the following information:
assays - one or more measurement assays (e.g. gene expression) in the form of a feature by sample matrixcolData - metadata associated with the samples (i.e. columns) of the assaysrowData - metadata associated with the features (i.e. rows) of the assaysexptData - additional metadata about the experiment itself, like protocol, project name, etc
SummarizedExperiments# microarray expression dataset intensities intensities <- readr::read_delim( "example_intensity_data.csv",delim=" " ) # the first column of intensities tibble is the probesetId, # extract to pass as rowData rowData <- intensities["probeset_id"] # remove probeset IDs from tibble and turn into a R dataframe # so that we can assign rownames since tibbles don't support # row names intensities <- as.data.frame( select(intensities, -probeset_id) ) rownames(intensities) <- rowData$probeset_id
SummarizedExperiment cont’d# these column data are made up, but you would have a
# sample metadata file to use
colData <- tibble(
sample_name=colnames(intensities),
condition=sample(c("case","control"),ncol(intensities),replace=TRUE)
)
se <- SummarizedExperiment(
assays=list(intensities=intensities),
colData=colData,
rowData=rowData
)
SummarizedExperiment cont’dse ## class: SummarizedExperiment ## dim: 54675 35 ## metadata(0): ## assays(1): intensities ## rownames(54675): 1007_s_at 1053_at ... AFFX-TrpnX-5_at AFFX-TrpnX-M_at ## rowData names(1): probeset_id ## colnames(35): GSM972389 GSM972390 ... GSM972512 GSM972521 ## colData names(2): sample_name condition
Four steps involved in analyzing gene expression microarray data:
affy Bioconductor package
affy::ReadAffy function# read all CEL files in a single directory affy_batch <- affy::ReadAffy(celfile.path="directory/of/CELfiles") # or individual files in different directories cel_filenames <- c( list.files( path="CELfile_dir1", full.names=TRUE, pattern=".CEL$" ), list.files( path="CELfile_dir2", full.names=TRUE, pattern=".CEL$" ) ) affy_batch <- affy::ReadAffy(filenames=cel_filenames)
affy package provides functions to perform probe summarization and normalizationaffy::rma function implements RMA, performs summarization and normalization of multiple arrays simultaneouslydata() functionlibrary(affydata) data(Dilution)
library(affy) library(affydata) data(Dilution) # normalize the Dilution microarray expression values # note Dilution is an AffyBatch object eset_rma <- affy::rma(Dilution,verbose=FALSE)
ExpressionSet vs SummarizedExperimentExpressionSet is a container that contains high throughput assay dataSummarizedExperimentaffy) return ExpressionSet objects, not SummarizedExperiments| Operation | ExpressionSet | SummarizedExperiment |
|---|---|---|
| Get expression values | exprs(eset) |
assays(se) |
| Get column data | phenoData(eset) |
colData(se) |
| Get row data | annotation(eset) |
rowData(se) |
ggplot2ggplot2 Fundamentalsggplot2 ExampleA simple example dataset:
## # A tibble: 20 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## ... ## 10 A10 69 AD 48357 27260 47024 78507 2 ## 11 C1 80 Control 62684 93739 131595 124075 3 ## 12 C2 77 Control 63598 69838 7189 35597 3 ## ... ## 20 C10 73 Control 15781 16592 10271 100858 1
ggplot2 Exampleage_at_deathtauggplot(
data=ad_metadata,
mapping = aes(
x = age_at_death,
y = tau
)
) +
geom_point()ggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot2 Plot Componentsggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot() - function creates a plotdata= - pass a tibble with the datamapping=aes(...) - Define an aesthetics mapping connecting data to plot propertiesgeom_point(...) - Specify geometry as points where marks will be made at pairs of x,y coordinatesIs this the whole story?
There are both AD and Control subjects in this dataset!
How does condition relate to this relationship we see?
Layer on an additional aesthetic of color:
ggplot(
data=ad_metadata,
mapping = aes(
x = age_at_death,
y = tau,
color=condition # color each point
)
) +
geom_point()ggplot(data=ad_metadata, mapping=aes(
x=age_at_death, y=tau, color=condition
)) + geom_point()
Differences in distributions of variables can be important
Examine the distribution of age_at_death for AD and Control samples with violin geometry with geom_violin():
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
patchwork library:library(patchwork)
age_boxplot <- ggplot(
data=ad_metadata,
mapping = aes(x=condition, y=age_at_death)
) +
geom_boxplot()
tau_boxplot <- ggplot(
data=ad_metadata,
mapping=aes(x=condition, y=tau)
) +
geom_boxplot()
age_boxplot | tau_boxplot # this puts the plots side by side
age_boxplot | tau_boxplot # this puts the plots side by side
ggplot Mechanicsggplot Mechanicsggplot has two key concepts that give it great flexibility: layers and scalesggplot LayersEach layer is a set of data connected to a geometry and an aesthetic
Each geom_X() function adds a layer to a plot
The plot has three layers:
ggplot(data=ad_metadata, mapping=aes(x=age_at_death)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=abeta, color='red')) + geom_point(mapping=aes(y=iba1, color='cyan'))
ggplot Layersggplot Scalesggplot ScalesHow many layers? How many scales?
ggplot Incompatible Scalesggplot(data=ad_metadata, mapping=aes(x=braak_stage)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=age_at_death, color='red'))
ggplot Incompatible Scalesggplot(ad_metadata,
mapping = aes(
x=ID,
y=tau)
) +
geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau)) + geom_bar(stat="identity")
ggplot(ad_metadata,
mapping = aes(
x=ID,
y=tau,
fill=condition)
) +
geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
mutate(ad_metadata, tau_centered=(tau - mean(tau))) %>%
ggplot(mapping = aes(
x=ID,
y=tau_centered,
fill=condition)
) +
geom_bar(stat="identity")
mutate(ad_metadata, tau_centered=(tau - mean(tau))) %>% ggplot(mapping = aes(x=ID, y=tau_centered, fill=condition)) + geom_bar(stat="identity")
geom_point and geom_segment layers:ggplot(ad_metadata) + geom_point(mapping=aes(x=ID, y=tau)) + geom_segment(mapping=aes(x=ID, xend=ID, y=0, yend=tau))
ggplot(ad_metadata) + geom_point(mapping=aes(x=ID, y=tau)) + geom_segment(mapping=aes(x=ID, xend=ID, y=0, yend=tau))
pivot_longer(
ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
ggplot(aes(x=ID,y=Intensity,group=Marker,fill=Marker)) +
geom_area()
Stacked area plots require three pieces of data:
pivot_longer(
ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
group_by(ID) %>% # divide each intensity values by sum markers
mutate(
`Relative Intensity`=Intensity/sum(Intensity)
) %>%
ungroup() %>% # ungroup restores the tibble to original number of rows
ggplot(aes(x=ID,y=`Relative Intensity`,group=Marker,fill=Marker)) +
geom_area()
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=10)
tibble(x=rnorm(1000)) %>% ggplot() + geom_histogram(aes(x=x))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tibble(
x=c(rnorm(1000),rnorm(1000,mean=4)),
type=c(rep('A',1000),rep('B',1000))
) %>%
ggplot(aes(x=x,fill=type)) +
geom_histogram(bins=30, alpha=0.6, position="identity")
Similar to histogram, except instead of binning the values draws a smoothly interpolated line that approximates the distribution
Density plot is always normalized so the integral under the curve is approximately 1
ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa")
library(patchwork) hist_g <- ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=30) density_g <- ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa") hist_g | density_g
library(patchwork)
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,mean=4)),
type=c(rep('A',1000),rep('B',1000))
)
hist_g <- ggplot(normal_samples) +
geom_histogram(
mapping=aes(x=x,fill=type),
alpha=0.6, position="identity", bins=30
)
density_g <- ggplot(normal_samples) +
geom_density(
mapping=aes(x=x,fill=type),
alpha=0.6, position="identity"
)
hist_g | density_g
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)),
type=c(rep('A',2000),rep('B',1000))
)
ggplot(normal_samples, aes(x=x,fill=type,alpha=0.6)) +
geom_density()
ggplot(normal_samples, aes(x=type,y=x,fill=type)) + geom_boxplot()
library(patchwork) g <- ggplot(normal_samples, aes(x=type,y=x,fill=type)) boxplot_g <- g + geom_boxplot() violin_g <- g + geom_violin() boxplot_g | violin_g
ggplot(ad_metadata) + geom_violin(aes(x=condition,y=tau,fill=condition))
library(ggbeeswarm)
ggplot(ad_metadata) +
geom_beeswarm(
aes(x=condition,y=age_at_death,color=condition),
cex=2,
size=2
)
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)),
type=c(rep('A',2000),rep('B',1000))
)
ggplot(normal_samples, aes(x=type,y=x,color=type)) +
geom_beeswarm()
ggplot(normal_samples, aes(x=type,y=x,color=type)) + geom_beeswarm()
normal_samples <- tibble(
x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)),
type=c(rep('A',200),rep('B',100)),
category=sample(c('healthy','disease'),300,replace=TRUE)
)
ggplot(normal_samples, aes(x=type,y=x,color=category)) +
geom_beeswarm()
library(ggridges)
tibble(
x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)),
type=c(rep('A',200),rep('B',100)),
) %>%
ggplot(aes(y=type,x=x,fill=type)) +
geom_density_ridges()
## Picking joint bandwidth of 0.892
tibble(
x=rnorm(10000,mean=runif(10,1,10),sd=runif(2,1,4)),
type=rep(c("A","B","C","D","E","F","G","H","I","J"),1000)
) %>%
ggplot(aes(y=type,x=x,fill=type)) +
geom_density_ridges(alpha=0.6,position="identity")
## Picking joint bandwidth of 0.499