An effective data visualization:
A great visualization has some additional properties:
data <- tibble( x=rnorm(100), y=rnorm(100,0,5) ) data %>% ggplot(aes(x=x,y=y)) + geom_point()
data <- mutate(data, `x times y`=abs(x*y) ) data %>% ggplot(aes(x=x,y=y,size=`x times y`)) + geom_point()
data <- mutate(data, z=runif(100), category=sample(c('A','B','C'), 100, replace=TRUE) ) data %>% ggplot(aes(x=x, y=y, size=`x times y`, color=z, shape=category)) + geom_point()
data <- mutate(data, `x + y`=x+y) %>% arrange(`x + y`) %>% mutate( xend=lag(x,1), yend=lag(y,1) ) data %>% ggplot() + geom_segment(aes(x=x,xend=xend,y=y,yend=yend,alpha=0.5)) + geom_point(aes(x=x, y=y, size=`x times y`, color=z, shape=category))
tibble( value=rnorm(100), category='A' ) %>% ggplot(aes(x=category,y=value,fill=category)) + geom_violin()
Plot | \(x\) encoding | \(y\) encoding | note |
---|---|---|---|
scatter | position | position | |
vertical bar | position | height | |
horizontal bar | width | position | |
lollipop | position | height for line + position for “head” | |
violin | position | width | \(x\) transformed to range, \(y\) transformed to densities |
# two groups of samples with similar random data profiles data <- as.matrix( tibble( A=c(rnorm(10),rnorm(10,2)), B=c(rnorm(10),rnorm(10,2)), C=c(rnorm(10),rnorm(10,2)), D=c(rnorm(10,4),rnorm(10,-1)), E=c(rnorm(10,4),rnorm(10,-1)), F=c(rnorm(10,4),rnorm(10,-1)) ) ) rownames(data) <- paste0('G',seq(nrow(data))) heatmap(data)
library(patchwork) data <- tibble( percent=c(86,88,87,90,93,89), ID=c('A','B','C','D','E','F') ) g <- ggplot(data, aes(x=ID,y=percent)) + geom_bar(stat="identity") g + coord_cartesian(ylim=c(85,95)) | g
ggplot2
ggplot2
Fundamentalsggplot2
ExampleA simple example dataset:
## # A tibble: 20 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## ... ## 10 A10 69 AD 48357 27260 47024 78507 2 ## 11 C1 80 Control 62684 93739 131595 124075 3 ## 12 C2 77 Control 63598 69838 7189 35597 3 ## ... ## 20 C10 73 Control 15781 16592 10271 100858 1
ggplot2
Exampleage_at_death
tau
ggplot( data=ad_metadata, mapping = aes( x = age_at_death, y = tau ) ) + geom_point()
ggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot2
Plot Componentsggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot()
- function creates a plotdata=
- pass a tibble with the datamapping=aes(...)
- Define an aesthetics mapping connecting data to plot propertiesgeom_point(...)
- Specify geometry as points where marks will be made at pairs of x,y coordinatesIs this the whole story?
There are both AD and Control subjects in this dataset!
How does condition
relate to this relationship we see?
Layer on an additional aesthetic of color:
ggplot( data=ad_metadata, mapping = aes( x = age_at_death, y = tau, color=condition # color each point ) ) + geom_point()
ggplot(data=ad_metadata, mapping=aes( x=age_at_death, y=tau, color=condition )) + geom_point()
Differences in distributions of variables can be important
Examine the distribution of age_at_death
for AD and Control samples with violin geometry with geom_violin()
:
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
patchwork
library:library(patchwork) age_boxplot <- ggplot( data=ad_metadata, mapping = aes(x=condition, y=age_at_death) ) + geom_boxplot() tau_boxplot <- ggplot( data=ad_metadata, mapping=aes(x=condition, y=tau) ) + geom_boxplot() age_boxplot | tau_boxplot # this puts the plots side by side
age_boxplot | tau_boxplot # this puts the plots side by side
ggplot
Mechanicsggplot
Mechanicsggplot
has two key concepts that give it great flexibility: layers and scalesggplot
LayersEach layer is a set of data connected to a geometry and an aesthetic
Each geom_X()
function adds a layer to a plot
The plot has three layers:
ggplot(data=ad_metadata, mapping=aes(x=age_at_death)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=abeta, color='red')) + geom_point(mapping=aes(y=iba1, color='cyan'))
ggplot
Layersggplot
Scalesggplot
ScalesHow many layers? How many scales?
ggplot
Incompatible Scalesggplot(data=ad_metadata, mapping=aes(x=braak_stage)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=age_at_death, color='red'))
ggplot
Incompatible Scalesggplot(ad_metadata, mapping = aes( x=ID, y=tau) ) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau)) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
mutate(ad_metadata, tau_centered=(tau - mean(tau))) %>% ggplot(mapping = aes(x=ID, y=tau_centered, fill=condition)) + geom_bar(stat="identity")
geom_point
and geom_segment
layers:ggplot(ad_metadata) + geom_point(mapping=aes(x=ID, y=tau)) + geom_segment(mapping=aes(x=ID, xend=ID, y=0, yend=tau))
pivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to='Marker', values_to='Intensity' ) %>% ggplot(aes(x=ID,y=Intensity,group=Marker,fill=Marker)) + geom_area()
Stacked area plots require three pieces of data:
pivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to='Marker', values_to='Intensity' ) %>% group_by(ID) %>% # divide each intensity values by sum markers mutate( `Relative Intensity`=Intensity/sum(Intensity) ) %>% ungroup() %>% # ungroup restores the tibble to original number of rows ggplot(aes(x=ID,y=`Relative Intensity`,group=Marker,fill=Marker)) + geom_area()
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=10)
tibble(x=rnorm(1000)) %>% ggplot() + geom_histogram(aes(x=x))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tibble( x=c(rnorm(1000),rnorm(1000,mean=4)), type=c(rep('A',1000),rep('B',1000)) ) %>% ggplot(aes(x=x,fill=type)) + geom_histogram(bins=30, alpha=0.6, position="identity")
Similar to histogram, except instead of binning the values draws a smoothly interpolated line that approximates the distribution
Density plot is always normalized so the integral under the curve is approximately 1
ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa")
library(patchwork) hist_g <- ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=30) density_g <- ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa") hist_g | density_g
library(patchwork) normal_samples <- tibble( x=c(rnorm(1000),rnorm(1000,mean=4)), type=c(rep('A',1000),rep('B',1000)) ) hist_g <- ggplot(normal_samples) + geom_histogram( mapping=aes(x=x,fill=type), alpha=0.6, position="identity", bins=30 ) density_g <- ggplot(normal_samples) + geom_density( mapping=aes(x=x,fill=type), alpha=0.6, position="identity" ) hist_g | density_g
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
normal_samples <- tibble( x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)), type=c(rep('A',2000),rep('B',1000)) ) ggplot(normal_samples, aes(x=x,fill=type,alpha=0.6)) + geom_density()
ggplot(normal_samples, aes(x=type,y=x,fill=type)) + geom_boxplot()
library(patchwork) g <- ggplot(normal_samples, aes(x=type,y=x,fill=type)) boxplot_g <- g + geom_boxplot() violin_g <- g + geom_violin() boxplot_g | violin_g
ggplot(ad_metadata) + geom_violin(aes(x=condition,y=tau,fill=condition))
library(ggbeeswarm) ggplot(ad_metadata) + geom_beeswarm( aes(x=condition,y=age_at_death,color=condition), cex=2, size=2 )
normal_samples <- tibble( x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)), type=c(rep('A',2000),rep('B',1000)) ) ggplot(normal_samples, aes(x=type,y=x,color=type)) + geom_beeswarm()
ggplot(normal_samples, aes(x=type,y=x,color=type)) + geom_beeswarm()
normal_samples <- tibble( x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)), type=c(rep('A',200),rep('B',100)), category=sample(c('healthy','disease'),300,replace=TRUE) ) ggplot(normal_samples, aes(x=type,y=x,color=category)) + geom_beeswarm()
library(ggridges) tibble( x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)), type=c(rep('A',200),rep('B',100)), ) %>% ggplot(aes(y=type,x=x,fill=type)) + geom_density_ridges()
## Picking joint bandwidth of 0.822
tibble( x=rnorm(10000,mean=runif(10,1,10),sd=runif(2,1,4)), type=rep(c("A","B","C","D","E","F","G","H","I","J"),1000) ) %>% ggplot(aes(y=type,x=x,fill=type)) + geom_density_ridges(alpha=0.6,position="identity")
## Picking joint bandwidth of 0.494
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3)
g <- ggplot() for(x in 0:5) { for(y in 0:4) { if(x+y*6 < 26) { g <- g + geom_point( tibble(x=x,y=y), aes(x=x,y=y), shape=x+y*6,size=8) + geom_label( tibble(x=x,y=y,label=x+y*6), aes(x=x,y=y+0.5,label=label) ) } } } g
scale_shape_manual()
:ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3) + scale_shape_manual(values=c(3,9))
library(patchwork) g <- ggplot(ad_metadata) g_condition <- g + geom_point(mapping=aes(x=abeta, y=tau, color=condition),size=3) g_age <- g + geom_point(mapping=aes(x=abeta, y=tau, color=age_at_death),size=3) g_condition / g_age
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, size=age_at_death)) + geom_point(alpha=0.5)
arrange(ad_metadata,age_at_death) %>% mutate( x=abeta, xend=lag(x,1), y=tau, yend=lag(y,1) ) %>% ggplot() + geom_segment(aes(x=abeta, xend=xend, y=tau, yend=yend)) + geom_point(aes(x=x,y=y,shape=condition,color=condition),size=3)
geom_line()
function draws lines between pairs of points sorted by \(x\) axis by defaultggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_line()
group
aesthetic mappingpivot_longer(ad_metadata, c(tau,abeta,iba1,gfap), names_to='Marker', values_to='Intensity' ) %>% ggplot(ad_metadata,mapping=aes(x=ID, y=Intensity, group=Marker, color=Marker)) + geom_line()
group
aesthetic mappinglibrary(GGally) ggparcoord(ad_metadata, columns=c(2,4:8), groupColumn=3, showPoints=TRUE ) + scale_color_manual(values=c("#bbbbbb", "#666666"))
0 - f
, 0=0
and f=16
#rrggbb
rr
is the value of the color redgg
for greenbb
for blue#ffffff
is the color
white#000000
is the color
black#ff0000
is the color
red#0000ff
is the color
blue#7fff00
is
this color (called chartreuse)heatmap()
function:heatmap()
function creates a clustered heatmap where the rows and columns have been hierarchically clustered# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
heatmap()
Functionalityheatmap( marker_matrix, Rowv=NA, # don't cluster rows Colv=NA, # don't cluster columns scale="none", # don't scale rows )
heatmap()
Drawbackheatmap()
function has the major drawback that no color key is provided!heatmap.2()
in gplots
package has a similar interfacelibrary(gplots) heatmap.2(marker_matrix)
heatmap.2()
Exampleheatmap()
and heatmap.2()
can annotate rows and columns with a categorical variable along the margins# with heatmap() condition_colors <- transmute( ad_metadata, color=if_else(condition == "AD","red","blue") ) heatmap( marker_matrix, RowSideColors=condition_colors$color )
# with heatmap.2() heatmap.2( marker_matrix, RowSideColors=condition_colors$color )
geom_tile
geometrygeom_tile
requires data in long format with x
, y
, and z
valuespivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile()
col
argument# native R colors are: # - rainbow(n, start=.7, end=.1) # - heat.colors(n) # - terrain.colors(n) # - topo.colors(n) # - cm.colors(n) # the n argument specifies the number of colors (i.e. resolution) of the colormap to return heatmap(marker_matrix,col=cm.colors(256))
geom_tile()
, use the scale_fill_gradientn
function to specify a different color palettepivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile() + scale_fill_gradientn(colors=cm.colors(256))
Heatmaps are quite complicated and can easily mislead us
Four factors influence how a dataset can be visualized as a heatmap:
data <- tibble( ID=paste0('F',seq(10)), a=rnorm(10,0,1), b=rnorm(10,100,20), c=rnorm(10,20,5) ) %>% pivot_longer(c(a,b,c)) ggplot(data,aes(x=name,y=ID,fill=value)) + geom_tile()
library(ggbeeswarm) ggplot(data) + geom_beeswarm(aes(x=name,y=value,color=name))
data %>% pivot_wider(id_cols='ID',names_from=name) %>% mutate( across(c(a,b,c),scale) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
data %>% pivot_wider(id_cols=name,names_from=ID) %>% mutate( across(starts_with('F'),scale) ) %>% pivot_longer(starts_with('F'),names_to="ID") %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
tibble( ID=paste0('F',seq(10)), a=rnorm(10,0,1), b=rnorm(10,0,1), c=c(rnorm(9,0,1),1e9) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
data <- tibble( ID=paste0('F',seq(10)), a=10**rnorm(10,4,1), b=10**rnorm(10,4,1), c=10**rnorm(10,4,1) ) %>% pivot_longer(c(a,b,c)) data %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
midpoint=0
when central value is 0tibble( ID=paste0('F',seq(10)), a=rnorm(10,0,1), b=rnorm(10,0,1), c=rnorm(10,0,1) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile() + scale_fill_gradient2( low="#000099", mid="#ffffff", high="#990000", midpoint=0 )
tibble( ID=paste0('F',seq(10)), a=rnorm(10,1,1), b=rnorm(10,1,1), c=rnorm(10,1,1) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile() + scale_fill_gradient2(low="#000099", mid="#ffffff", high="#990000")
circlize
R packagecirclize
package provides R implementationcirclize
R packagepatchwork
library composes ggplot objects together using an intuitive set of operators:
a | b
- put plots side-by-sidea / b
- put plot a
above b
(a | b) / c
- put a
and b
side-by-side, and c
below themdata <- tibble( a=rnorm(100,0,1), b=rnorm(100,3,2) ) g_scatter <- ggplot(data, aes(x=a, y=b)) + geom_point() g_violin <- pivot_longer(data, c(a,b)) %>% ggplot(aes(x=name,y=value,fill=name)) + geom_violin()
g_scatter | g_violin
g_scatter / g_violin
g_scatter / ( g_scatter | g_violin)
(g_scatter / g_scatter ) | g_violin
library(mvtnorm) # package implementing multivariate normal distributions nsamp <- 100 data <- rbind( rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0.8,0.8,1),nrow=2)), rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,-0.8,-0.8,1),nrow=2)), rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0,0,1),nrow=2)) ) colnames(data) <- c('x','y') g_oneplot <- as_tibble(data) %>% mutate( sample_name=c(rep('A',nsamp),rep('B',nsamp),rep('C',nsamp)) ) %>% ggplot(aes(x=x,y=y,color=sample_name)) + geom_point() g_oneplot
facet_wrap()
functionfacet_wrap()
functiong_oneplot + facet_wrap(vars(sample_name))
facet_wrap()
functiong_oneplot + facet_wrap(vars(sample_name)) + geom_smooth(method="loess", formula=y ~ x)
ggplot
Themesbase_g <- tibble( x=rnorm(100), y=rnorm(100) ) %>% ggplot(aes(x=x, y=y)) + geom_point()
base_g
ggplot
comes with other themes that may be added to plots with theme_X()
functionsbase_g + theme_bw()
base_g + theme_classic()
<circle cx="50" cy="50" r="50"/>
(g1 / g2) | g3
ggsave()
functionggsave('multipanel.svg')
## Saving 7.5 x 4.5 in image