An effective data visualization:
A great visualization has some additional properties:
data <- tibble( x=rnorm(100), y=rnorm(100,0,5) ) data %>% ggplot(aes(x=x,y=y)) + geom_point()
data <- mutate(data, `x times y`=abs(x*y) ) data %>% ggplot(aes(x=x,y=y,size=`x times y`)) + geom_point()
data <- mutate(data, z=runif(100), category=sample(c('A','B','C'), 100, replace=TRUE) ) data %>% ggplot(aes(x=x, y=y, size=`x times y`, color=z, shape=category)) + geom_point()
tibble( value=rnorm(100), category='A' ) %>% ggplot(aes(x=category,y=value,fill=category)) + geom_violin()
Plot | \(x\) encoding | \(y\) encoding | note |
---|---|---|---|
scatter | position | position | |
vertical bar | position | height | |
horizontal bar | width | position | |
lollipop | position | height for line + position for “head” | |
violin | position | width | \(x\) transformed to range, \(y\) transformed to densities |
# two groups of samples with similar random data profiles data <- as.matrix( tibble( A=c(rnorm(10),rnorm(10,2)), B=c(rnorm(10),rnorm(10,2)), C=c(rnorm(10),rnorm(10,2)), D=c(rnorm(10,4),rnorm(10,-1)), E=c(rnorm(10,4),rnorm(10,-1)), F=c(rnorm(10,4),rnorm(10,-1)) ) ) rownames(data) <- paste0('G',seq(nrow(data))) heatmap(data)
library(patchwork) data <- tibble( percent=c(86,88,87,90,93,89), ID=c('A','B','C','D','E','F') ) g <- ggplot(data, aes(x=ID,y=percent)) + geom_bar(stat="identity") g + coord_cartesian(ylim=c(85,95)) | g
## # A tibble: 20 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## ... ## 10 A10 69 AD 48357 27260 47024 78507 2 ## 11 C1 80 Control 62684 93739 131595 124075 3 ## 12 C2 77 Control 63598 69838 7189 35597 3 ## ... ## 20 C10 73 Control 15781 16592 10271 100858 1
ggplot2
ggplot2
Fundamentalsggplot
LayersEach layer is a set of data connected to a geometry and an aesthetic
Each geom_X()
function adds a layer to a plot
The plot has three layers:
ggplot(data=ad_metadata, mapping=aes(x=age_at_death)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=abeta, color='red')) + geom_point(mapping=aes(y=iba1, color='cyan'))
ggplot
Layersggplot
Scalesggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3)
g <- ggplot() for(x in 0:5) { for(y in 0:4) { if(x+y*6 < 26) { g <- g + geom_point( tibble(x=x,y=y), aes(x=x,y=y), shape=x+y*6,size=8) + geom_label( tibble(x=x,y=y,label=x+y*6), aes(x=x,y=y+0.5,label=label) ) } } } g
scale_shape_manual()
:ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3) + scale_shape_manual(values=c(3,9))
library(patchwork) g <- ggplot(ad_metadata) g_condition <- g + geom_point( mapping=aes(x=abeta, y=tau, color=condition), size=3 ) g_age <- g + geom_point( mapping=aes(x=abeta, y=tau, color=age_at_death), size=3 ) g_condition / g_age
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, size=age_at_death)) + geom_point(alpha=0.5)
arrange(ad_metadata,age_at_death) %>% mutate( x=abeta, xend=lag(x,1), y=tau, yend=lag(y,1) ) %>% ggplot() + geom_segment(aes(x=abeta, xend=xend, y=tau, yend=yend)) + geom_point(aes(x=x,y=y,shape=condition,color=condition),size=3)
geom_line()
function draws lines between pairs of points sorted by \(x\) axis by defaultggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_line()
group
aesthetic mappingpivot_longer(ad_metadata, c(tau,abeta,iba1,gfap), names_to='Marker', values_to='Intensity' ) %>% ggplot(ad_metadata,mapping=aes(x=ID, y=Intensity, group=Marker, color=Marker)) + geom_line()
group
aesthetic mappinglibrary(GGally) ggparcoord(ad_metadata, columns=c(2,4:8), groupColumn=3, showPoints=TRUE ) + scale_color_manual(values=c("#bbbbbb", "#666666"))
0 - f
, 0=0
and f=16
#rrggbb
rr
is the value of the color redgg
for greenbb
for blue#ffffff
is the color
white#000000
is the color
black#ff0000
is the color
red#0000ff
is the color
blue#7fff00
is
this color (called chartreuse)heatmap()
function:heatmap()
function creates a clustered heatmap where the rows and columns have been hierarchically clustered# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
heatmap()
Functionalityheatmap( marker_matrix, Rowv=NA, # don't cluster rows Colv=NA, # don't cluster columns scale="none", # don't scale rows )
heatmap()
Drawbackheatmap()
function has the major drawback that no color key is provided!heatmap.2()
in gplots
package has a similar interfacelibrary(gplots) heatmap.2(marker_matrix)
heatmap.2()
Exampleheatmap()
and heatmap.2()
can annotate rows and columns with a categorical variable along the margins# with heatmap() condition_colors <- transmute( ad_metadata, color=if_else(condition == "AD","red","blue") ) heatmap( marker_matrix, RowSideColors=condition_colors$color )
# with heatmap.2() heatmap.2( marker_matrix, RowSideColors=condition_colors$color )
geom_tile
geometrygeom_tile
requires data in long format with x
, y
, and z
valuespivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile()
col
argument# native R colors are: # - rainbow(n, start=.7, end=.1) # - heat.colors(n) # - terrain.colors(n) # - topo.colors(n) # - cm.colors(n) # the n argument specifies the number of colors # (i.e. resolution) of the colormap to return heatmap(marker_matrix,col=cm.colors(256))
heatmap(marker_matrix,col=cm.colors(256))
heatmap(marker_matrix,col=cm.colors(2))
geom_tile()
, use the scale_fill_gradientn
function to specify a different color palettepivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile() + scale_fill_gradientn(colors=cm.colors(256))
Heatmaps are quite complicated and can easily mislead us
Four factors influence how a dataset can be visualized as a heatmap:
data <- tibble( ID=paste0('F',seq(10)), a=rnorm(10,0,1), b=rnorm(10,100,20), c=rnorm(10,20,5) ) %>% pivot_longer(c(a,b,c)) ggplot(data,aes(x=name,y=ID,fill=value)) + geom_tile()
library(ggbeeswarm) ggplot(data) + geom_beeswarm(aes(x=name,y=value,color=name))
data %>% pivot_wider(id_cols='ID',names_from=name) %>% mutate( across(c(a,b,c),scale) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
data %>% pivot_wider(id_cols=name,names_from=ID) %>% mutate( across(starts_with('F'),scale) ) %>% pivot_longer(starts_with('F'),names_to="ID") %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
tibble( ID=paste0('F',seq(10)), a=rnorm(10,0,1), b=rnorm(10,0,1), c=c(rnorm(9,0,1),1e9) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
data <- tibble( ID=paste0('F',seq(10)), a=10**rnorm(10,4,1), b=10**rnorm(10,4,1), c=10**rnorm(10,4,1) ) %>% pivot_longer(c(a,b,c)) data %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile()
midpoint=0
when central value is 0tibble( ID=paste0('F',seq(10)), a=rnorm(10,0,1), b=rnorm(10,0,1), c=rnorm(10,0,1) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile() + scale_fill_gradient2( low="#000099", mid="#ffffff", high="#990000", midpoint=0 )
tibble( ID=paste0('F',seq(10)), a=rnorm(10,1,1), b=rnorm(10,1,1), c=rnorm(10,1,1) ) %>% pivot_longer(c(a,b,c)) %>% ggplot(aes(x=name,y=ID,fill=value)) + geom_tile() + scale_fill_gradient2(low="#000099", mid="#ffffff", high="#990000")
circlize
R packagecirclize
package provides R implementationcirclize
R packagepatchwork
library composes ggplot objects together using an intuitive set of operators:
a | b
- put plots side-by-sidea / b
- put plot a
above b
(a | b) / c
- put a
and b
side-by-side, and c
below themdata <- tibble( a=rnorm(100,0,1), b=rnorm(100,3,2) ) g_scatter <- ggplot(data, aes(x=a, y=b)) + geom_point() g_violin <- pivot_longer(data, c(a,b)) %>% ggplot(aes(x=name,y=value,fill=name)) + geom_violin()
g_scatter | g_violin
g_scatter / g_violin
g_scatter / ( g_scatter | g_violin)
(g_scatter / g_scatter ) | g_violin
library(mvtnorm) # package implementing multivariate normal distributions nsamp <- 100 data <- rbind( rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0.8,0.8,1),nrow=2)), rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,-0.8,-0.8,1),nrow=2)), rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0,0,1),nrow=2)) ) colnames(data) <- c('x','y') g_oneplot <- as_tibble(data) %>% mutate( sample_name=c(rep('A',nsamp),rep('B',nsamp),rep('C',nsamp)) ) %>% ggplot(aes(x=x,y=y,color=sample_name)) + geom_point() g_oneplot
facet_wrap()
functionfacet_wrap()
functiong_oneplot + facet_wrap(vars(sample_name))
facet_wrap()
functiong_oneplot + facet_wrap(vars(sample_name)) + geom_smooth(method="loess", formula=y ~ x)
ggplot
Themesbase_g <- tibble( x=rnorm(100), y=rnorm(100) ) %>% ggplot(aes(x=x, y=y)) + geom_point()
base_g
ggplot
comes with other themes that may be added to plots with theme_X()
functionsbase_g + theme_bw()
base_g + theme_classic()
<circle cx="50" cy="50" r="50"/>
ggsave()
functionggsave('multipanel.svg')
## Saving 7.5 x 4.5 in image