An effective data visualization:
A great visualization has some additional properties:
data <- tibble( x=rnorm(100), y=rnorm(100,0,5) ) data %>% ggplot(aes(x=x,y=y)) + geom_point()
data <- mutate(data, `x times y`=abs(x*y) ) data %>% ggplot(aes(x=x,y=y,size=`x times y`)) + geom_point()
data <- mutate(data,
z=runif(100),
category=sample(c('A','B','C'), 100, replace=TRUE)
)
data %>%
ggplot(aes(x=x, y=y, size=`x times y`, color=z, shape=category)) +
geom_point()
tibble( value=rnorm(100), category='A' ) %>% ggplot(aes(x=category,y=value,fill=category)) + geom_violin()
| Plot | \(x\) encoding | \(y\) encoding | note |
|---|---|---|---|
| scatter | position | position | |
| vertical bar | position | height | |
| horizontal bar | width | position | |
| lollipop | position | height for line + position for “head” | |
| violin | position | width | \(x\) transformed to range, \(y\) transformed to densities |
# two groups of samples with similar random data profiles
data <- as.matrix(
tibble(
A=c(rnorm(10),rnorm(10,2)),
B=c(rnorm(10),rnorm(10,2)),
C=c(rnorm(10),rnorm(10,2)),
D=c(rnorm(10,4),rnorm(10,-1)),
E=c(rnorm(10,4),rnorm(10,-1)),
F=c(rnorm(10,4),rnorm(10,-1))
)
)
rownames(data) <- paste0('G',seq(nrow(data)))
heatmap(data)
library(patchwork)
data <- tibble(
percent=c(86,88,87,90,93,89),
ID=c('A','B','C','D','E','F')
)
g <- ggplot(data, aes(x=ID,y=percent)) +
geom_bar(stat="identity")
g + coord_cartesian(ylim=c(85,95)) | g
## # A tibble: 20 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## ... ## 10 A10 69 AD 48357 27260 47024 78507 2 ## 11 C1 80 Control 62684 93739 131595 124075 3 ## 12 C2 77 Control 63598 69838 7189 35597 3 ## ... ## 20 C10 73 Control 15781 16592 10271 100858 1
ggplot2ggplot2 Fundamentalsggplot LayersEach layer is a set of data connected to a geometry and an aesthetic
Each geom_X() function adds a layer to a plot
The plot has three layers:
ggplot(data=ad_metadata, mapping=aes(x=age_at_death)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=abeta, color='red')) + geom_point(mapping=aes(y=iba1, color='cyan'))
ggplot Layersggplot Scalesggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3)
g <- ggplot()
for(x in 0:5) {
for(y in 0:4) {
if(x+y*6 < 26) {
g <- g + geom_point(
tibble(x=x,y=y),
aes(x=x,y=y),
shape=x+y*6,size=8) +
geom_label(
tibble(x=x,y=y,label=x+y*6),
aes(x=x,y=y+0.5,label=label)
)
}
}
}
g
scale_shape_manual():ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3) + scale_shape_manual(values=c(3,9))
library(patchwork) g <- ggplot(ad_metadata) g_condition <- g + geom_point( mapping=aes(x=abeta, y=tau, color=condition), size=3 ) g_age <- g + geom_point( mapping=aes(x=abeta, y=tau, color=age_at_death), size=3 ) g_condition / g_age
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, size=age_at_death)) + geom_point(alpha=0.5)
arrange(ad_metadata,age_at_death) %>%
mutate(
x=abeta,
xend=lag(x,1),
y=tau,
yend=lag(y,1)
) %>%
ggplot() +
geom_segment(aes(x=abeta, xend=xend, y=tau, yend=yend)) +
geom_point(aes(x=x,y=y,shape=condition,color=condition),size=3)
geom_line() function draws lines between pairs of points sorted by \(x\) axis by defaultggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) +
geom_line()
group aesthetic mappingpivot_longer(ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
ggplot(ad_metadata,mapping=aes(x=ID, y=Intensity, group=Marker, color=Marker)) +
geom_line()
group aesthetic mappinglibrary(GGally)
ggparcoord(ad_metadata,
columns=c(2,4:8),
groupColumn=3,
showPoints=TRUE
) +
scale_color_manual(values=c("#bbbbbb", "#666666"))
0 - f, 0=0 and f=16#rrggbb
rr is the value of the color redgg for greenbb for blue#ffffff is the color
white#000000 is the color
black#ff0000 is the color
red#0000ff is the color
blue#7fff00 is
this color (called chartreuse)heatmap() function:heatmap() function creates a clustered heatmap where the rows and columns have been hierarchically clustered# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
heatmap() Functionalityheatmap( marker_matrix, Rowv=NA, # don't cluster rows Colv=NA, # don't cluster columns scale="none", # don't scale rows )
heatmap() Drawbackheatmap() function has the major drawback that no color key is provided!heatmap.2() in gplots package has a similar interfacelibrary(gplots) heatmap.2(marker_matrix)
heatmap.2() Exampleheatmap() and heatmap.2() can annotate rows and columns with a categorical variable along the margins# with heatmap()
condition_colors <-
transmute(
ad_metadata,
color=if_else(condition == "AD","red","blue")
)
heatmap(
marker_matrix,
RowSideColors=condition_colors$color
)
# with heatmap.2() heatmap.2( marker_matrix, RowSideColors=condition_colors$color )
geom_tile geometrygeom_tile requires data in long format with x, y, and z valuespivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile()
col argument# native R colors are: # - rainbow(n, start=.7, end=.1) # - heat.colors(n) # - terrain.colors(n) # - topo.colors(n) # - cm.colors(n) # the n argument specifies the number of colors # (i.e. resolution) of the colormap to return heatmap(marker_matrix,col=cm.colors(256))
heatmap(marker_matrix,col=cm.colors(256))
heatmap(marker_matrix,col=cm.colors(2))
geom_tile(), use the scale_fill_gradientn function to specify a different color palettepivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile() + scale_fill_gradientn(colors=cm.colors(256))
Heatmaps are quite complicated and can easily mislead us
Four factors influence how a dataset can be visualized as a heatmap:
data <- tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,100,20),
c=rnorm(10,20,5)
) %>%
pivot_longer(c(a,b,c))
ggplot(data,aes(x=name,y=ID,fill=value)) +
geom_tile()
library(ggbeeswarm) ggplot(data) + geom_beeswarm(aes(x=name,y=value,color=name))
data %>%
pivot_wider(id_cols='ID',names_from=name) %>%
mutate(
across(c(a,b,c),scale)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
data %>%
pivot_wider(id_cols=name,names_from=ID) %>%
mutate(
across(starts_with('F'),scale)
) %>%
pivot_longer(starts_with('F'),names_to="ID") %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,0,1),
c=c(rnorm(9,0,1),1e9)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
data <- tibble(
ID=paste0('F',seq(10)),
a=10**rnorm(10,4,1),
b=10**rnorm(10,4,1),
c=10**rnorm(10,4,1)
) %>%
pivot_longer(c(a,b,c))
data %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
midpoint=0 when central value is 0tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,0,1),
c=rnorm(10,0,1)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient2(
low="#000099", mid="#ffffff", high="#990000",
midpoint=0
)
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,1,1),
b=rnorm(10,1,1),
c=rnorm(10,1,1)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient2(low="#000099", mid="#ffffff", high="#990000")

circlize R packagecirclize package provides R implementationcirclize R packagepatchwork library composes ggplot objects together using an intuitive set of operators:
a | b - put plots side-by-sidea / b - put plot a above b(a | b) / c - put a and b side-by-side, and c below themdata <- tibble( a=rnorm(100,0,1), b=rnorm(100,3,2) ) g_scatter <- ggplot(data, aes(x=a, y=b)) + geom_point() g_violin <- pivot_longer(data, c(a,b)) %>% ggplot(aes(x=name,y=value,fill=name)) + geom_violin()
g_scatter | g_violin
g_scatter / g_violin
g_scatter / ( g_scatter | g_violin)
(g_scatter / g_scatter ) | g_violin
library(mvtnorm) # package implementing multivariate normal distributions
nsamp <- 100
data <- rbind(
rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0.8,0.8,1),nrow=2)),
rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,-0.8,-0.8,1),nrow=2)),
rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0,0,1),nrow=2))
)
colnames(data) <- c('x','y')
g_oneplot <- as_tibble(data) %>%
mutate(
sample_name=c(rep('A',nsamp),rep('B',nsamp),rep('C',nsamp))
) %>%
ggplot(aes(x=x,y=y,color=sample_name)) +
geom_point()
g_oneplot
facet_wrap() functionfacet_wrap() functiong_oneplot + facet_wrap(vars(sample_name))
facet_wrap() functiong_oneplot + facet_wrap(vars(sample_name)) + geom_smooth(method="loess", formula=y ~ x)
ggplot Themesbase_g <- tibble( x=rnorm(100), y=rnorm(100) ) %>% ggplot(aes(x=x, y=y)) + geom_point()
base_g
ggplot comes with other themes that may be added to plots with theme_X() functionsbase_g + theme_bw()
base_g + theme_classic()
<circle cx="50" cy="50" r="50"/>
ggsave() functionggsave('multipanel.svg')
## Saving 7.5 x 4.5 in image