An effective data visualization:
A great visualization has some additional properties:
data <- tibble( x=rnorm(100), y=rnorm(100,0,5) ) data %>% ggplot(aes(x=x,y=y)) + geom_point()
data <- mutate(data, `x times y`=abs(x*y) ) data %>% ggplot(aes(x=x,y=y,size=`x times y`)) + geom_point()
data <- mutate(data,
z=runif(100),
category=sample(c('A','B','C'), 100, replace=TRUE)
)
data %>%
ggplot(aes(x=x, y=y, size=`x times y`, color=z, shape=category)) +
geom_point()
data <- mutate(data, `x + y`=x+y) %>%
arrange(`x + y`) %>%
mutate(
xend=lag(x,1),
yend=lag(y,1)
)
data %>%
ggplot() +
geom_segment(aes(x=x,xend=xend,y=y,yend=yend,alpha=0.5)) +
geom_point(aes(x=x, y=y, size=`x times y`, color=z, shape=category))
tibble( value=rnorm(100), category='A' ) %>% ggplot(aes(x=category,y=value,fill=category)) + geom_violin()
| Plot | \(x\) encoding | \(y\) encoding | note |
|---|---|---|---|
| scatter | position | position | |
| vertical bar | position | height | |
| horizontal bar | width | position | |
| lollipop | position | height for line + position for “head” | |
| violin | position | width | \(x\) transformed to range, \(y\) transformed to densities |
# two groups of samples with similar random data profiles
data <- as.matrix(
tibble(
A=c(rnorm(10),rnorm(10,2)),
B=c(rnorm(10),rnorm(10,2)),
C=c(rnorm(10),rnorm(10,2)),
D=c(rnorm(10,4),rnorm(10,-1)),
E=c(rnorm(10,4),rnorm(10,-1)),
F=c(rnorm(10,4),rnorm(10,-1))
)
)
rownames(data) <- paste0('G',seq(nrow(data)))
heatmap(data)
library(patchwork)
data <- tibble(
percent=c(86,88,87,90,93,89),
ID=c('A','B','C','D','E','F')
)
g <- ggplot(data, aes(x=ID,y=percent)) +
geom_bar(stat="identity")
g + coord_cartesian(ylim=c(85,95)) | g
ggplot2ggplot2 Fundamentalsggplot2 ExampleA simple example dataset:
## # A tibble: 20 × 8 ## ID age_at_death condition tau abeta iba1 gfap braak_stage ## <chr> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <fct> ## 1 A1 73 AD 96548 176324 157501 78139 4 ## 2 A2 82 AD 95251 0 147637 79348 4 ## ... ## 10 A10 69 AD 48357 27260 47024 78507 2 ## 11 C1 80 Control 62684 93739 131595 124075 3 ## 12 C2 77 Control 63598 69838 7189 35597 3 ## ... ## 20 C10 73 Control 15781 16592 10271 100858 1
ggplot2 Exampleage_at_deathtauggplot(
data=ad_metadata,
mapping = aes(
x = age_at_death,
y = tau
)
) +
geom_point()ggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot2 Plot Componentsggplot(data=ad_metadata, mapping=aes(x=age_at_death, y=tau)) + geom_point()
ggplot() - function creates a plotdata= - pass a tibble with the datamapping=aes(...) - Define an aesthetics mapping connecting data to plot propertiesgeom_point(...) - Specify geometry as points where marks will be made at pairs of x,y coordinatesIs this the whole story?
There are both AD and Control subjects in this dataset!
How does condition relate to this relationship we see?
Layer on an additional aesthetic of color:
ggplot(
data=ad_metadata,
mapping = aes(
x = age_at_death,
y = tau,
color=condition # color each point
)
) +
geom_point()
ggplot(data=ad_metadata, mapping=aes(
x=age_at_death, y=tau, color=condition
)) + geom_point()
Differences in distributions of variables can be important
Examine the distribution of age_at_death for AD and Control samples with violin geometry with geom_violin():
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
ggplot(data=ad_metadata, mapping = aes(x=condition, y=age_at_death)) + geom_violin()
patchwork library:library(patchwork)
age_boxplot <- ggplot(
data=ad_metadata,
mapping = aes(x=condition, y=age_at_death)
) +
geom_boxplot()
tau_boxplot <- ggplot(
data=ad_metadata,
mapping=aes(x=condition, y=tau)
) +
geom_boxplot()
age_boxplot | tau_boxplot # this puts the plots side by side
age_boxplot | tau_boxplot # this puts the plots side by side
ggplot Mechanicsggplot Mechanicsggplot has two key concepts that give it great flexibility: layers and scalesggplot LayersEach layer is a set of data connected to a geometry and an aesthetic
Each geom_X() function adds a layer to a plot
The plot has three layers:
ggplot(data=ad_metadata, mapping=aes(x=age_at_death)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=abeta, color='red')) + geom_point(mapping=aes(y=iba1, color='cyan'))
ggplot Layersggplot Scalesggplot ScalesHow many layers? How many scales?
ggplot Incompatible Scalesggplot(data=ad_metadata, mapping=aes(x=braak_stage)) + geom_point(mapping=aes(y=tau, color='blue')) + geom_point(mapping=aes(y=age_at_death, color='red'))
ggplot Incompatible Scalesggplot(ad_metadata,
mapping = aes(
x=ID,
y=tau)
) +
geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau)) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
ggplot(ad_metadata, mapping = aes(x=ID,y=tau,fill=condition)) + geom_bar(stat="identity")
mutate(ad_metadata, tau_centered=(tau - mean(tau))) %>% ggplot(mapping = aes(x=ID, y=tau_centered, fill=condition)) + geom_bar(stat="identity")
geom_point and geom_segment layers:ggplot(ad_metadata) + geom_point(mapping=aes(x=ID, y=tau)) + geom_segment(mapping=aes(x=ID, xend=ID, y=0, yend=tau))
pivot_longer(
ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
ggplot(aes(x=ID,y=Intensity,group=Marker,fill=Marker)) +
geom_area()
Stacked area plots require three pieces of data:
pivot_longer(
ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
group_by(ID) %>% # divide each intensity values by sum markers
mutate(
`Relative Intensity`=Intensity/sum(Intensity)
) %>%
ungroup() %>% # ungroup restores the tibble to original number of rows
ggplot(aes(x=ID,y=`Relative Intensity`,group=Marker,fill=Marker)) +
geom_area()
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=10)
tibble(x=rnorm(1000)) %>% ggplot() + geom_histogram(aes(x=x))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
tibble(
x=c(rnorm(1000),rnorm(1000,mean=4)),
type=c(rep('A',1000),rep('B',1000))
) %>%
ggplot(aes(x=x,fill=type)) +
geom_histogram(bins=30, alpha=0.6, position="identity")
Similar to histogram, except instead of binning the values draws a smoothly interpolated line that approximates the distribution
Density plot is always normalized so the integral under the curve is approximately 1
ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa")
library(patchwork) hist_g <- ggplot(ad_metadata) + geom_histogram(mapping=aes(x=age_at_death),bins=30) density_g <- ggplot(ad_metadata) + geom_density(mapping=aes(x=age_at_death),fill="#c9a13daa") hist_g | density_g
library(patchwork)
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,mean=4)),
type=c(rep('A',1000),rep('B',1000))
)
hist_g <- ggplot(normal_samples) +
geom_histogram(
mapping=aes(x=x,fill=type),
alpha=0.6, position="identity", bins=30
)
density_g <- ggplot(normal_samples) +
geom_density(
mapping=aes(x=x,fill=type),
alpha=0.6, position="identity"
)
hist_g | density_g
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
ggplot(ad_metadata) + geom_boxplot(mapping=aes(x=condition,y=age_at_death))
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)),
type=c(rep('A',2000),rep('B',1000))
)
ggplot(normal_samples, aes(x=x,fill=type,alpha=0.6)) +
geom_density()
ggplot(normal_samples, aes(x=type,y=x,fill=type)) + geom_boxplot()
library(patchwork) g <- ggplot(normal_samples, aes(x=type,y=x,fill=type)) boxplot_g <- g + geom_boxplot() violin_g <- g + geom_violin() boxplot_g | violin_g
ggplot(ad_metadata) + geom_violin(aes(x=condition,y=tau,fill=condition))
library(ggbeeswarm)
ggplot(ad_metadata) +
geom_beeswarm(
aes(x=condition,y=age_at_death,color=condition),
cex=2,
size=2
)
normal_samples <- tibble(
x=c(rnorm(1000),rnorm(1000,4),rnorm(1000,2,3)),
type=c(rep('A',2000),rep('B',1000))
)
ggplot(normal_samples, aes(x=type,y=x,color=type)) +
geom_beeswarm()
ggplot(normal_samples, aes(x=type,y=x,color=type)) + geom_beeswarm()
normal_samples <- tibble(
x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)),
type=c(rep('A',200),rep('B',100)),
category=sample(c('healthy','disease'),300,replace=TRUE)
)
ggplot(normal_samples, aes(x=type,y=x,color=category)) +
geom_beeswarm()
library(ggridges)
tibble(
x=c(rnorm(100),rnorm(100,4),rnorm(100,2,3)),
type=c(rep('A',200),rep('B',100)),
) %>%
ggplot(aes(y=type,x=x,fill=type)) +
geom_density_ridges()
## Picking joint bandwidth of 0.822
tibble(
x=rnorm(10000,mean=runif(10,1,10),sd=runif(2,1,4)),
type=rep(c("A","B","C","D","E","F","G","H","I","J"),1000)
) %>%
ggplot(aes(y=type,x=x,fill=type)) +
geom_density_ridges(alpha=0.6,position="identity")
## Picking joint bandwidth of 0.494
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) + geom_point(size=3)
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3)
g <- ggplot()
for(x in 0:5) {
for(y in 0:4) {
if(x+y*6 < 26) {
g <- g + geom_point(
tibble(x=x,y=y),
aes(x=x,y=y),
shape=x+y*6,size=8) +
geom_label(
tibble(x=x,y=y,label=x+y*6),
aes(x=x,y=y+0.5,label=label)
)
}
}
}
g
scale_shape_manual():ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, shape=condition)) + geom_point(size=3) + scale_shape_manual(values=c(3,9))
library(patchwork) g <- ggplot(ad_metadata) g_condition <- g + geom_point(mapping=aes(x=abeta, y=tau, color=condition),size=3) g_age <- g + geom_point(mapping=aes(x=abeta, y=tau, color=age_at_death),size=3) g_condition / g_age
ggplot(ad_metadata,mapping=aes(x=abeta, y=tau, size=age_at_death)) + geom_point(alpha=0.5)
arrange(ad_metadata,age_at_death) %>%
mutate(
x=abeta,
xend=lag(x,1),
y=tau,
yend=lag(y,1)
) %>%
ggplot() +
geom_segment(aes(x=abeta, xend=xend, y=tau, yend=yend)) +
geom_point(aes(x=x,y=y,shape=condition,color=condition),size=3)
geom_line() function draws lines between pairs of points sorted by \(x\) axis by defaultggplot(ad_metadata,mapping=aes(x=abeta, y=tau)) +
geom_line()
group aesthetic mappingpivot_longer(ad_metadata,
c(tau,abeta,iba1,gfap),
names_to='Marker',
values_to='Intensity'
) %>%
ggplot(ad_metadata,mapping=aes(x=ID, y=Intensity, group=Marker, color=Marker)) +
geom_line()
group aesthetic mappinglibrary(GGally)
ggparcoord(ad_metadata,
columns=c(2,4:8),
groupColumn=3,
showPoints=TRUE
) +
scale_color_manual(values=c("#bbbbbb", "#666666"))
0 - f, 0=0 and f=16#rrggbb
rr is the value of the color redgg for greenbb for blue#ffffff is the color
white#000000 is the color
black#ff0000 is the color
red#0000ff is the color
blue#7fff00 is
this color (called chartreuse)heatmap() function:heatmap() function creates a clustered heatmap where the rows and columns have been hierarchically clustered# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
# heatmap() requires a R matrix, and cannot accept a tibble or a dataframe marker_matrix <- as.matrix( dplyr::select(ad_metadata,c(tau,abeta,iba1,gfap)) ) # rownames of the matrix become y labels rownames(marker_matrix) <- ad_metadata$ID heatmap(marker_matrix)
heatmap() Functionalityheatmap( marker_matrix, Rowv=NA, # don't cluster rows Colv=NA, # don't cluster columns scale="none", # don't scale rows )
heatmap() Drawbackheatmap() function has the major drawback that no color key is provided!heatmap.2() in gplots package has a similar interfacelibrary(gplots) heatmap.2(marker_matrix)
heatmap.2() Exampleheatmap() and heatmap.2() can annotate rows and columns with a categorical variable along the margins# with heatmap()
condition_colors <-
transmute(
ad_metadata,
color=if_else(condition == "AD","red","blue")
)
heatmap(
marker_matrix,
RowSideColors=condition_colors$color
)
# with heatmap.2() heatmap.2( marker_matrix, RowSideColors=condition_colors$color )
geom_tile geometrygeom_tile requires data in long format with x, y, and z valuespivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile()
col argument# native R colors are: # - rainbow(n, start=.7, end=.1) # - heat.colors(n) # - terrain.colors(n) # - topo.colors(n) # - cm.colors(n) # the n argument specifies the number of colors (i.e. resolution) of the colormap to return heatmap(marker_matrix,col=cm.colors(256))
geom_tile(), use the scale_fill_gradientn function to specify a different color palettepivot_longer( ad_metadata, c(tau,abeta,iba1,gfap), names_to="Marker", values_to="Intensity" ) %>% ggplot(aes(x=Marker,y=ID,fill=Intensity)) + geom_tile() + scale_fill_gradientn(colors=cm.colors(256))
Heatmaps are quite complicated and can easily mislead us
Four factors influence how a dataset can be visualized as a heatmap:
data <- tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,100,20),
c=rnorm(10,20,5)
) %>%
pivot_longer(c(a,b,c))
ggplot(data,aes(x=name,y=ID,fill=value)) +
geom_tile()
library(ggbeeswarm) ggplot(data) + geom_beeswarm(aes(x=name,y=value,color=name))
data %>%
pivot_wider(id_cols='ID',names_from=name) %>%
mutate(
across(c(a,b,c),scale)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
data %>%
pivot_wider(id_cols=name,names_from=ID) %>%
mutate(
across(starts_with('F'),scale)
) %>%
pivot_longer(starts_with('F'),names_to="ID") %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,0,1),
c=c(rnorm(9,0,1),1e9)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
data <- tibble(
ID=paste0('F',seq(10)),
a=10**rnorm(10,4,1),
b=10**rnorm(10,4,1),
c=10**rnorm(10,4,1)
) %>%
pivot_longer(c(a,b,c))
data %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile()
midpoint=0 when central value is 0tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,0,1),
b=rnorm(10,0,1),
c=rnorm(10,0,1)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient2(
low="#000099", mid="#ffffff", high="#990000",
midpoint=0
)
tibble(
ID=paste0('F',seq(10)),
a=rnorm(10,1,1),
b=rnorm(10,1,1),
c=rnorm(10,1,1)
) %>%
pivot_longer(c(a,b,c)) %>%
ggplot(aes(x=name,y=ID,fill=value)) +
geom_tile() +
scale_fill_gradient2(low="#000099", mid="#ffffff", high="#990000")

circlize R packagecirclize package provides R implementationcirclize R packagepatchwork library composes ggplot objects together using an intuitive set of operators:
a | b - put plots side-by-sidea / b - put plot a above b(a | b) / c - put a and b side-by-side, and c below themdata <- tibble( a=rnorm(100,0,1), b=rnorm(100,3,2) ) g_scatter <- ggplot(data, aes(x=a, y=b)) + geom_point() g_violin <- pivot_longer(data, c(a,b)) %>% ggplot(aes(x=name,y=value,fill=name)) + geom_violin()
g_scatter | g_violin
g_scatter / g_violin
g_scatter / ( g_scatter | g_violin)
(g_scatter / g_scatter ) | g_violin
library(mvtnorm) # package implementing multivariate normal distributions
nsamp <- 100
data <- rbind(
rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0.8,0.8,1),nrow=2)),
rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,-0.8,-0.8,1),nrow=2)),
rmvnorm(nsamp,c(1,1),sigma=matrix(c(1,0,0,1),nrow=2))
)
colnames(data) <- c('x','y')
g_oneplot <- as_tibble(data) %>%
mutate(
sample_name=c(rep('A',nsamp),rep('B',nsamp),rep('C',nsamp))
) %>%
ggplot(aes(x=x,y=y,color=sample_name)) +
geom_point()
g_oneplot
facet_wrap() functionfacet_wrap() functiong_oneplot + facet_wrap(vars(sample_name))
facet_wrap() functiong_oneplot + facet_wrap(vars(sample_name)) + geom_smooth(method="loess", formula=y ~ x)
ggplot Themesbase_g <- tibble( x=rnorm(100), y=rnorm(100) ) %>% ggplot(aes(x=x, y=y)) + geom_point()
base_g
ggplot comes with other themes that may be added to plots with theme_X() functionsbase_g + theme_bw()
base_g + theme_classic()
<circle cx="50" cy="50" r="50"/>
(g1 / g2) | g3
ggsave() functionggsave('multipanel.svg')
## Saving 7.5 x 4.5 in image