New ones to install:
install.packages("ggbeeswarm")
install.packages("skimr")
install.packages("janitor")
install.packages("ggridges")
To load:
library(tidyverse)
library(ggbeeswarm)
library(skimr)
library(janitor)
library(ggridges)
Below are simulated four distributions (n = 100 each), all with similar measures of center (mean = 0) and spread (s.d. = 1), but with distinctly different shapes.1
n
);s
, Johnson distribution with skewness 2.2 and kurtosis 13);k
, Johnson distribution with skewness 0 and kurtosis 30);mm
, two normals with mean -0.95 and 0.95 and standard deviation 0.31).#install.packages("SuppDists")
library(SuppDists)
# this is used later to generate the s and k distributions
findParams <- function(mu, sigma, skew, kurt) {
value <- .C("JohnsonMomentFitR", as.double(mu), as.double(sigma),
as.double(skew), as.double(kurt - 3), gamma = double(1),
delta = double(1), xi = double(1), lambda = double(1),
type = integer(1), PACKAGE = "SuppDists")
list(gamma = value$gamma, delta = value$delta,
xi = value$xi, lambda = value$lambda,
type = c("SN", "SL", "SU", "SB")[value$type])
}
# Generate sample data -------------------------------------------------------
set.seed(141079)
# normal
n <- rnorm(100)
# right-skew
s <- rJohnson(100, findParams(0, 1, 2.2, 13))
# leptikurtic
k <- rJohnson(100, findParams(0, 1, 0, 30))
# mixture
mm <- rnorm(100, rep(c(-1, 1), each = 50) * sqrt(0.9), sqrt(0.1))
four <- data.frame(
dist = factor(rep(c("n", "s", "k", "mm"),
each = 100),
c("n", "s", "k", "mm")),
vals = c(n, s, k, mm)
)
glimpse(four)
Observations: 400
Variables: 2
$ dist <fct> n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n,…
$ vals <dbl> 0.91835807, -0.44302553, 2.83453800, -0.74480448, 1.4431036…
Let’s see what our descriptive statistics look like:
skim(four)
Skim summary statistics
n obs: 400
n variables: 2
── Variable type:factor ───────────────────────────────────────────────────────────────────────
variable missing complete n n_unique top_counts
dist 0 400 400 4 n: 100, s: 100, k: 100, mm: 100
ordered
FALSE
── Variable type:numeric ──────────────────────────────────────────────────────────────────────
variable missing complete n mean sd p0 p25 p50 p75 p100
vals 0 400 400 0.19 1.02 -3.07 -0.49 0.16 0.85 4.49
hist
▁▂▆▇▆▁▁▁
four %>%
group_by(dist) %>%
skim()
Skim summary statistics
n obs: 400
n variables: 2
group variables: dist
── Variable type:numeric ──────────────────────────────────────────────────────────────────────
dist variable missing complete n mean sd p0 p25 p50
n vals 0 100 100 0.071 1.09 -3.07 -0.61 0.13
s vals 0 100 100 0.74 1.01 -0.48 0.053 0.43
k vals 0 100 100 1e-04 0.79 -2.17 -0.49 -0.069
mm vals 0 100 100 -0.06 0.99 -1.63 -1 -0.031
p75 p100 hist
0.9 2.83 ▁▂▃▆▇▆▃▁
1.12 4.49 ▇▇▅▂▁▁▁▁
0.37 2.77 ▁▁▅▇▃▂▁▁
0.86 1.49 ▃▇▅▁▁▃▇▅
What you want to look for:
The nice thing about ggplot
is that we can use facet_wrap
, and the x- and y-axes are the same, and the size of the binwidths are also the same.
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) + #no y needed for visualization of univariate distributions
geom_histogram(fill = "white", colour = "black") + #easier to see for me
coord_cartesian(xlim = c(-5, 5)) + #use this to change x/y limits!
facet_wrap(~ dist) #this is one factor variable with 4 levels
Always change the binwidths on a histogram. Sometimes the default in ggplot
works great, sometimes it does not.
Super narrow:
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .1, fill = "white", colour = "black") + #super narrow bins
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Super wide:
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = 2, fill = "white", colour = "black") + #super wide bins
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Just right? Pretty close to the default for this data.
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .2, fill = "white", colour = "black") +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .2, fill = "white", colour = "black") +
geom_rug() +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
What you want to look for:
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot() +
scale_x_discrete(name="") +
scale_y_continuous(name="") +
coord_cartesian(ylim = c(-4,4))
ggplot notches: “Notches are used to compare groups; if the notches of two boxes do not overlap, this is strong evidence that the medians differ.” (Chambers et al., 1983, p. 62)
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(notch = T) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4,4))
Here we add a diamond for the mean (see other possible shape codes here).
ggplot(four, aes(x = dist, y = vals)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
shape = 18,
size = 4,
colour = "lightseagreen") +
coord_cartesian(ylim = c(-4, 4))
Options:
Combining geom_jitter() + stat_summary()
is the ggplot corollary to a stripchart.
ggplot(four, aes(x = dist, y = vals)) +
geom_jitter(position = position_jitter(height = 0, width = .1),
fill = "lightseagreen",
colour = "lightseagreen",
alpha = .5) +
stat_summary(fun.y = median,
fun.ymin = median,
fun.ymax = median,
geom = "crossbar",
width = 0.5) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
This is a beeswarm-like ggplot- not exactly the same, but gives you the same idea.
ggplot(four, aes(x = dist, y = vals)) +
geom_dotplot(stackdir = "center",
binaxis = "y",
binwidth = .1,
binpositions = "all",
stackratio = 1.5,
fill = "lightseagreen",
colour = "lightseagreen") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
https://github.com/eclarke/ggbeeswarm
install.packages("ggbeeswarm")
library(ggbeeswarm)
ggplot(four, aes(x = dist, y = vals)) +
geom_quasirandom(fill = "lightseagreen",
colour = "lightseagreen") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
ggplot(four, aes(x = dist, y = vals)) +
geom_quasirandom(fill = "lightseagreen",
colour = "lightseagreen",
method = "smiley") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Note that these recommendations do not apply if your data is “big”. You will know your data is too big if you try the below methods and you can’t see many of the individual points (typically, N > 100).
Combining geom_boxplot() + geom_dotplot()
is my personal pick for EDA when I have small - medium data (N < 100).
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(outlier.shape = NA) +
geom_dotplot(binaxis = 'y',
stackdir = 'center',
stackratio = 1.5,
binwidth = .1,
binpositions = "all",
dotsize = 1,
alpha = .75,
fill = "lightseagreen",
colour = "lightseagreen",
na.rm = TRUE) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
You can also combin geom_boxplot() + geom_jitter()
. I left the outliers in to demonstrate the jittered points only go left to right because I set the jitter height = 0
.
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(width = .5) + #left the outliers in here, so they are double-plotted
geom_jitter(fill = "lightseagreen",
colour = "lightseagreen",
na.rm = TRUE,
position = position_jitter(height = 0, width = .1),
alpha = .5) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
A few ways to do this:
ggplot(four, aes(x = vals)) +
geom_density(fill = "lightseagreen") +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Instead of doing a facet_wrap
, I could make just one plot showing all four distributions. To make each distribution a different color, set the fill
within the aes
, and assign it to the factor variable dist
. Since now all four will be plotted on top of each other, add an alpha
level to make the color fill transparent (0 = completely transparent; 1 = completely opaque).
# Density plots with semi-transparent fill
ggplot(four, aes(x = vals, fill = dist)) +
geom_density(alpha = .5)
These are pretty easy to make in ggplot
. However, note that the y-axis is different from if you just plotted the histogram. In fact, when interpreting this plot, the y-axis is only meaningful for reading density. It is meaningless for interpreting the histogram.
ggplot(four, aes(x = vals)) +
geom_histogram(aes(y = ..density..),
binwidth = .5,
colour = "black",
fill = "white") +
geom_density(alpha = .5, fill = "lightseagreen") +
coord_cartesian(xlim = c(-5,5)) +
facet_wrap(~ dist)
My advice: always set color = NA
for geom_violin
. For fill, always set alpha
.
ggplot(four, aes(x = dist, y = vals)) +
geom_violin(color = NA,
fill = "lightseagreen",
alpha = .5,
na.rm = TRUE,
scale = "count") + # max width proportional to sample size
coord_cartesian(ylim = c(-4, 4))
Combination geom_violin() + geom_boxplot()
is my personal pick for EDA when I have large data (N > 100).
ggplot(four, aes(x = dist, y = vals)) +
geom_boxplot(outlier.size = 2,
colour = "lightseagreen",
fill = "black",
na.rm = TRUE,
width = .1) +
geom_violin(alpha = .2,
fill = "lightseagreen",
colour = NA,
na.rm = TRUE) +
coord_cartesian(ylim = c(-4, 4))
Note that it is just as easy to layer a univariate scatterplot over a violin plot, just by replacing the geom_boxplot
with a different geom as shown abobe. Lots of combination plots are possible!
Using David Robinson’s code: https://gist.github.com/dgrtwo/eb7750e74997891d7c20
ggplot(four, aes(x = dist, y = vals)) +
geom_flat_violin(alpha = .5,
fill = "lightseagreen",
colour = NA,
na.rm = TRUE) +
coord_flip()
Typically makes the most sense to have the factor variable like dist
on the y-axis for these.
https://cran.r-project.org/web/packages/ggridges/vignettes/introduction.html
# play with scale
ggplot(four, aes(x = vals, y = dist)) +
geom_density_ridges(scale = 0.9,
fill = "lightseagreen",
alpha = .5)
https://cran.r-project.org/web/packages/ggridges/vignettes/introduction.html
ggplot(four, aes(x = vals, y = dist)) +
geom_density_ridges(jittered_points = TRUE,
position = "raincloud",
fill = "lightseagreen",
alpha = 0.7,
scale = 0.7)
The more general stat_summary
function applies a summary function to the variable mapped to y at each x value.
The simplest summary function is mean_se
, which returns the mean and the mean plus its standard error on each side. Thus, stat_summary
will calculate and plot the mean and standard errors for the y variable at each x value.
The default geom is “pointrange” which places a dot at the central y value and extends lines to the minimum and maximum y values. Other geoms you might consider to display summarized data:
geom_errorbar
geom_pointrange
geom_linerange
geom_crossbar
There are a few summary functions from the Hmisc
package which are reformatted for use in stat_summary()
. They all return aesthetics for y
, ymax
, and ymin
.
mean_cl_normal()
mean_sdl()
mean_cl_boot()
median_hilow()
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.data = "mean_se")
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.y = "mean", geom = "point") +
stat_summary(fun.y = "max", geom = "point", shape = 21)
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.data = median_hilow)
You may have noticed two different arguments that are potentially confusing: fun.data
and fun.y
. If the function returns three values, specify the function with the argument fun.data
. If the function returns one value, specify fun.y
. See below.
x <- c(1, 2, 3)
mean(x) # use fun.y
[1] 2
mean_cl_normal(x) # use fun.data
y ymin ymax
1 2 -0.4841377 4.484138
Confidence limits may give us a better idea than standard error limits of whether two means would be deemed statistically different when modeling, so we frequently use mean_cl_normal
or mean_cl_boot
in addition to mean_se
.
Using the ToothGrowth
dataset
data(ToothGrowth)
tg <- ToothGrowth
# Standard error of the mean
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_se") +
ggtitle("Mean +/- SE")
# Connect the points with lines
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_se") +
stat_summary(fun.y = mean, geom = "line") +
ggtitle("Mean +/- SE")
# Use 95% confidence interval instead of SEM
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_cl_normal") +
stat_summary(fun.y = mean, geom = "line") +
ggtitle("Mean with 95% confidence intervals")
# The errorbars overlapped, so use position_dodge to move them horizontally
pd <- position_dodge(0.1) # move them .05 to the left and right
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_cl_normal", position = pd) +
stat_summary(fun.y = mean, geom = "line", position = pd) +
ggtitle("Mean with 95% confidence intervals")
Not the best example for this geom, but another good one for showing variability…
# ribbon geom
ggplot(tg, aes(x = dose, y = len, colour = supp, fill = supp)) +
stat_summary(fun.y = mean, geom = "line") +
stat_summary(fun.data = mean_se, geom = "ribbon", alpha = .5)
If you must…
# Standard error of the mean; note positioning
ggplot(tg, aes(x = factor(dose), y = len, fill = supp)) +
stat_summary(fun.y = mean, geom = "bar", position = position_dodge(width = .9)) +
stat_summary(fun.data = mean_se, geom = "linerange", position = position_dodge(width = .9)) +
ggtitle("Mean +/- SE")
# Use 95% confidence interval instead of SEM
ggplot(tg, aes(x = factor(dose), y = len, fill = supp)) +
stat_summary(fun.y = mean, geom = "bar", position = position_dodge(width = .9)) +
stat_summary(fun.data = mean_cl_boot, geom = "linerange", position = position_dodge(width = .9)) +
ggtitle("Mean with 95% confidence intervals")
More help here: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
The code for these distributions comes from Hadley Wickham’s paper on “40 years of boxplots”↩