# Load every package used in this exercise in one call
# (pacman::p_load also installs any that are not yet available).
pacman::p_load(
  plotly, crosstalk, DT,
  ggdist, ggridges, colorspace, gganimate, tidyverse
)
Hands-on_Ex04_c
Learning Outcome
Visualising uncertainty is relatively new in statistical graphics. In this chapter, you will gain hands-on experience on creating statistical graphics for visualising uncertainty. By the end of this chapter you will be able:
to plot statistics error bars by using ggplot2.
to plot interactive error bars by combining ggplot2, plotly and DT.
to create advanced uncertainty visualisations by using ggdist.
to create hypothetical outcome plots (HOPs) by using ungeviz package.
Getting Started
Installing and loading the packages
For the purpose of this exercise, the following R packages will be used:
tidyverse, a family of R packages for the data science process; plotly for creating interactive plots.
gganimate for creating animated plots.
DT for displaying interactive html table.
crosstalk for implementing cross-widget interactions (currently, linked brushing and filtering).
ggdist for visualising distribution and uncertainty.
Data import
# Read the exam results into a tibble; the path is relative to the document.
exam <- read_csv("Exam_data.csv")
Visualizing the uncertainty of point estimates: ggplot2 methods
A point estimate is a single number, such as a mean. Uncertainty, on the other hand, is expressed as standard error, confidence interval, or credible interval.
Firstly, code chunk below will be used to derive the necessary summary statistics.
# Per-race summary of MATHS: group size, mean, standard deviation and
# standard error. The result is reused by the error-bar plots below.
my_sum <- exam %>%
  group_by(RACE) %>%
  summarise(
    n = n(),
    mean = mean(MATHS),
    sd = sd(MATHS)
  ) %>%
  # NOTE(review): the denominator is sqrt(n - 1), which is what the rendered
  # table reflects; the usual standard error of the mean divides by sqrt(n).
  # Confirm which is intended before changing it.
  mutate(se = sd / sqrt(n - 1))
group_by() of dplyr package is used to group the observation by RACE
summarise() is used to compute the count of observations, mean, standard deviation
mutate() is used to derive standard error of Maths by RACE
the output is saved as a tibble data table called my_sum
Next, the code chunk below will be used to display my_sum tibble data frame in an html table format.
#| eval: false
# Render the first rows of my_sum as an HTML table.
knitr::kable(head(my_sum), format = "html")
RACE | n | mean | sd | se |
---|---|---|---|---|
Chinese | 193 | 76.50777 | 15.69040 | 1.132357 |
Indian | 12 | 60.66667 | 23.35237 | 7.041005 |
Malay | 108 | 57.44444 | 21.13478 | 2.043177 |
Others | 9 | 69.66667 | 10.72381 | 3.791438 |
#| echo: false
# Render the first rows of my_sum as an HTML table.
knitr::kable(head(my_sum), format = "html")
RACE | n | mean | sd | se |
---|---|---|---|---|
Chinese | 193 | 76.50777 | 15.69040 | 1.132357 |
Indian | 12 | 60.66667 | 23.35237 | 7.041005 |
Malay | 108 | 57.44444 | 21.13478 | 2.043177 |
Others | 9 | 69.66667 | 10.72381 | 3.791438 |
Plotting standard error bars of point estimates
Now we are ready to plot the standard error bars of mean maths score by race as shown below.
#| echo: false
# Error bars spanning mean - se to mean + se for each race, with the mean
# itself drawn as a red point on top.
ggplot(my_sum) +
  geom_errorbar(
    aes(
      x = RACE,
      ymin = mean - se,
      ymax = mean + se
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    linewidth = 0.5
  ) +
  geom_point(
    aes(
      x = RACE,
      y = mean
    ),
    stat = "identity",
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  ggtitle("Standard error of mean maths score by race")
#| eval: false
# Error bars spanning mean - se to mean + se for each race, with the mean
# itself drawn as a red point on top.
ggplot(my_sum) +
  geom_errorbar(
    aes(
      x = RACE,
      ymin = mean - se,
      ymax = mean + se
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    linewidth = 0.5
  ) +
  geom_point(
    aes(
      x = RACE,
      y = mean
    ),
    stat = "identity",
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  ggtitle("Standard error of mean maths score by race")
Plotting confidence interval of point estimates
Instead of plotting the standard error bar of point estimates, we can also plot the confidence intervals of mean maths score by race.
#| echo: false
# 95% confidence interval (mean +/- 1.96 * se) per race, with races ordered
# from the highest to the lowest mean via reorder(RACE, -mean).
ggplot(my_sum) +
  geom_errorbar(
    aes(
      x = reorder(RACE, -mean),
      ymin = mean - 1.96 * se,
      ymax = mean + 1.96 * se
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    linewidth = 0.5
  ) +
  geom_point(
    aes(
      x = RACE,
      y = mean
    ),
    stat = "identity",
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  # The x-axis carries the race categories and the y-axis the scores, so the
  # axes are labelled accordingly.
  labs(
    x = "Race",
    y = "Maths score",
    title = "95% confidence interval of mean maths score by race"
  )
#| eval: false
# 95% confidence interval (mean +/- 1.96 * se) per race, with races ordered
# from the highest to the lowest mean via reorder(RACE, -mean).
ggplot(my_sum) +
  geom_errorbar(
    aes(
      x = reorder(RACE, -mean),
      ymin = mean - 1.96 * se,
      ymax = mean + 1.96 * se
    ),
    width = 0.2,
    colour = "black",
    alpha = 0.9,
    linewidth = 0.5
  ) +
  geom_point(
    aes(
      x = RACE,
      y = mean
    ),
    stat = "identity",
    color = "red",
    size = 1.5,
    alpha = 1
  ) +
  # The x-axis carries the race categories and the y-axis the scores, so the
  # axes are labelled accordingly.
  labs(
    x = "Race",
    y = "Maths score",
    title = "95% confidence interval of mean maths score by race"
  )
- The confidence intervals are computed by using the formula mean+/-1.96*se.
- The error bars are sorted by using the average maths scores.
- labs() function of ggplot2 is used to change the x-axis label.
Visualizing the uncertainty of point estimates with interactive error bars
In this section, you will learn how to plot interactive error bars for the 99% confidence interval of mean maths score by race as shown in the figure below.
#| echo: false
# Wrap the summary in a crosstalk SharedData object so that the plot and the
# table below are driven by the same data handle.
shared_df <- SharedData$new(my_sum)

# Left column (width 4): interactive 99% CI error bars (mean +/- 2.58 * se).
# Right column (width 8): the same summary as a DT table.
bscols(
  widths = c(4, 8),
  ggplotly(
    (ggplot(shared_df) +
      geom_errorbar(
        aes(
          x = reorder(RACE, -mean),
          ymin = mean - 2.58 * se,
          ymax = mean + 2.58 * se
        ),
        width = 0.2,
        colour = "black",
        alpha = 0.9,
        linewidth = 0.5
      ) +
      geom_point(
        aes(
          x = RACE,
          y = mean,
          # `text` is not drawn by ggplot2; it is picked up by ggplotly()
          # through tooltip = "text". The interval shown uses the 2.58
          # multiplier, so it is labelled as the 99% CI.
          text = paste(
            "Race:", `RACE`,
            "<br>N:", `n`,
            "<br>Avg. Scores:", round(mean, digits = 2),
            "<br>99% CI:[",
            round((mean - 2.58 * se), digits = 2), ",",
            round((mean + 2.58 * se), digits = 2), "]"
          )
        ),
        stat = "identity",
        color = "red",
        size = 1.5,
        alpha = 1
      ) +
      xlab("Race") +
      ylab("Average Scores") +
      theme_minimal() +
      theme(
        axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
      ) +
      ggtitle("99% Confidence interval of average /<br>maths scores by race")),
    tooltip = "text"
  ),
  # NOTE(review): four display names are supplied while my_sum has five
  # columns (RACE, n, mean, sd, se); confirm the header lines up as intended.
  DT::datatable(
    shared_df,
    rownames = FALSE,
    class = "compact",
    width = "100%",
    options = list(
      pageLength = 10,
      scrollX = TRUE
    ),
    colnames = c(
      "No. of pupils",
      "Avg Scores",
      "Std Dev",
      "Std Error"
    )
  ) %>%
    formatRound(
      columns = c("mean", "sd", "se"),
      digits = 2
    )
)
#| eval: false
# Wrap the summary in a crosstalk SharedData object so that the plot and the
# table below are driven by the same data handle.
shared_df <- SharedData$new(my_sum)

# Left column (width 4): interactive 99% CI error bars (mean +/- 2.58 * se).
# Right column (width 8): the same summary as a DT table.
bscols(
  widths = c(4, 8),
  ggplotly(
    (ggplot(shared_df) +
      geom_errorbar(
        aes(
          x = reorder(RACE, -mean),
          ymin = mean - 2.58 * se,
          ymax = mean + 2.58 * se
        ),
        width = 0.2,
        colour = "black",
        alpha = 0.9,
        linewidth = 0.5
      ) +
      geom_point(
        aes(
          x = RACE,
          y = mean,
          # `text` is not drawn by ggplot2; it is picked up by ggplotly()
          # through tooltip = "text". The interval shown uses the 2.58
          # multiplier, so it is labelled as the 99% CI.
          text = paste(
            "Race:", `RACE`,
            "<br>N:", `n`,
            "<br>Avg. Scores:", round(mean, digits = 2),
            "<br>99% CI:[",
            round((mean - 2.58 * se), digits = 2), ",",
            round((mean + 2.58 * se), digits = 2), "]"
          )
        ),
        stat = "identity",
        color = "red",
        size = 1.5,
        alpha = 1
      ) +
      xlab("Race") +
      ylab("Average Scores") +
      theme_minimal() +
      theme(
        axis.text.x = element_text(angle = 45, vjust = 0.5, hjust = 1)
      ) +
      ggtitle("99% Confidence interval of average /<br>maths scores by race")),
    tooltip = "text"
  ),
  # NOTE(review): four display names are supplied while my_sum has five
  # columns (RACE, n, mean, sd, se); confirm the header lines up as intended.
  DT::datatable(
    shared_df,
    rownames = FALSE,
    class = "compact",
    width = "100%",
    options = list(
      pageLength = 10,
      scrollX = TRUE
    ),
    colnames = c(
      "No. of pupils",
      "Avg Scores",
      "Std Dev",
      "Std Error"
    )
  ) %>%
    formatRound(
      columns = c("mean", "sd", "se"),
      digits = 2
    )
)
Visualising Uncertainty: ggdist package
ggdist is an R package that provides a flexible set of ggplot2 geoms and stats designed especially for visualising distributions and uncertainty.
It is designed for both frequentist and Bayesian uncertainty visualization, taking the view that uncertainty visualization can be unified through the perspective of distribution visualization: (1) for frequentist models, one visualises confidence distributions or bootstrap distributions (see vignette("freq-uncertainty-vis")); (2) for Bayesian models, one visualises probability distributions (see the tidybayes package, which builds on top of ggdist).
Visualizing the uncertainty of point estimates: ggdist methods
In the code chunk below, stat_pointinterval() of ggdist is used to build a visual for displaying distribution of maths scores by race.
# Point + interval summary of MATHS for each race, using the defaults of
# stat_pointinterval().
# NOTE(review): the labels say "mean", but no point summary is requested
# explicitly here; confirm the default matches the wording.
exam %>%
  ggplot(aes(
    x = RACE,
    y = MATHS
  )) +
  stat_pointinterval() +
  labs(
    title = "Visualising confidence intervals of mean math score",
    subtitle = "Mean Point + Multiple-interval plot"
  )
For example, in the code chunk below the following arguments are used:
.width = 0.95, .point = median, .interval = qi
# Point + interval summary of MATHS for each race with a single 95% interval,
# a median point and a quantile interval requested explicitly.
# NOTE(review): recent ggdist releases take this as
# `point_interval = "median_qi"`; confirm `.point` / `.interval` are still
# honoured by the installed version.
exam %>%
  ggplot(aes(x = RACE, y = MATHS)) +
  stat_pointinterval(
    .width = 0.95,
    .point = median,
    .interval = qi
  ) +
  labs(
    title = "Visualising confidence intervals of median math score",
    subtitle = "Median Point + Multiple-interval plot"
  )
Visualizing the uncertainty of point estimates: ggdist methods
# Same point + interval summary as before, with the legend suppressed.
exam %>%
  ggplot(aes(
    x = RACE,
    y = MATHS
  )) +
  stat_pointinterval(show.legend = FALSE) +
  labs(
    title = "Visualising confidence intervals of mean math score",
    subtitle = "Mean Point + Multiple-interval plot"
  )
Visualizing the uncertainty of point estimates: ggdist methods
In the code chunk below, stat_gradientinterval() of ggdist is used to build a visual for displaying distribution of maths scores by race.
# Gradient + interval summary of MATHS for each race: the sky-blue fill is
# drawn by stat_gradientinterval() and the legend is kept visible.
exam %>%
  ggplot(aes(
    x = RACE,
    y = MATHS
  )) +
  stat_gradientinterval(
    fill = "skyblue",
    show.legend = TRUE
  ) +
  labs(
    title = "Visualising confidence intervals of mean math score",
    subtitle = "Gradient + interval plot"
  )
Visualising Uncertainty with Hypothetical Outcome Plots (HOPs)
Installing ungeviz package
# ungeviz is installed from its GitHub repository (network access required).
devtools::install_github("wilkelab/ungeviz")
strapgod (NA -> ea2b1ecfc...) [GitHub]
rlang (1.1.5 -> 1.1.6 ) [CRAN]
cli (3.6.3 -> 3.6.5 ) [CRAN]
utf8 (1.2.4 -> 1.2.5 ) [CRAN]
pillar (1.10.1 -> 1.10.2 ) [CRAN]
stringi (1.8.4 -> 1.8.7 ) [CRAN]
R6 (2.5.1 -> 2.6.1 ) [CRAN]
cpp11 (0.5.1 -> 0.5.2 ) [CRAN]
scales (1.3.0 -> 1.4.0 ) [CRAN]
ggplot2 (3.5.1 -> 3.5.2 ) [CRAN]
rlang (1.1.5 -> 1.1.6 ) [CRAN]
cli (3.6.3 -> 3.6.5 ) [CRAN]
utf8 (1.2.4 -> 1.2.5 ) [CRAN]
pillar (1.10.1 -> 1.10.2) [CRAN]
stringi (1.8.4 -> 1.8.7 ) [CRAN]
R6 (2.5.1 -> 2.6.1 ) [CRAN]
cpp11 (0.5.1 -> 0.5.2 ) [CRAN]
package 'rlang' successfully unpacked and MD5 sums checked
package 'cli' successfully unpacked and MD5 sums checked
package 'utf8' successfully unpacked and MD5 sums checked
package 'pillar' successfully unpacked and MD5 sums checked
package 'stringi' successfully unpacked and MD5 sums checked
package 'R6' successfully unpacked and MD5 sums checked
package 'cpp11' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\user.DESKTOP-QJ387K9\AppData\Local\Temp\RtmpQ35YFK\downloaded_packages
── R CMD build ─────────────────────────────────────────────────────────────────
* checking for file 'C:\Users\user.DESKTOP-QJ387K9\AppData\Local\Temp\RtmpQ35YFK\remotes79001fd876d5\DavisVaughan-strapgod-ea2b1ec/DESCRIPTION' ... OK
* preparing 'strapgod':
* checking DESCRIPTION meta-information ... OK
* checking for LF line-endings in source and make files and shell scripts
* checking for empty or unneeded directories
Omitted 'LazyData' from DESCRIPTION
* building 'strapgod_0.0.4.9000.tar.gz'
package 'rlang' successfully unpacked and MD5 sums checked
package 'cli' successfully unpacked and MD5 sums checked
package 'utf8' successfully unpacked and MD5 sums checked
package 'pillar' successfully unpacked and MD5 sums checked
package 'stringi' successfully unpacked and MD5 sums checked
package 'R6' successfully unpacked and MD5 sums checked
package 'cpp11' successfully unpacked and MD5 sums checked
package 'scales' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\user.DESKTOP-QJ387K9\AppData\Local\Temp\RtmpQ35YFK\downloaded_packages
── R CMD build ─────────────────────────────────────────────────────────────────
* checking for file 'C:\Users\user.DESKTOP-QJ387K9\AppData\Local\Temp\RtmpQ35YFK\remotes79007f2a4396\wilkelab-ungeviz-74e1651/DESCRIPTION' ... OK
* preparing 'ungeviz':
* checking DESCRIPTION meta-information ... OK
* checking for LF line-endings in source and make files and shell scripts
* checking for empty or unneeded directories
* building 'ungeviz_0.1.0.tar.gz'
Launch the application in R
library(ungeviz)
Visualising Uncertainty with Hypothetical Outcome Plots (HOPs)
# Hypothetical outcome plot: jittered raw MATHS scores per race in blue, with
# an orange horizontal line layer drawn from resampled data.
ggplot(
  data = exam,
  (aes(
    x = factor(RACE),
    y = MATHS
  ))
) +
  geom_point(
    position = position_jitter(
      height = 0.3,
      width = 0.05
    ),
    size = 0.4,
    color = "#0072B2",
    alpha = 1 / 2
  ) +
  # NOTE(review): sampler(25, group = RACE) presumably produces 25 draws per
  # race, labelled by a `.draw` column; confirm against the ungeviz docs.
  geom_hpline(
    data = sampler(25, group = RACE),
    height = 0.6,
    color = "#D55E00"
  ) +
  theme_bw() +
  # Animate over `.draw`, so each state of the animation shows one draw.
  transition_states(.draw, 1, 3)