12 Plot populations and samples

12.1 ⭐️Overview

This chapter is about plotting out populations (usually of people) and samples (usually of people). This are mostly useful for teaching and training purposes

12.3 📦Load packages

library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(ggplot2)
library(templates)
data("my_colors")

12.4 🔢Simulate population data

I think you have to make it on a grid.

pop <- expand_grid(x = 1:10, y = 1:10)

Add exposed vs. unexposed

Half exposed and half unexposed. If exposed, half have outcome. If unexposed, 10% have outcome.

# Helper function for sampling No and Yes
# sample_ny <- function(n = 100, prob_y = 0.5) {
#   set.seed(123)
#   s <- sample(c("No", "Yes"), n, TRUE, c(1 - prob_y, prob_y))
#   s <- factor(s)
#   s
# }

# For testing
# sample_ny(prob_y = 0.1)
# Helper function for sampling No and Yes - Simplified
sample_ny <- function(n = 100, prob_y = 0.5) {
  sample(c("No", "Yes"), n, TRUE, c(1 - prob_y, prob_y))
}

# For testing
# sample_ny(prob_y = 0.1)
set.seed(123)
pop$exposed <- sample_ny()
pop$outcome <- NA_character_
pop$outcome[pop$exposed == "Yes"] <- sample_ny(n = sum(pop$exposed == "Yes"), prob_y = 0.5)
pop$outcome[pop$exposed == "No"] <- sample_ny(n = sum(pop$exposed == "No"), prob_y = 0.1)
count(pop, exposed, outcome)
## # A tibble: 4 × 3
##   exposed outcome     n
##   <chr>   <chr>   <int>
## 1 No      No         43
## 2 No      Yes         4
## 3 Yes     No         26
## 4 Yes     Yes        27

12.5 Plot

# Pull orange and blue colors from templates package
u_orange <- filter(my_colors, description == "University Orange") %>% pull(hex)
u_blue <- filter(my_colors, description == "University Blue") %>% pull(hex)
ggplot(pop, aes(x, y)) + 
  geom_point(size = 5, aes(color = exposed, shape = outcome)) +
  scale_color_manual("Exposed", values = c(u_blue, u_orange)) +
  theme(
    panel.background = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

12.5.1 Combine legends

To combine the legend, we need to have a single variable with exposure and outcome information.

pop <- pop %>% 
  mutate(
    e_o = case_when(
      exposed == "Yes" & outcome == "Yes" ~ "a",
      exposed == "Yes" & outcome == "No" ~ "b",
      exposed == "No" & outcome == "Yes" ~ "c",
      exposed == "No" & outcome == "No" ~ "d"
    ),
    e_o_f = factor(
      e_o, c("a", "b", "c", "d"),
      c(
        "Exposed - Outcome", "Exposed - No Outcome", 
        "Not exposed - Outcome", "Not exposed - No outcome"
      )
    )
  )
# Helper function for shapes
cir <- 16
tri <- 17
ggplot(pop, aes(x, y, color = e_o_f, shape = e_o_f)) + 
  geom_point(size = 5) +
  scale_color_manual("Exposure-Outcome", values = c(u_orange, u_orange, u_blue, u_blue)) +
  scale_shape_manual("Exposure-Outcome", values = c(tri, cir, tri, cir)) +
  theme(
    panel.background = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

12.5.2 Sample box

Sometimes I want to draw a box around a sample. For example, let’s draw a box around a 5x5 sample in the top right corner.

ggplot(pop, aes(x, y)) + 
  geom_point(size = 5, aes(color = e_o_f, shape = e_o_f)) +
  # Draw sample box
  geom_rect(
    xmin = 0.7, xmax = 5.3, ymin = 5.7, ymax = 10.3,
    alpha = 0, color = "black"
  ) +
  scale_color_manual("Exposure-Outcome", values = c(u_orange, u_orange, u_blue, u_blue)) +
  scale_shape_manual("Exposure-Outcome", values = c(tri, cir, tri, cir)) +
  theme(
    panel.background = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

12.5.3 Align colors and shapes

Sometimes I want the exposed-unexposed to be haphazardly spread around the plot like they are above. Sometimes, I want the exposed next to the exposed and the unexposed next to the unexposed.

pop_arrange <- pop %>% 
  arrange(desc(exposed), desc(outcome)) %>% 
  # Renumber the grid
  mutate(
    x = rep(1:10, each = 10),
    y = rep(1:10, 10)
  )
ggplot(pop_arrange, aes(x, y)) + 
  geom_point(size = 5, aes(color = e_o_f, shape = e_o_f)) +
  scale_color_manual("Exposure-Outcome", values = c(u_orange, u_orange, u_blue, u_blue)) +
  scale_shape_manual("Exposure-Outcome", values = c(tri, cir, tri, cir)) +
  theme(
    panel.background = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

Use facet to make them stand out even more clearly.

ggplot(pop_arrange, aes(x, y)) + 
  geom_point(size = 5, aes(color = e_o_f, shape = e_o_f)) +
  scale_color_manual("Exposure-Outcome", values = c(u_orange, u_orange, u_blue, u_blue)) +
  scale_shape_manual("Exposure-Outcome", values = c(tri, cir, tri, cir)) +
  facet_wrap(vars(e_o_f))+
  theme(
    panel.background = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )

12.6 More helper functions

12.6.1 Create a pop of size x with e prop exposed and o prop with outcome

make_pop <- function(n_total = 100, 
                     prob_exposed, 
                     prob_outcome_exposed,
                     prob_outcome_unexposed,
                     arrange = FALSE) {
  # Figure out the smallest integer that will be at least size
  # n_total when multiplied by 2. The idea is to figure out the dimensions
  # for the closest thing I can get to a square given n_total
  n_sqrt <- sqrt(n_total)
  n_sqrt_ceiling <- ceiling(n_sqrt)
  drop <- n_sqrt_ceiling^2 - n_total
  
  # Make coordinates for grid of points
  pop <- expand_grid(
    x = seq(n_sqrt_ceiling),
    y = seq(n_sqrt_ceiling)
  )
  
  # Drop of n_sqrt is uneven. Drop from bottom right corner.
  # High x, low y.
  pop <- pop %>% 
    arrange(desc(y)) %>% 
    slice(1:(n() - drop)) %>% 
    arrange(x, y)
  
  # I still want y to be base 1
  pop$y <- pop$y + (1 - min(pop$y))
  
  # Add exposed and unexposed
  # Helper function for sampling No and Yes - Simplified
  sample_ny <- function(n = 100, prob_y = 0.5) {
    sample(c("No", "Yes"), n, TRUE, c(1 - prob_y, prob_y))
  }
  
  # Add exposed
  pop$exposed <- sample_ny(n = n_total, prob_y = prob_exposed)
  
  # Add outcome
  pop$outcome <- NA_character_
  n_exp_y <- sum(pop$exposed == "Yes")
  n_exp_n <- sum(pop$exposed == "No")
  pop$outcome[pop$exposed == "Yes"] <- sample_ny(n_exp_y, prob_y = prob_outcome_exposed)
  pop$outcome[pop$exposed == "No"] <- sample_ny(n_exp_n, prob_y = prob_outcome_unexposed)
  
  # Add exposure-outcome group columns
  # To combine the legend, we need to have a single variable with exposure 
  # and outcome information.
  pop <- pop %>% 
    mutate(
      e_o = case_when(
        exposed == "Yes" & outcome == "Yes" ~ "a",
        exposed == "Yes" & outcome == "No" ~ "b",
        exposed == "No" & outcome == "Yes" ~ "c",
        exposed == "No" & outcome == "No" ~ "d"
      ),
      e_o_f = factor(
        e_o, c("a", "b", "c", "d"),
        c(
          "Exposed - Outcome", "Exposed - No Outcome", 
          "Not exposed - Outcome", "Not exposed - No outcome"
        )
      )
    )
  
  # Arrange 
  # Sometimes I want the exposed-unexposed to be haphazardly spread around the 
  # plot. Sometimes, I want the exposed next to the exposed and the unexposed 
  # next to the unexposed.
  if (arrange) {
    # Separate x and y from the rest of the data before arranging
    x_y <- select(pop, x, y)
    pop <- pop %>% 
      select(-x, -y) %>% 
      arrange(desc(exposed), desc(outcome))
    # Add x and y back
    pop <- bind_cols(x_y, pop)
  }
  
  # Return tibble
  pop
}

# For testing
# set.seed(123)
# make_pop(
#   n_total = 100,
#   prob_exposed = 0.5,
#   prob_outcome_exposed = 0.5,
#   prob_outcome_unexposed = 0.1,
#   arrange = FALSE
# )

12.6.2 Create a helper plot function

plot_pop <- function(.data, p_size = 5) {
  
  # Store shape codes
  cir <- 16
  tri <- 17
  
  # Create plot
  p <- ggplot(.data, aes(x, y, color = e_o_f, shape = e_o_f)) + 
    geom_point(size = p_size) +
    scale_color_manual("Exposure-Outcome", values = c(u_orange, u_orange, u_blue, u_blue)) +
    scale_shape_manual("Exposure-Outcome", values = c(tri, cir, tri, cir)) +
    theme(
      panel.background = element_blank(),
      axis.title = element_blank(),
      axis.text = element_blank(),
      axis.ticks = element_blank()
    )
  
  # Return plot object
  p
}

# For testing
# pop_plot(pop, 5)

Test

set.seed(123)
pop <- make_pop(
  n_total = 1000,
  prob_exposed = 0.5,
  prob_outcome_exposed = 0.5,
  prob_outcome_unexposed = 0.1,
  arrange = TRUE
)

pop %>% 
  plot_pop(p_size = 3)

I might actually want to add this to freqtables.

rm(list = ls())