Skip to content

Latest commit

 

History

History
221 lines (198 loc) · 8.55 KB

bayesian_updating.md

File metadata and controls

221 lines (198 loc) · 8.55 KB

Bayesian updating demo

Tristan Mahr September 20, 2016

library("dplyr")
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union
library("tidyr")
library("ggplot2")

# We have some IQ scores
iqs <- car::Burt$IQbio
iqs
#>  [1]  82  80  88 108 116 117 132  71  75  93  95  88 111  63  77  86  83
#> [18]  93  97  87  94  96 112 113 106 107  98

# Want to use Bayesian updating to estimate the mean of the data and the sd of
# the data.

# Use a uniform prior across a range of means and SDs
each_mean <- seq(from = 70, to = 130, by = 1)
each_sd <- 10:20
# equal probability
p_mean <- 1 / length(each_mean)
p_sd <- 1 / length(each_sd)

# We are using a grid approximation. Create every combination of mean and sd.
# Joint probability as product of each one's prior probability.
df <- data_frame(each_mean, p_mean) %>%
  expand(nesting(each_mean, p_mean), nesting(each_sd, p_sd)) %>%
  mutate(prior = p_sd * p_mean,
         posterior = prior,
         data_seen = 0)
df
#> # A tibble: 671 × 7
#>    each_mean     p_mean each_sd       p_sd       prior   posterior
#>        <dbl>      <dbl>   <int>      <dbl>       <dbl>       <dbl>
#> 1         70 0.01639344      10 0.09090909 0.001490313 0.001490313
#> 2         70 0.01639344      11 0.09090909 0.001490313 0.001490313
#> 3         70 0.01639344      12 0.09090909 0.001490313 0.001490313
#> 4         70 0.01639344      13 0.09090909 0.001490313 0.001490313
#> 5         70 0.01639344      14 0.09090909 0.001490313 0.001490313
#> 6         70 0.01639344      15 0.09090909 0.001490313 0.001490313
#> 7         70 0.01639344      16 0.09090909 0.001490313 0.001490313
#> 8         70 0.01639344      17 0.09090909 0.001490313 0.001490313
#> 9         70 0.01639344      18 0.09090909 0.001490313 0.001490313
#> 10        70 0.01639344      19 0.09090909 0.001490313 0.001490313
#> # ... with 661 more rows, and 1 more variables: data_seen <dbl>

# Update the posterior given some data
update_distribution <- function(df, data) {
  data_seen <- unique(df$data_seen) + length(data)
  
  df %>%
    # Last posterior is current prior
    select(each_mean, each_sd, prior = posterior) %>%
    # Attach data to each mean-sd pair
    expand(nesting(each_mean, each_sd, prior), points = data) %>%
    # Measure likelihood of data in each mean-sd pair
    group_by(each_mean, each_sd, prior) %>%
    summarise(likelihood = prod(dnorm(points, each_mean, each_sd))) %>%
    ungroup %>%
    # bayes theorem
    mutate(posterior = likelihood * prior / (sum(likelihood * prior)),
           data_seen = data_seen)
}

# Create each sequence of data samples
data_sequence <- Map(seq, 1, seq_along(iqs)) %>%
  lapply(function(xs) iqs[xs])

iq_sequence <- data_sequence %>%
  purrr::map(~ update_distribution(df, .x))

# include 0 data seen
iq_sequence <- c(list(df), iq_sequence)

iq_sequence %>% 
  purrr::map_dbl(~ sum(.x$posterior))
#>  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

df_fit <- iq_sequence %>%
  purrr::accumulate(bind_rows) %>%
  # Create animation steps
  purrr::map(. %>% mutate(step = max(data_seen)) %>%
               # Keep last five data_seen values.
               filter(step - data_seen <= 5) %>%
               # Create a column to control transparency using recency
               mutate(alpha = 1 - ((step - data_seen) / 5))) %>%
  bind_rows
df_fit
#> # A tibble: 102,663 × 10
#>    each_mean     p_mean each_sd       p_sd       prior   posterior
#>        <dbl>      <dbl>   <int>      <dbl>       <dbl>       <dbl>
#> 1         70 0.01639344      10 0.09090909 0.001490313 0.001490313
#> 2         70 0.01639344      11 0.09090909 0.001490313 0.001490313
#> 3         70 0.01639344      12 0.09090909 0.001490313 0.001490313
#> 4         70 0.01639344      13 0.09090909 0.001490313 0.001490313
#> 5         70 0.01639344      14 0.09090909 0.001490313 0.001490313
#> 6         70 0.01639344      15 0.09090909 0.001490313 0.001490313
#> 7         70 0.01639344      16 0.09090909 0.001490313 0.001490313
#> 8         70 0.01639344      17 0.09090909 0.001490313 0.001490313
#> 9         70 0.01639344      18 0.09090909 0.001490313 0.001490313
#> 10        70 0.01639344      19 0.09090909 0.001490313 0.001490313
#> # ... with 102,653 more rows, and 4 more variables: data_seen <dbl>,
#> #   step <dbl>, alpha <dbl>, likelihood <dbl>

# Also want a data-frame of the cumulative data-set as each new point is added.
# This lets us plot a rug of raw data.
data_sequence <- data_sequence %>% 
  purrr::map_df(~ data_frame(iq = .x, data_seen = length(iq))) %>% 
  group_by(data_seen) %>% 
  mutate(step = max(data_seen), sequence = seq_along(data_seen), current = sequence == max(sequence)) %>% 
  select(-sequence) %>% 
  ungroup
data_sequence
#> # A tibble: 378 × 4
#>       iq data_seen  step current
#>    <int>     <int> <int>   <lgl>
#> 1     82         1     1    TRUE
#> 2     82         2     2   FALSE
#> 3     80         2     2    TRUE
#> 4     82         3     3   FALSE
#> 5     80         3     3   FALSE
#> 6     88         3     3    TRUE
#> 7     82         4     4   FALSE
#> 8     80         4     4   FALSE
#> 9     88         4     4   FALSE
#> 10   108         4     4    TRUE
#> # ... with 368 more rows

library("gganimate")

# plot posterior of means, one line per sd level
p <- ggplot(df_fit) +
  aes(x = each_mean, y = posterior, frame = step) +
  geom_line(aes(group = interaction(each_sd, data_seen), alpha = alpha)) +
  ggtitle("Data observed:") +
  xlab("possible mean") +
  ylab("posterior probability") +
  guides(alpha = FALSE) +
  theme_grey(base_size = 14) + 
  geom_rug(aes(x = iq, y = 0), data_sequence, sides = "b") + 
  geom_rug(aes(x = iq, y = 0), filter(data_sequence, current), size = 1.5, 
           sides = "b")

animation::ani.options(interval = 1/10)
g <- gg_animate(p,  ani.height = 400, ani.width = 640)
# g
gg_animate_save(g, filename = "./assets/updating.gif", saver = "gif",
                ani.height = 400, ani.width = 640)
#> Executing: 
#> ""convert" -loop 0 -delay 10 Rplot1.png Rplot2.png Rplot3.png
#>     Rplot4.png Rplot5.png Rplot6.png Rplot7.png Rplot8.png
#>     Rplot9.png Rplot10.png Rplot11.png Rplot12.png Rplot13.png
#>     Rplot14.png Rplot15.png Rplot16.png Rplot17.png Rplot18.png
#>     Rplot19.png Rplot20.png Rplot21.png Rplot22.png Rplot23.png
#>     Rplot24.png Rplot25.png Rplot26.png Rplot27.png Rplot28.png
#>     "updating.gif""
#> Output at: updating.gif


# plot posterior of sds, one line per mean level
p <- ggplot(df_fit) +
  aes(x = each_sd, y = posterior, frame = step) +
  geom_line(aes(group = interaction(each_mean, data_seen), alpha = alpha)) +
  ggtitle("Data observed:") +
  xlab("possible sd") +
  ylab("posterior probability") +
  guides(alpha = FALSE) +
  theme_grey(base_size = 14)

g2 <- gg_animate(p,  ani.height = 400, ani.width = 640)

gg_animate_save(g2, filename = "./assets/updating_sds.gif", saver = "gif",
                ani.height = 400, ani.width = 640)
#> Executing: 
#> ""convert" -loop 0 -delay 10 Rplot1.png Rplot2.png Rplot3.png
#>     Rplot4.png Rplot5.png Rplot6.png Rplot7.png Rplot8.png
#>     Rplot9.png Rplot10.png Rplot11.png Rplot12.png Rplot13.png
#>     Rplot14.png Rplot15.png Rplot16.png Rplot17.png Rplot18.png
#>     Rplot19.png Rplot20.png Rplot21.png Rplot22.png Rplot23.png
#>     Rplot24.png Rplot25.png Rplot26.png Rplot27.png Rplot28.png
#>     "updating_sds.gif""
#> Output at: updating_sds.gif

p <- iq_sequence %>%
  bind_rows %>%
  ggplot(.) +
  aes(x = each_mean, y = each_sd, z = posterior, group = data_seen, frame = data_seen) +
  geom_contour() +
  ggtitle("Data observed:") +
  xlab("possible mean") +
  ylab("possible sd")
g3 <- gg_animate(p, ani.height = 400, ani.width = 640)
#> Warning in grDevices::contourLines(x = sort(unique(data$x)), y =
#> sort(unique(data$y)), : all z values are equal
#> Warning: Not possible to generate contour data

gg_animate_save(g3, filename = "./assets/updating_contour.gif", saver = "gif",
                ani.height = 400, ani.width = 640)
#> Executing: 
#> ""convert" -loop 0 -delay 10 Rplot1.png Rplot2.png Rplot3.png
#>     Rplot4.png Rplot5.png Rplot6.png Rplot7.png Rplot8.png
#>     Rplot9.png Rplot10.png Rplot11.png Rplot12.png Rplot13.png
#>     Rplot14.png Rplot15.png Rplot16.png Rplot17.png Rplot18.png
#>     Rplot19.png Rplot20.png Rplot21.png Rplot22.png Rplot23.png
#>     Rplot24.png Rplot25.png Rplot26.png Rplot27.png
#>     "updating_contour.gif""
#> Output at: updating_contour.gif