diff --git a/config.yaml b/config.yaml
index b554de4f..c2325c69 100644
--- a/config.yaml
+++ b/config.yaml
@@ -14,7 +14,7 @@ carpentry: 'incubator'
 title: 'Real-time analysis and forecasting for outbreak analytics with R'
 
 # Date the lesson was created (YYYY-MM-DD, this is empty by default)
-created:
+created: 
 
 # Comma-separated list of keywords for the lesson
 keywords: 'forecasts, epidemic models, interventions'
@@ -62,21 +62,23 @@ episodes:
 #- quantify-transmissibility.Rmd
 - create-forecast.Rmd
 - severity-static.Rmd
+- superspreading-estimate.Rmd
+- superspreading-simulate.Rmd
 
 # Information for Learners
-learners:
+learners: 
 
 # Information for Instructors
-instructors:
+instructors: 
 
 # Learner Profiles
-profiles:
+profiles: 
 
 # Customisation ---------------------------------------------
 #
 # This space below is where custom yaml items (e.g. pinning
 # sandpaper and varnish versions) should live
 
+
 varnish: epiverse-trace/varnish@epiversetheme
-# this is carpentries/sandpaper#533 in our fork so we can keep it up to date with main
 sandpaper: epiverse-trace/sandpaper@patch-renv-github-bug
diff --git a/episodes/fig/SEE-dist.png b/episodes/fig/SEE-dist.png
new file mode 100644
index 00000000..c42cc6e0
Binary files /dev/null and b/episodes/fig/SEE-dist.png differ
diff --git a/episodes/fig/SEE-individual-reproductive-number-fig-b.png b/episodes/fig/SEE-individual-reproductive-number-fig-b.png
new file mode 100644
index 00000000..fe4ded4e
Binary files /dev/null and b/episodes/fig/SEE-individual-reproductive-number-fig-b.png differ
diff --git a/episodes/fig/SEE-individual-reproductive-number-fig-c.png b/episodes/fig/SEE-individual-reproductive-number-fig-c.png
new file mode 100644
index 00000000..2e9700ec
Binary files /dev/null and b/episodes/fig/SEE-individual-reproductive-number-fig-c.png differ
diff --git a/episodes/fig/SEE-individual-reproductive-number-fig-d.png b/episodes/fig/SEE-individual-reproductive-number-fig-d.png
new file mode 100644
index 00000000..bbd856ad
Binary files /dev/null and b/episodes/fig/SEE-individual-reproductive-number-fig-d.png differ
diff --git a/episodes/fig/SEE-individual-reproductive-number.png b/episodes/fig/SEE-individual-reproductive-number.png
new file mode 100644
index 00000000..21ec9fcf
Binary files /dev/null and b/episodes/fig/SEE-individual-reproductive-number.png differ
diff --git a/episodes/fig/SEE-params.png b/episodes/fig/SEE-params.png
new file mode 100644
index 00000000..5dcdb7e2
Binary files /dev/null and b/episodes/fig/SEE-params.png differ
diff --git a/episodes/fig/contact-tracing-backward-time.png b/episodes/fig/contact-tracing-backward-time.png
new file mode 100644
index 00000000..f72c31d5
Binary files /dev/null and b/episodes/fig/contact-tracing-backward-time.png differ
diff --git a/episodes/fig/contact-tracing-serial-interval.png b/episodes/fig/contact-tracing-serial-interval.png
new file mode 100644
index 00000000..ed19ecb9
Binary files /dev/null and b/episodes/fig/contact-tracing-serial-interval.png differ
diff --git a/episodes/fig/contact-tracing-strategies.png b/episodes/fig/contact-tracing-strategies.png
new file mode 100644
index 00000000..f63d76c7
Binary files /dev/null and b/episodes/fig/contact-tracing-strategies.png differ
diff --git a/episodes/fig/see-intro-secondary-cases-fig-b.png b/episodes/fig/see-intro-secondary-cases-fig-b.png
new file mode 100644
index 00000000..f86ca9a7
Binary files /dev/null and b/episodes/fig/see-intro-secondary-cases-fig-b.png differ
diff --git a/episodes/fig/see-intro-secondary-cases.png b/episodes/fig/see-intro-secondary-cases.png
new file mode 100644
index 00000000..bf112647
Binary files /dev/null and b/episodes/fig/see-intro-secondary-cases.png differ
diff --git a/episodes/fig/see-intro-superspreading.png b/episodes/fig/see-intro-superspreading.png
new file mode 100644
index 00000000..c74925c0
Binary files /dev/null and b/episodes/fig/see-intro-superspreading.png differ
diff --git a/episodes/fig/see-nature04153_Fig2-c.jpg b/episodes/fig/see-nature04153_Fig2-c.jpg
new file mode 100644
index 00000000..1edbfe75
Binary files /dev/null and b/episodes/fig/see-nature04153_Fig2-c.jpg differ
diff --git a/episodes/fig/see-nature04153_Fig2.jpg b/episodes/fig/see-nature04153_Fig2.jpg
new file mode 100644
index 00000000..68f62469
Binary files /dev/null and b/episodes/fig/see-nature04153_Fig2.jpg differ
diff --git a/episodes/superspreading-estimate.Rmd b/episodes/superspreading-estimate.Rmd
new file mode 100644
index 00000000..5ec417ad
--- /dev/null
+++ b/episodes/superspreading-estimate.Rmd
@@ -0,0 +1,882 @@
+---
+title: 'Account for superspreading'
+teaching: 30
+exercises: 2
+---
+
+```{r setup, echo= FALSE, message = FALSE, warning = FALSE}
+# {webshot} is used further down to screenshot the saved contact network HTML
+library(webshot)
+# webshot drives PhantomJS; force = TRUE reinstalls it even if already present
+webshot::install_phantomjs(force = TRUE)
+```
+
+
+:::::::::::::::::::::::::::::::::::::: questions 
+
+- How can we estimate individual-level variation in transmission (i.e. superspreading potential) from contact tracing data?
+- What are the implications of variation in transmission for decision-making?
+
+::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::: objectives
+
+- Estimate the distribution of onward transmission from infected individuals (i.e. offspring distribution) from outbreak data using `{epicontacts}`.
+- Estimate the extent of individual-level variation (i.e. the dispersion parameter) of the offspring distribution using `{fitdistrplus}`.
+- Estimate the proportion of transmission that is linked to 'superspreading events' using `{superspreading}`.
+
+::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::: prereq
+
+## Prerequisites
+
+Learners should familiarise themselves with the following concept dependencies before working through this tutorial:
+
+**Statistics**: common probability distributions, particularly Poisson and negative binomial.
+
+**Epidemic theory**: The reproduction number, R.
+
+:::::::::::::::::::::::::::::::::
+
+## Introduction
+
+<!-- we know -->
+
+From smallpox to severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2), some infected individuals spread infection to more people than others. Disease transmission is the result of a combination of biological and social factors, and these factors average out to some extent at the population level during a large epidemic. Hence researchers often use population averages to assess the potential for disease to spread. However, in the earlier or later phases of an outbreak, individual differences in infectiousness can be more important. In particular, they increase the chance of superspreading events (SSEs), which can ignite explosive epidemics and also influence the chances of controlling transmission ([Lloyd-Smith et al., 2005](https://www.nature.com/articles/nature04153)).
+
+![**Chains of SARS-CoV-2 transmission in Hong Kong initiated by local or imported cases.** (**a**), Transmission network of a cluster of cases traced back to a collection of four bars across Hong Kong (n = 106). (**b**), Transmission network associated with a wedding without clear infector–infectee pairs but linked back to a preceding social gathering and local source (n = 22). (**c**), Transmission network associated with a temple cluster of undetermined source (n = 19). (**d**), All other clusters of SARS-CoV-2 infections where the source and transmission chain could be determined ([Adam et al., 2020](https://www.nature.com/articles/s41591-020-1092-0)).](fig/see-intro-superspreading.png)
+
+<!-- we dont know -->
+
+The [basic reproduction number](../learners/reference.md#basic), $R_{0}$, measures the average number of cases caused by one infectious individual in an entirely susceptible population. Estimates of $R_{0}$ are useful for understanding the average dynamics of an epidemic at the population-level, but can obscure considerable individual variation in infectiousness. This was highlighted during the global emergence of SARS-CoV-2 by numerous ‘superspreading events’ in which certain infectious individuals generated unusually large numbers of secondary cases ([Leclerc et al., 2020](https://wellcomeopenresearch.org/articles/5-83)).
+
+![**Observed offspring distribution of SARS-CoV-2 transmission in Hong Kong.** N = 91 SARS-CoV-2 infectors, N = 153 terminal infectees and N = 46 sporadic local cases. Histogram bars indicate the proportion of onward transmission per amount of secondary cases. Line corresponds to a fitted negative binomial distribution ([Adam et al., 2020](https://www.nature.com/articles/s41591-020-1092-0)).](fig/see-intro-secondary-cases-fig-b.png){alt='R = 0.58 and k = 0.43.'}
+
+<!-- we want -->
+
+In this tutorial, we are going to quantify individual variation in transmission, and hence estimate the potential for superspreading events. Then we are going to use these estimates to explore the implications of superspreading for contact tracing interventions.
+
+We are going to use data from the `{outbreaks}` package, manage the linelist and contacts data using `{epicontacts}`, and estimate distribution parameters with `{fitdistrplus}`. Lastly, we are going to use `{superspreading}` to explore the implications of variation in transmission for decision-making.
+
+We’ll use the pipe `%>%` to connect some of the functions from these packages, so let’s also call the `{tidyverse}` package.
+
+```{r,message=FALSE,warning=FALSE}
+library(outbreaks)
+library(epicontacts)
+library(fitdistrplus)
+library(superspreading)
+library(tidyverse)
+```
+
+::::::::::::::::::: checklist
+
+### The double-colon
+
+The double-colon `::` in R lets you call a specific function from a package without loading the entire package into the current environment.
+
+For example, `dplyr::filter(data, condition)` uses `filter()` from the `{dplyr}` package.
+
+This helps us remember package functions and avoid namespace conflicts.
+
+:::::::::::::::::::
+
+## The individual reproduction number
+
+The individual reproduction number is defined as the number of secondary cases caused by a particular infected individual. 
+
+Early in an outbreak we can use contact data to reconstruct transmission chains (i.e. who infected whom) and calculate the number of secondary cases generated by each individual. This reconstruction of linked transmission events from contact data can provide an understanding about how different individuals have contributed to transmission during an epidemic ([Cori et al., 2017](https://royalsocietypublishing.org/doi/10.1098/rstb.2016.0371)).
+
+Let's practice this using the `mers_korea_2015` linelist and contact data from the `{outbreaks}` package and integrate them with the `{epicontacts}` package to calculate the distribution of secondary cases during the 2015 MERS-CoV outbreak in South Korea ([Campbell, 2022](https://community.appliedepi.org/t/estimating-the-degree-of-super-spreading-from-transmission-chain-data/103/2)):
+
+```{r}
+## first, make an epicontacts object
+## each contact row is an infector (`from`) -> infectee (`to`) pair, so the
+## network is declared as directed (the default is an undirected network)
+epi_contacts <-
+  epicontacts::make_epicontacts(
+    linelist = outbreaks::mers_korea_2015$linelist,
+    contacts = outbreaks::mers_korea_2015$contacts,
+    directed = TRUE
+  )
+```
+
+```{r,eval=FALSE}
+# visualise contact network
+# (shown to learners only: this chunk is not evaluated when the lesson builds)
+epicontacts::vis_epicontacts(epi_contacts)
+```
+
+```{r,echo=FALSE}
+# visualise contact network
+# hidden chunk: builds the same network as the chunk shown to learners, saves
+# it as an HTML file, and screenshots that file for the rendered lesson
+network <-
+  vis_epicontacts(epi_contacts) %>%
+  visNetwork::visPhysics(solver = "barnesHut")
+
+# file name of the saved network (written to the current working directory)
+fname <- "network.html"
+
+visNetwork::visSave(network, fname)
+
+# delay = 5 waits five seconds before the screenshot, presumably so the
+# physics layout has settled; zoom = 10 raises the image resolution
+webshot::webshot(
+  fname,
+  delay = 5,
+  zoom = 10
+)
+```
+
+::::::::::::::::::::::::::: spoiler
+
+### Is contact data tidy?
+
+Contact data from a transmission chain can provide information on which infected individuals came into contact with others. We expect to have the infector (`from`) and the infectee (`to`) plus additional columns of variables related to their contact, such as location (`exposure`) and date of contact.
+
+Following [tidy data](https://tidyr.tidyverse.org/articles/tidy-data.html#tidy-data) principles, the observation unit in our contact dataset is the **infector-infectee** pair. Although one infector can infect multiple infectees, from contact tracing investigations we may record contacts linked to more than one infector (e.g. within a household). But we should expect to have unique infector-infectee pairs, because typically each infected person will have acquired the infection from one other.
+
+To ensure these unique pairs, we can check on replicates for infectees:
+
+```{r}
+# list infectees (`to`) that appear in more than one contact row;
+# an empty result means every infectee has a single recorded infector,
+# i.e. no infector-infectee pairs are replicated
+epi_contacts %>%
+  pluck("contacts") %>%
+  group_by(to) %>%
+  filter(n() > 1)
+```
+
+:::::::::::::::::::::::::::
+
+When each infector-infectee row is unique, the number of entries per infector corresponds to the number of secondary cases generated by that individual.
+
+```{r}
+# tally the contact rows that name each individual as the infector (`from`);
+# the tally is stored in a column called `secondary_cases`
+infector_secondary <- count(
+  pluck(epi_contacts, "contacts"),
+  from,
+  name = "secondary_cases"
+)
+```
+
+But this output only contains number of secondary cases for reported infectors, not for each of the individuals in the whole `epicontacts` object.
+
+To get this, first, we can use `epicontacts::get_id()` to get the full list of unique identifiers ("id") from the `epicontacts` class object. Second, join it with the count secondary cases per infector stored in the `infector_secondary` object. Third, replace the missing values with `0` to express no report of secondary cases from them.
+
+```{r,message=FALSE,warning=FALSE}
+all_secondary <- epi_contacts %>%
+  # extract ids in contact *and* linelist using "which" argument
+  epicontacts::get_id(which = "all") %>%
+  # transform vector to dataframe to use left_join()
+  enframe(name = NULL, value = "from") %>%
+  # join the count of secondary cases per infector; the key is stated
+  # explicitly instead of letting left_join() guess the shared column
+  left_join(infector_secondary, by = "from") %>%
+  # individuals never reported as an infector have no count after the join:
+  # replace their missing value with zero secondary cases
+  replace_na(
+    replace = list(secondary_cases = 0)
+  )
+```
+
+From a histogram of the `all_secondary` object, we can identify the **individual-level variation** in the number of secondary cases. Three cases were related to more than 20 secondary cases, while the remaining cases were related to fewer than five or zero secondary cases.
+
+```{r,echo=FALSE,eval=FALSE}
+# arrange in descending order of secondary cases
+# (kept for reference only: this chunk is neither shown nor evaluated)
+all_secondary %>%
+  arrange(desc(secondary_cases))
+```
+
+<!-- Visualizing the number of secondary cases on a histogram will help us to relate this with the statistical distribution to fit: -->
+
+```{r}
+## plot the distribution
+## binwidth = 1 gives one bar per integer number of secondary cases
+ggplot(
+  data = all_secondary,
+  mapping = aes(x = secondary_cases)
+) +
+  geom_histogram(binwidth = 1) +
+  labs(
+    x = "Number of secondary cases",
+    y = "Frequency"
+  )
+```
+
+The number of secondary cases can be used to _empirically_ estimate the **offspring distribution**, which is the number of secondary _infections_ caused by each case. One candidate statistical distribution used to model the offspring distribution is the **negative binomial** distribution with two parameters:
+
+- **Mean**, which represents the $R_{0}$, the average number of (secondary) cases produced by a single individual in an entirely susceptible population, and
+
+- **Dispersion**, expressed as $k$, which represents the individual-level variation in transmission by single individuals.
+
+```{r,echo=FALSE,message=FALSE,warning=FALSE}
+# Load the SARS offspring distribution parameters from {epiparameter}
+# (objects are named after the disease actually queried)
+sars_offspring <- epiparameter::epidist_db(
+  disease = "SARS",
+  epi_dist = "offspring distribution",
+  single_epidist = TRUE
+)
+# named vector of parameters; "mean" and "dispersion" are used below
+sars_params <- epiparameter::get_parameters(sars_offspring)
+# sars_params
+
+# calculate density fit: negative binomial pmf at 0, 1, ..., 40
+fit_density <-
+  tibble(quantile = 0:40) %>%
+  mutate(
+    density = dnbinom(
+      x = quantile,
+      mu = sars_params["mean"],
+      size = sars_params["dispersion"]
+    )
+  )
+
+# plot offspring distribution with density fit
+ggplot() +
+  geom_line(
+    data = fit_density,
+    mapping =
+      aes(
+        x = quantile,
+        y = density
+      )
+  ) +
+  # dashed vertical line at the mean of the offspring distribution
+  geom_vline(
+    aes(xintercept = sars_params["mean"]),
+    lty = 2
+  ) +
+  # text label placed 5 units to the right of the mean line
+  annotate(
+    "text", label = "mean Ro",
+    x = sars_params["mean"] + 5,
+    y = 0.5,
+    size = 8,
+    colour = "red"
+  ) +
+  labs(
+    x = "Individual reproduction number",
+    y = "Density"
+  )
+```
+
+From the histogram and density plot, we can identify that the offspring distribution is highly skewed or **overdispersed**. In this framework, the superspreading events (SSEs) are not arbitrary or exceptional, but simply realizations from the right-hand tail of the offspring distribution, which we can quantify and analyse ([Lloyd-Smith et al., 2005](https://www.nature.com/articles/nature04153)).
+
+::::::::::::::::::::::::::: callout
+
+### Terminology recap
+
+- From linelist and contact data, we calculate the **number of secondary cases** caused by the observed infected individuals.
+- Whereas $R_{0}$ captures the average transmission in the population, we can define the **individual reproduction number** as a random variable representing the _expected_ number of secondary cases caused by an infected individual.
+- From the stochastic effects in transmission, the number of secondary _infections_ caused by each case is described by an **offspring distribution**.
+- An _empirical_ offspring distribution can be modeled by the **negative-binomial** distribution with mean $R_{0}$ and dispersion parameter $k$.
+
+:::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::: spoiler
+
+### Poisson, overdispersion, and Negative Binomial
+
+<!-- distribution stories -->
+
+For occurrences of associated discrete events we can use **Poisson** or negative binomial discrete distributions.
+
+In a Poisson distribution, mean is equal to variance. But when variance is higher than the mean, this is called **overdispersion**. In biological applications, overdispersion occurs and so a negative binomial may be worth considering as an alternative to Poisson distribution.
+
+The **negative binomial** distribution is especially useful for discrete data over an unbounded positive range whose sample variance exceeds the sample mean. In such terms, the observations are overdispersed with respect to a Poisson distribution, for which the mean is equal to the variance.
+
+In epidemiology, the [negative binomial](https://en.wikipedia.org/wiki/Negative_binomial_distribution) distribution has been used to model disease transmission for infectious diseases where the likely number of onward infections may vary considerably from individual to individual and from setting to setting, capturing all variation in infectious histories of individuals, including properties of the biological (i.e. degree of viral shedding) and environmental circumstances (e.g. type and location of contact).
+
+:::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::: challenge
+
+Calculate the distribution of secondary cases for Ebola using the `ebola_sim_clean` object from `{outbreaks}` package.
+
+Is the offspring distribution of Ebola skewed or overdispersed?
+
+:::::::::::::::::: hint
+
+**Note:** This dataset has `r nrow(ebola_sim_clean$linelist)` cases. Running `epicontacts::vis_epicontacts()` may overload your session!
+
+::::::::::::::::::
+
+:::::::::::::::::: solution
+
+```{r,message=FALSE,warning=FALSE}
+## first, make an epicontacts object
+## (data sets are referenced with `outbreaks::`, as in the MERS example)
+ebola_contacts <-
+  epicontacts::make_epicontacts(
+    linelist = outbreaks::ebola_sim_clean$linelist,
+    contacts = outbreaks::ebola_sim_clean$contacts
+  )
+
+# count secondary cases per infector
+
+ebola_infector_secondary <- ebola_contacts %>%
+  pluck("contacts") %>%
+  count(from, name = "secondary_cases")
+
+ebola_secondary <- ebola_contacts %>%
+  # extract ids in contact *and* linelist using "which" argument
+  epicontacts::get_id(which = "all") %>%
+  # transform vector to dataframe to use left_join()
+  enframe(name = NULL, value = "from") %>%
+  # join the count of secondary cases per infector; the key is stated
+  # explicitly instead of letting left_join() guess the shared column
+  left_join(ebola_infector_secondary, by = "from") %>%
+  # individuals never reported as an infector have no count after the join:
+  # replace their missing value with zero secondary cases
+  replace_na(
+    replace = list(secondary_cases = 0)
+  )
+
+## plot the distribution
+ebola_secondary %>%
+  ggplot(aes(secondary_cases)) +
+  geom_histogram(binwidth = 1) +
+  labs(
+    x = "Number of secondary cases",
+    y = "Frequency"
+  )
+```
+
+From a visual inspection, the distribution of secondary cases for the Ebola data set in `ebola_sim_clean` shows a skewed distribution with secondary cases equal to or lower than 6. We need to complement this observation with a statistical analysis to evaluate for overdispersion.
+
+::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::
+
+## Estimate the dispersion parameter
+
+To empirically estimate the dispersion parameter $k$, we could fit a negative binomial distribution to the number of secondary cases.
+
+We can fit distributions to data using the `{fitdistrplus}` package, which provides maximum likelihood estimates.
+
+```{r}
+library(fitdistrplus)
+
+## fit a negative binomial distribution to the secondary case counts
+offspring_fit <- fitdist(
+  data = all_secondary$secondary_cases,
+  distr = "nbinom"
+)
+
+offspring_fit
+```
+
+:::::::::::::::::::::::::::::::: callout
+
+### Name of parameters
+
+From the `{fitdistrplus}` output:
+
+- The `size` object refers to the estimated dispersion parameter $k$, and
+- The `mu` object refers to the estimated mean, which represents the $R_{0}$.
+
+::::::::::::::::::::::::::::::::
+
+```{r,echo=FALSE}
+## extract the "size" parameter
+## (the point estimate of the dispersion parameter k)
+mid <- offspring_fit$estimate[["size"]]
+
+## calculate the 95% confidence intervals using the standard error estimate and
+## the 0.025 and 0.975 quantiles of the normal distribution.
+## NOTE: this symmetric interval is not truncated at zero, so `lower` can be
+## negative when the standard error is large relative to the estimate.
+lower <- mid + offspring_fit$sd[["size"]] * qnorm(0.025)
+upper <- mid + offspring_fit$sd[["size"]] * qnorm(0.975)
+```
+
+From the distribution of the number of secondary cases we estimated a dispersion parameter $k$ of
+`r round(mid, 3)`, with a 95% Confidence Interval from `r round(lower, 3)` to `r round(upper, 3)`. As the value of $k$ is significantly lower than one, we can conclude that there is considerable potential for superspreading events.
+
+We can overlap the estimated density values of the fitted negative binomial distribution and the histogram of the number of secondary cases:
+
+```{r,echo=FALSE}
+# evaluate the fitted negative binomial pmf at 0, 1, ..., 40 secondary cases
+fit_density <-
+  tibble(quantile = 0:40) %>%
+  mutate(
+    density = dnbinom(
+      x = quantile,
+      mu = offspring_fit$estimate[["mu"]],
+      size = mid
+    ),
+    # constant label, mapped to colour below so the points get a legend entry
+    label = "Fitted\nnegative\nbinomial\ndistribution"
+  )
+
+# overlay the fitted pmf (points + line) on the empirical histogram;
+# after_stat(density) rescales the bars so both layers share the y-axis
+ggplot() +
+  geom_histogram(
+    data = all_secondary,
+    mapping = aes(x = secondary_cases, y = after_stat(density)),
+    binwidth = 1,
+    fill = "white",
+    color = "black"
+  ) +
+  geom_point(
+    data = fit_density,
+    mapping = aes(x = quantile, y = density, color = label),
+    alpha = 0.5
+  ) +
+  geom_line(
+    data = fit_density,
+    mapping = aes(x = quantile, y = density),
+    color = "red",
+    alpha = 0.5
+  ) +
+  labs(
+    x = "Number of secondary cases",
+    y = "Density",
+    color = "Legend"
+  ) +
+  theme_bw()
+```
+
+:::::::::::::::::::: callout
+
+### Individual-level variation in transmission
+
+The individual-level variation in transmission is defined by the relationship between the mean ($R_{0}$), dispersion ($k$), and the variance of a negative binomial distribution.
+
+The negative binomial model has $variance = R_{0}(1+\frac{R_{0}}{k})$, so smaller values of $k$ indicate greater variance and, consequently, greater **individual-level variation** in transmission.
+
+$$\uparrow variance = R_{0}(1+\frac{R_{0}}{\downarrow k})$$
+
+When $k$ approaches infinity ($k \rightarrow \infty$) the variance equals the mean (because $\frac{R_{0}}{\infty}=0$). This makes the conventional Poisson model a special case of the negative binomial model.
+
+::::::::::::::::::::
+
+::::::::::::::::::::::: challenge
+
+Use the distribution of secondary cases from the `ebola_sim_clean` object from `{outbreaks}` package.
+
+Fit a negative binomial distribution to estimate the mean and dispersion parameter of the offspring distribution.
+
+Does the estimated dispersion parameter of Ebola provide evidence of an individual-level variation in transmission?
+
+:::::::::::::: hint
+
+Review how we fitted a negative binomial distribution using the `fitdistrplus::fitdist()` function.
+
+::::::::::::::
+
+:::::::::::::: solution
+
+```{r}
+## fit a negative binomial distribution to the Ebola secondary case counts
+ebola_offspring <- fitdist(
+  data = ebola_secondary$secondary_cases,
+  distr = "nbinom"
+)
+
+ebola_offspring
+```
+
+```{r}
+## extract the "size" parameter
+## (the point estimate of the dispersion parameter k for Ebola)
+ebola_mid <- ebola_offspring$estimate[["size"]]
+
+## calculate the 95% confidence intervals using the standard error estimate and
+## the 0.025 and 0.975 quantiles of the normal distribution.
+## NOTE: this symmetric interval is not truncated at zero, so the lower bound
+## can be negative when the standard error is large relative to the estimate.
+
+ebola_lower <- ebola_mid + ebola_offspring$sd[["size"]] * qnorm(0.025)
+
+ebola_upper <- ebola_mid + ebola_offspring$sd[["size"]] * qnorm(0.975)
+
+# uncomment to print the estimate and its interval bounds
+# ebola_mid
+# ebola_lower
+# ebola_upper
+```
+
+From the distribution of the number of secondary cases we estimated a dispersion parameter $k$ of
+`r round(ebola_mid, 3)`, with a 95% Confidence Interval from `r round(ebola_lower, 3)` to `r round(ebola_upper, 3)`.
+
+For dispersion parameter estimates higher than one we get low distribution variance, hence, low individual-level variation in transmission.
+
+But does this mean that the secondary case distribution does not have superspreading events (SSEs)? You will later find one additional challenge: How do you define an SSE threshold for Ebola?
+
+::::::::::::::
+
+:::::::::::::: solution
+
+### Select the best model
+
+We can use the maximum likelihood estimates from `{fitdistrplus}` to compare different models and assess fit performance using estimators like the AIC and BIC. Read further in the vignette on [Estimate individual-level transmission](https://epiverse-trace.github.io/superspreading/articles/estimate_individual_level_transmission.html) and use the `{superspreading}` helper function `ic_tbl()` for this!
+
+::::::::::::::
+
+:::::::::::::::::::::::
+
+:::::::::::::::::::::::::::::::::: checklist
+
+### The dispersion parameter across diseases
+
+Research into sexually transmitted and vector-borne diseases has previously suggested a '20/80' rule, with 20% of individuals contributing at least 80% of the transmission potential ([Woolhouse et al., 1997](https://www.pnas.org/doi/10.1073/pnas.94.1.338)). On its own, the dispersion parameter $k$ is hard to interpret intuitively, and hence converting it into a proportional summary can enable easier comparison. When we consider a wider range of pathogens, we can see there is no hard and fast rule for the percentage that generates 80% of transmission, but variation does emerge as a common feature of infectious diseases.
+
+- When the 20% most infectious cases contribute to the 80% of transmission (or more), there is a high individual-level variation in transmission, with a highly overdispersed offspring distribution ($k<0.3$), e.g., SARS-1.
+
+- When the 20% most infectious cases contribute to the ~50% of transmission, there is a low individual-level variation in transmission, with a moderately dispersed offspring distribution ($k > 3$), e.g. Ebola.
+
+```{r,message=FALSE,warning=FALSE,echo=FALSE}
+library(epiparameter)
+library(superspreading)
+library(tidyverse)
+
+# list of diseases with offspring distribution
+# result: a one-column tibble (`disease`) with one row per distinct disease
+# NOTE(review): select() is namespaced with dplyr::, presumably because
+# {fitdistrplus} attaches {MASS}, which also exports select() - confirm
+epidist_string <- epidist_db(
+  epi_dist = "offspring distribution"
+) %>%
+  list_distributions() %>%
+  dplyr::select(disease) %>%
+  distinct() %>%
+  as_tibble()
+
+# get percent of cases that cause percent of transmission
+# result: one row per disease and per value of `percent_transmission`, with
+# the matching proportion of cases stored in `percent_cases`
+across_offspring <- epidist_string %>%
+  # add column list of epidist objects
+  # (each disease name is passed as the first argument of epidist_db())
+  mutate(
+    epidist_out =
+      map(
+        .x = disease,
+        .f = epiparameter::epidist_db,
+        epi_dist = "offspring distribution",
+        single_epidist = TRUE
+      )
+  ) %>%
+  # get parameters
+  mutate(
+    epidist_params =
+      map(
+        .x = epidist_out,
+        .f = epiparameter::get_parameters
+      )
+  ) %>%
+  # unnest parameters
+  # (the columns `mean` and `dispersion` used below come from this step)
+  unnest_wider(col = epidist_params) %>%
+  # to each disease, add sequence from 0.01 to 1 (proportion of transmission)
+  expand_grid(percent_transmission = seq(from = 0.01, to = 1, by = 0.01)) %>%
+  # estimate proportion of cases responsible of proportion of transmission (row)
+  # the selected columns are renamed to R, k and percent_transmission so that
+  # pmap() passes them to proportion_transmission() as named arguments
+  mutate(
+    transmission_output =
+      pmap(
+        .l = dplyr::select(., R = mean, k = dispersion, percent_transmission),
+        .f = superspreading::proportion_transmission,
+        format_prop = FALSE,
+        simulate = TRUE # use a numerical simulation
+      )
+  ) %>%
+  # unnest proportion of cases results
+  unnest_wider(col = transmission_output) %>%
+  # move each result to one column
+  # presumably each row has a value in only one `prop_*` column, so the
+  # row-wise sum with na.rm = TRUE just picks that value - TODO confirm
+  rowwise() %>%
+  mutate(
+    percent_cases =
+      sum(
+        c_across(cols = starts_with("prop_")),
+        na.rm = TRUE
+      )
+  ) %>%
+  dplyr::select(-starts_with("prop_")) %>%
+  # drop the row-wise grouping added by rowwise()
+  ungroup()
+
+# get a position to the ggplot text annotation
+# per disease, keep the row with the largest `percent_transmission` strictly
+# between 0.85 and 0.98; that point anchors the disease label on the curve
+across_offspring_tip <- across_offspring %>%
+  group_by(disease) %>%
+  filter(percent_transmission < 0.98, percent_transmission > 0.85) %>%
+  slice_max(percent_transmission) %>%
+  ungroup() %>%
+  # shorten any disease name containing "Hantavirus" or "Ebola" to that word
+  mutate(disease = case_when(
+    str_detect(disease, stringr::fixed("Hantavirus")) ~ "Hantavirus",
+    str_detect(disease, stringr::fixed("Ebola")) ~ "Ebola",
+    TRUE ~ disease
+  ))
+
+# plot x: proportion of cases, y: proportion of transmission
+# one curve per disease, coloured by its dispersion parameter
+across_offspring %>%
+  ggplot() +
+  geom_line(
+    aes(
+      x = percent_cases,
+      y = percent_transmission,
+      color = dispersion,
+      group = disease
+    )
+  ) +
+  # disease labels, anchored at the points stored in across_offspring_tip
+  geom_text(
+    data = across_offspring_tip,
+    aes(
+      x = percent_cases,
+      y = percent_transmission,
+      label = disease
+    ),
+    hjust = 0.0,
+    vjust = 1.0,
+    angle = 25,
+    size = 3
+  ) +
+  scale_y_continuous(breaks = scales::breaks_pretty(n = 5)) +
+  # colour scale on a log10 transformation of the dispersion parameter
+  colorspace::scale_color_continuous_diverging(trans = "log10", rev = TRUE) +
+  labs(
+    x = "Proportion of infectious cases (ranked)",
+    y = "Expected proportion of transmission",
+    color = "Dispersion\nparameter (k)"
+  ) +
+  # geom_hline(aes(yintercept = 0.8),lty = 3) +
+  # dashed vertical line at 20% of cases (the "20" of the 20/80 rule)
+  geom_vline(aes(xintercept = 0.2), lty = 2) +
+  # one unit on the x axis is drawn as long as one unit on the y axis
+  coord_fixed(ratio = 1)
+```
+
+
+::::::::::::::::::::::::::::::::::
+
+## Controlling superspreading with contact tracing
+
+During an outbreak, it is common to try and reduce transmission by identifying people who have come into contact with an infected person, then quarantine them in case they subsequently turn out to be infected. Such contact tracing can be deployed in multiple ways. 'Forward' contact tracing targets downstream contacts who may have been infected by a newly identified infection (i.e. the 'index case'). 'Backward' tracing instead tracks the upstream primary case who infected the index case (or a setting or event at which the index case was infected), for example by retracing history of contact to the likely point of exposure. This makes it possible to identify others who were also potentially infected by this earlier primary case.
+
+In the presence of individual-level variation in transmission, i.e., with an overdispersed offspring distribution, if this primary case is identified, a larger fraction of the transmission chain can be detected by forward tracing each of the contacts of this primary case  ([Endo et al., 2020](https://wellcomeopenresearch.org/articles/5-239/v3)).
+
+![Schematic representation of contact tracing strategies. Black arrows indicate the directions of transmission; blue and orange arrows indicate successful and failed contact tracing, respectively. When there is evidence of individual-level variation in transmission, often resulting in superspreading, backward contact tracing from the index case (blue circle) increases the probability of finding the primary case (green circle) or clusters with a larger fraction of cases, potentially increasing the number of quarantined cases (yellow circles). [Claire Blackmore, 2021](https://www.paho.org/sites/default/files/backward_contact_tracing_v3_0.pdf)](fig/contact-tracing-strategies.png)
+
+When there is evidence of individual-level variation (i.e. overdispersion), often resulting in so-called superspreading events, a large proportion of infections may be linked to a small proportion of original clusters. As a result, finding and targeting originating clusters in combination with reducing onwards infection may substantially enhance the effectiveness of tracing methods ([Endo et al., 2020](https://wellcomeopenresearch.org/articles/5-239/v3)). 
+
+Empirical evidence evaluating the efficiency of backward tracing found that it led to 42% more cases being identified than forward tracing, supporting its implementation when rigorous suppression of transmission is justified ([Raymenants et al., 2022](https://www.nature.com/articles/s41467-022-32531-6)).
+
+
+## Probability of cases in a given cluster
+
+Using `{superspreading}`, we can estimate the probability of having a cluster of secondary infections caused by a primary case identified by backward tracing of size $X$ or larger ([Endo et al., 2020](https://wellcomeopenresearch.org/articles/5-239/v3)).
+
+```{r}
+# Set seed for random number generator
+set.seed(33)
+
+# Estimate the probability of having a cluster of
+# 5, 10, or 25 (or more) secondary cases from a primary case,
+# given the reproduction number (R) and dispersion parameter (k).
+# `offspring_fit` is not created in this chunk; it must already
+# exist in the session, with "mu" and "size" among its estimates.
+proportion_cluster_size(
+  R = offspring_fit$estimate["mu"],
+  k = offspring_fit$estimate["size"],
+  cluster_size = c(5, 10, 25)
+)
+```
+
+```{r,echo=FALSE,message=FALSE,warning=FALSE}
+# Same seed as the displayed chunk, so the hidden value
+# matches the table shown to learners
+set.seed(33)
+
+# Cluster-size probabilities for clusters of 5, 10 and 25 cases
+cluster_probability <- proportion_cluster_size(
+  cluster_size = c(5, 10, 25),
+  R = offspring_fit$estimate["mu"],
+  k = offspring_fit$estimate["size"]
+)
+
+# Value for the cluster size of 25, extracted directly
+# from its column for use in the inline text
+cluster_probability_percent <- cluster_probability[["prop_25"]]
+```
+
+
+Even though we have an $R<1$, a highly overdispersed offspring distribution ($k=0.02$) means that if we detect a new case, there is a `r cluster_probability_percent` probability they originated from a cluster of 25 infections or more. Hence, by following a backwards strategy, contact tracing efforts will increase the probability of successfully containing and quarantining this large number of earlier infected individuals, rather than simply focusing on the new case, who is likely to have infected nobody (because $k$ is very small).
+
+We can also use this number to prevent gatherings of a certain size, reducing the epidemic by preventing potential superspreading events. Interventions can aim to reduce the reproduction number in order to reduce the probability of having clusters of secondary cases.
+
+
+::::::::::::::::::::::::::::::::: challenge
+
+### Backward contact tracing for Ebola
+
+Use the Ebola estimated parameters for `ebola_sim_clean` object from `{outbreaks}` package.
+
+Calculate the probability of having a cluster of secondary infections caused by a primary case identified by backward tracing of size 5, 10, 25 or larger.
+
+Would implementing a backward strategy at this stage of the Ebola outbreak increase the probability of containing and quarantining more onward cases?
+
+:::::::::::::::: hint
+
+Review how we estimated the probability of having clusters of a fixed size, given an offspring distribution mean and dispersion parameters, using the `superspreading::proportion_cluster_size()` function.
+
+::::::::::::::::
+
+:::::::::::::::: solution
+
+```{r}
+# Estimate, for Ebola, the probability of having a cluster of
+# 5, 10, or 25 (or more) secondary cases from a primary case,
+# given the reproduction number (R) and dispersion parameter (k).
+# `ebola_offspring` is not created in this chunk; it must already
+# exist in the session, with "mu" and "size" among its estimates.
+proportion_cluster_size(
+  R = ebola_offspring$estimate["mu"],
+  k = ebola_offspring$estimate["size"],
+  cluster_size = c(5, 10, 25)
+)
+```
+
+The probability of having clusters of five people is 1.8%. At this stage, given these offspring distribution parameters, a backward strategy may not increase the probability of containing and quarantining more onward cases.
+
+::::::::::::::::
+
+:::::::::::::::::::::::::::::::::
+
+## Challenges
+
+::::::::::::::::::::::::: challenge
+
+### Does Ebola have any superspreading event?
+
+'Superspreading events' can mean different things to different people, so [Lloyd-Smith et al., 2005](https://www.nature.com/articles/nature04153) proposed a general protocol for defining a superspreading event (SSE). If the number of secondary infections caused by each case, $Z$, follows a negative binomial distribution ($R, k$):
+
+- We define an SSE as any infected individual who infects more than $Z(n)$ others, where $Z(n)$ is the nth percentile of the $Poisson(R)$ distribution. 
+- A 99th-percentile SSE is then any case causing more infections than would occur in 99% of infectious histories in a homogeneous population
+
+Using the corresponding distribution function, estimate the SSE threshold to define a SSE for the Ebola offspring distribution estimates for the `ebola_sim_clean` object from `{outbreaks}` package.
+
+::::::::::::::::: hint
+
+In a Poisson distribution, the **lambda** or **rate** parameter is equal to the estimated **mean** from a negative binomial distribution. You can explore this in [The distribution zoo](https://ben18785.shinyapps.io/distribution-zoo/) shiny app.
+
+:::::::::::::::::
+
+::::::::::::::::: solution
+
+To get the quantile value for the 99th-percentile we need to use the [quantile function](https://sakai.unc.edu/access/content/group/3d1eb92e-7848-4f55-90c3-7c72a54e7e43/public/docs/lectures/lecture13.htm#probfunc) for the Poisson distribution, `qpois()`.
+
+```{r}
+# Mean ("mu") of the Ebola offspring distribution estimates,
+# used below as the Poisson rate (lambda)
+ebola_mu_mid <- ebola_offspring$estimate["mu"]
+
+# SSE threshold: 99th percentile of a Poisson distribution
+# with lambda equal to mu, obtained with the quantile function qpois()
+qpois(
+  p = 0.99,
+  lambda = ebola_mu_mid
+)
+```
+
+Compare this value with the ones reported by [Lloyd-Smith et al., 2005](https://www.nature.com/articles/nature04153). See figure below:
+
+![Reported superspreading events (SSEs; diamonds) relative to estimated reproduction number R (squares) for twelve directly transmitted infections. Lines show 5–95 percentile range of the number of secondary cases following a Poisson distribution with lambda equal to the reproduction number ($Z∼Poisson(R)$), and crosses show the 99th-percentile proposed as threshold for SSEs. Stars represent SSEs caused by more than one source case. ‘Other’ diseases are: 1, Streptococcus group A; 2, Lassa fever; 3, Mycoplasma pneumonia; 4, pneumonic plague; 5, tuberculosis. R is not shown for ‘other’ diseases, and is off-scale for monkeypox.](fig/SEE-individual-reproductive-number-fig-d.png)
+
+:::::::::::::::::
+
+::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::: challenge
+
+### Expected proportion of transmission
+
+What is the proportion of cases responsible for 80% of transmission?
+
+Use `{superspreading}` and compare the estimates for **MERS** using the offspring distribution parameters from this tutorial episode, with the **SARS-1** and **Ebola** offspring distribution parameters accessible via the `{epiparameter}` R package.
+
+::::::::::::::::::::: hint
+
+To use `superspreading::proportion_transmission()` we recommend reading the [Estimate what proportion of cases cause a certain proportion of transmission](https://epiverse-trace.github.io/superspreading/reference/proportion_transmission.html) reference manual.
+
+```{r,message=FALSE,warning=FALSE,echo=FALSE}
+library(epiparameter)
+
+# Comma-separated list of the diseases that have an
+# offspring distribution stored in {epiparameter}
+epidist_string <- epidist_db(epi_dist = "offspring distribution") %>%
+  list_distributions() %>%
+  # one row per disease, keeping only that column
+  distinct(disease) %>%
+  pull(disease) %>%
+  paste(collapse = ", ")
+```
+
+Currently, `{epiparameter}` has offspring distributions for `r epidist_string`. Let's access the offspring distribution `mean` and `dispersion` parameters for SARS-1:
+
+```{r,message=FALSE,warning=FALSE}
+# Load the SARS offspring distribution from {epiparameter};
+# single_epidist = TRUE asks for one entry only
+sars <- epiparameter::epidist_db(
+  disease = "SARS",
+  epi_dist = "offspring distribution",
+  single_epidist = TRUE
+)
+# Extract its parameters (the solution reads "mean" and "dispersion")
+sars_params <- epiparameter::get_parameters(sars)
+# Print the parameters
+sars_params
+```
+
+:::::::::::::::::::::
+
+::::::::::::::::::::: solution
+
+```{r,message=FALSE,warning=FALSE}
+# Estimate for Ebola ----
+
+# Load the Ebola offspring distribution from {epiparameter}
+ebola_epiparameter <- epiparameter::epidist_db(
+  disease = "Ebola",
+  epi_dist = "offspring distribution",
+  single_epidist = TRUE
+)
+# Extract and print its parameters
+ebola_params <- epiparameter::get_parameters(ebola_epiparameter)
+ebola_params
+
+# estimate
+# proportion of cases that
+# generate 80% of transmission
+proportion_transmission(
+  R = ebola_params[["mean"]],
+  k = ebola_params[["dispersion"]],
+  percent_transmission = 0.8
+)
+
+# Estimate for SARS ----
+# (`sars_params` is not created in this chunk; it must already exist)
+
+# estimate
+# proportion of cases that
+# generate 80% of transmission
+proportion_transmission(
+  R = sars_params[["mean"]],
+  k = sars_params[["dispersion"]],
+  percent_transmission = 0.8
+)
+
+
+# Estimate for MERS ----
+# (`offspring_fit` is not created in this chunk; it must already exist)
+
+# estimate
+# proportion of cases that
+# generate 80% of transmission
+proportion_transmission(
+  R = offspring_fit$estimate["mu"],
+  k = offspring_fit$estimate["size"],
+  percent_transmission = 0.8
+)
+```
+
+MERS has the lowest percent of cases (2.1%) responsible for 80% of the transmission, representative of highly overdispersed offspring distributions.
+
+Ebola has the highest percent of cases (43%) responsible for 80% of the transmission. This is representative of offspring distributions with high dispersion parameters.
+
+:::::::::::::::::::::
+
+:::::::::::::::::::::::::::::
+
+::::::::::::::::: callout
+
+### inverse-dispersion?
+
+The dispersion parameter $k$ can be expressed differently across the literature. 
+
+- In the Wikipedia page for the negative binomial, this parameter is defined in its reciprocal form (refer to the [variance equation](https://en.wikipedia.org/wiki/Negative_binomial_distribution)). 
+- In [the distribution zoo](https://ben18785.shinyapps.io/distribution-zoo/) shiny app, the dispersion parameter $k$ is named "Inverse-dispersion" but it is equal to the parameter estimated in this episode. We invite you to explore this!
+
+:::::::::::::::::
+
+:::::::::::::::::::::::::::: callout
+
+### heterogeneity?
+
+The individual-level variation in transmission is also referred to as the heterogeneity in transmission or degree of heterogeneity in [Lloyd-Smith et al., 2005](https://www.nature.com/articles/nature04153), and as heterogeneous infectiousness in [Campbell et al., 2018](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-018-2330-z) when introducing the `{outbreaker2}` package. Similarly, a contact network can store heterogeneous epidemiological contacts as in the documentation of the `{epicontacts}` package ([Nagraj et al., 2018](https://www.repidemicsconsortium.org/epicontacts/articles/epicontacts.html)).
+
+::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::: testimonial
+
+### Read these blog posts
+
+The [Tracing monkeypox](https://plus.maths.org/content/monkeypox) article from the [JUNIPER](https://maths.org/juniper/) consortium showcases the usefulness of network models on contact tracing.
+
+The [Going viral](https://kucharski.substack.com/p/going-viral) post from Adam Kucharski shares insights from YouTube virality, disease outbreaks, and marketing campaigns on the conditions that spark contagion online.
+
+:::::::::::::::::::::::::::::
+
+
+::::::::::::::::::::::::::::::::::::: keypoints 
+
+- Use `{epicontacts}` to calculate the number of secondary cases caused by a particular individual from linelist and contact data.
+- Use `{fitdistrplus}` to empirically estimate the offspring distribution from the number of secondary cases distribution.
+- Use `{superspreading}` to estimate the probability of having clusters of a given size from primary cases and inform contact tracing efforts.
+
+::::::::::::::::::::::::::::::::::::::::::::::::
+
diff --git a/episodes/superspreading-simulate.Rmd b/episodes/superspreading-simulate.Rmd
new file mode 100644
index 00000000..7a94c077
--- /dev/null
+++ b/episodes/superspreading-simulate.Rmd
@@ -0,0 +1,1029 @@
+---
+title: 'Simulate transmission chains'
+teaching: 30
+exercises: 2
+---
+
+:::::::::::::::::::::::::::::::::::::: questions 
+
+- How can we simulate transmission chains based on infection characteristics?
+
+::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::: objectives
+
+- Create a short term projection using a branching process with `{epichains}`.
+
+::::::::::::::::::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::: prereq
+
+## Prerequisites
+
+Learners should familiarise themselves with the following concept dependencies before working through this tutorial: 
+
+**Statistics**: Common probability distributions, including Poisson and negative binomial.
+
+**Epidemic theory**: The reproduction number, $R$.
+
+:::::::::::::::::::::::::::::::::
+
+## Introduction
+
+<!-- what we know -->
+
+Individual variation in transmission can affect both the potential for an epidemic to establish in a population and the ease of control ([Cori et al., 2017](https://royalsocietypublishing.org/doi/10.1098/rstb.2016.0371)). 
+
++ Greater variation reduces the overall probability of a single new case causing a large local outbreak, because most cases infect few others and individuals that generate a large number of secondary cases are relatively rare.
+
++ However, if a 'superspreading event' does occur and the outbreak gets established, this variation can make an outbreak harder to control using *mass interventions* (i.e. blanket interventions that implicitly assume everyone contributes equally to transmission), because some cases contribute disproportionately: a single uncontrolled case may generate a large number of secondary cases.
+
++ Conversely, variation in transmission may provide opportunities for *targeted interventions* if the individuals who contribute more to transmission (due to biological or behavioural factors), or the settings in which 'superspreading events' occur, share socio-demographic, environmental or geographical characteristics that can be defined.
+
+<!-- what we dont know -->
+
+How can we quantify the potential of a new infection to cause a large outbreak based on its reproduction number $R$ and the dispersion $k$ of its offspring distribution?
+
+```{r,echo=FALSE,warning=FALSE,message=FALSE,fig.cap="Observed number of cumulative cases from the Middle East respiratory syndrome (MERS) outbreak in South Korea, 2015, alongside with simulated transmission chains assuming an offspring distribution with $R=0.6$ and $k=0.02$."}
+library(epichains)
+library(epiparameter)
+library(tidyverse)
+library(outbreaks)
+
+# Observed cumulative MERS cases by onset date, with a running day index
+mers_cumcases <- mers_korea_2015$linelist %>%
+  # incidence2 workflow
+  incidence2::incidence(date_index = "dt_onset") %>%
+  incidence2::complete_dates() %>%
+  # wrangling using {dplyr}
+  mutate(count_cumsum = cumsum(count)) %>%
+  # row names become the day counter
+  # (assumes default 1..n row names -- TODO confirm)
+  rownames_to_column(var = "day") %>%
+  mutate(day = as.numeric(day))
+
+# offspring distribution
+mers_offspring <- c(mean = 0.60, dispersion = 0.02)
+
+# generation time
+# (the MERS serial interval from {epiparameter} is used as its input)
+serial_interval <- epidist_db(
+  disease = "mers",
+  epi_dist = "serial",
+  single_epidist = TRUE
+)
+
+# Set seed for random number generator
+set.seed(33)
+# Number of simulation runs
+number_chains <- 1000
+# Number of initial cases
+initial_cases <- 1
+
+# Run simulate_chains() once per chain ID and stack the results
+simulated_chains_map <-
+  # iterate one function across multiple numbers (chain IDs)
+  map(
+    # vector of numbers (chain IDs)
+    .x = seq_len(number_chains),
+    # function to iterate to each chain ID number
+    .f = function(sim) {
+      simulate_chains(
+        # simulation controls
+        index_cases = initial_cases,
+        statistic = "size",
+        # offspring
+        offspring_dist = rnbinom,
+        mu = mers_offspring["mean"],
+        size = mers_offspring["dispersion"],
+        # generation
+        generation_time = function(x) generate(x = serial_interval, times = x)
+      ) %>%
+        # creates a column with the chain ID number
+        mutate(chain_id = sim)
+    }
+  ) %>%
+  # combine list outputs (for each chain ID) into a single data frame
+  list_rbind()
+
+# Daily case counts per simulated chain ----
+simulated_chains_day <- simulated_chains_map %>%
+  # work with the data frame behind the <epichains> object
+  as_tibble() %>%
+  mutate(
+    # chain ID as a categorical variable
+    chain_id = as_factor(chain_id),
+    # infection time rounded up to a whole day
+    day = ceiling(time)
+  ) %>%
+  # cases per chain and day
+  count(chain_id, day, name = "cases") %>%
+  # running total of cases within each chain
+  group_by(chain_id) %>%
+  mutate(cases_cumsum = cumsum(cases)) %>%
+  ungroup()
+
+# Duration (last day) and size (final cumulative count) per chain ----
+sim_chains_max <- simulated_chains_day %>%
+  group_by(chain_id) %>%
+  summarise(
+    day_max = max(day),
+    cases_total = max(cases_cumsum),
+    .groups = "drop"
+  )
+
+# Tag each data set so colour separates observed from simulated ----
+mers_cumcases_type <- mutate(mers_cumcases, type = "Observed")
+simulated_chains_day_type <- mutate(simulated_chains_day, type = "Simulated")
+
+ggplot() +
+  # one line per simulated chain
+  geom_line(
+    data = simulated_chains_day_type,
+    mapping = aes(x = day, y = cases_cumsum, group = chain_id, colour = type)
+  ) +
+  # dashed horizontal reference at 100 cumulative cases
+  geom_hline(aes(yintercept = 100), linetype = 2) +
+  # thicker line for the observed outbreak
+  geom_line(
+    data = mers_cumcases_type,
+    mapping = aes(x = day, y = count_cumsum, colour = type),
+    linewidth = 1.5
+  ) +
+  labs(
+    x = "Day since first report",
+    y = "Cumulative cases",
+    colour = "Type"
+  )
+```
+
+
+<!-- what we are going to do -->
+
+In this episode, we will use the `{epichains}` package to simulate transmission chains and estimate the potential for large outbreaks following the introduction of a new case. We are going to use it with functions from `{epiparameter}`, `{dplyr}` and `{purrr}`, so also loading the `{tidyverse}` package:
+
+```{r,message=FALSE,warning=FALSE}
+library(epichains)
+library(epiparameter)
+library(tidyverse)
+```
+
+::::::::::::::::::: checklist
+
+### The double-colon
+
+The double-colon `::` in R lets you call a specific function from a package without loading the entire package into the current environment. 
+
+For example, `dplyr::filter(data, condition)` uses `filter()` from the `{dplyr}` package.
+
+This helps us remember package functions and avoid namespace conflicts.
+
+:::::::::::::::::::
+
+## Simulation of uncontrolled outbreaks
+
+Infectious disease epidemics spread through populations when a chain of infected individuals transmit the infection to others. [Branching processes](https://epiverse-trace.github.io/epichains/articles/theoretical_background.html) can be used to model this transmission. A branching process is a stochastic process (i.e. a random process that can be described by a known probability distribution), where each infectious individual gives rise to a random number of individuals in the next generation of infection, starting with the index case in generation 1. The distribution of the number of secondary cases each individual generates is called the offspring distribution ([Azam & Funk, 2024](https://epiverse-trace.github.io/epichains/articles/epichains.html)).
+
+`{epichains}` provides methods to analyse and simulate the *size* and *length* of branching processes with a given offspring distribution. `{epichains}` implements a rapid and simple model to simulate transmission chains to assess epidemic risk, project cases into the future, and evaluate interventions that change $R$.
+
+::::::::::::::::::: discussion
+
+### chain size and length
+
+- The **size** of the transmission chain is defined as the total number of individuals infected across all generations of infection, and 
+
+- the **length** of the transmission chain is the number of generations from the first case to the last case in the outbreak before the chain ended. 
+
+The *size* calculation includes the first case, and the *length* calculation contains the first generation when the first case starts the chain (See figure below).
+
+![**An example of a transmission chain starting with a single case C1.** Cases are represented by blue circles and arrows indicating who infected whom. The chain grows through generations Gen 1, Gen 2, and Gen 3, producing cases C2, C3, C4, C5, and C6. The chain ends at generation Gen 3 with cases C4, C5, and C6. The size of C1’s chain is 6, including C1 (that is, the sum of all blue circles), and the length is 3, which includes Gen 1 (maximum number of generations reached by C1’s chain) ([Azam & Funk, 2024](https://epiverse-trace.github.io/epichains/articles/epichains.html)).](https://raw.githubusercontent.com/epiverse-trace/epichains/main/vignettes/img/transmission_chain_example.png)
+
+::::::::::::::::::::
+
+To use `{epichains}`, we need to know (or assume) two key epidemiological values: the offspring distribution and the generation time.
+
+## Get the offspring distribution
+
+Here we assume the MERS offspring distribution follows a negative binomial distribution, with mean (reproduction number $R$) and dispersion $k$ values estimated from the linelist and contact data of `mers_korea_2015` in the `{outbreaks}` R package in the previous episode.
+
+```{r}
+# Assumed MERS offspring distribution (negative binomial):
+# mean = reproduction number R, dispersion = k
+mers_offspring <- c(mean = 0.60, dispersion = 0.02)
+```
+
+:::::::::::::::::::::::::::: callout
+
+### offspring distribution for epichains
+
+We input an offspring distribution to `{epichains}` by referring to the R function that generates random values from the distribution we want. For a negative binomial distribution, we use `rnbinom` with its corresponding `mu` and `size` arguments:
+
+```r
+  offspring_dist = rnbinom,
+  mu = mers_offspring["mean"],
+  size = mers_offspring["dispersion"],
+```
+
+The reference manual in `?rnbinom` tells us our required specific arguments.
+
+::::::::::::::::::::::::::::
+
+:::::::::::::::::::::::::: spoiler
+
+### Poisson and other distributions
+
+`{epichains}` can accept any R function that generates random numbers, so the specified arguments will change depending on the R function used. For more details on the range of possible options, see the function reference manual.
+
+For example, let's say we want to use a Poisson distribution for the offspring distribution. First, read the argument required in the `?rpois` reference manual. Second, specify the `lambda` argument parameter, also known as rate or mean in the literature. In `{epichains}`, this can look like this:
+
+```r
+  offspring_dist = rpois,
+  lambda = mers_offspring["mean"],
+```
+
+In this example, we can specify `lambda = mers_offspring["mean"]` because the mean number of secondary cases generated (i.e. $R$) should be the same regardless of the distribution we assume. What changes is the variance of the distribution, and hence the level of individual-level variation in transmission. When the dispersion parameter $k$ approaches infinity ($k \rightarrow \infty$) in a negative binomial distribution, the variance equals the mean. This makes the conventional Poisson distribution a special case of the negative binomial distribution.
+
+::::::::::::::::::::::::::
+
+## Get generation time
+
+The [serial interval](../learners/reference.md#serialinterval) distribution is often used to approximate the generation time distribution. This approximation is commonly used because it is easier to observe and measure the onset of symptoms in each case than the precise time of infection.
+
+:::::::::::::::::::::::::::::::: spoiler
+
+### generation time vs serial interval
+
+![A schematic of the relationship of different time periods of transmission between an infector and an infectee in a transmission pair. Exposure window is defined as the time interval having viral exposure, and transmission window is defined as the time interval for onward transmission with respect to the infection time ([Chung Lau et al., 2021](https://academic.oup.com/jid/article/224/10/1664/6356465)).](fig/serial-interval-observed.jpeg)
+
+However, using the *serial interval* as an approximation of the *generation time* is primarily valid for diseases in which infectiousness starts after symptom onset ([Chung Lau et al., 2021](https://academic.oup.com/jid/article/224/10/1664/6356465)). In cases where infectiousness starts before symptom onset, the serial intervals can have negative values, which is the case for diseases with pre-symptomatic transmission ([Nishiura et al., 2020](https://www.ijidonline.com/article/S1201-9712(20)30119-3/fulltext#gr2)).
+
+::::::::::::::::::::::::::::::::
+
+Let's use the `{epiparameter}` package to access and use the available serial interval for MERS disease:
+
+```{r,message=FALSE,warning=FALSE}
+# Access the MERS serial interval stored in {epiparameter};
+# single_epidist = TRUE asks for one entry only
+serial_interval <- epidist_db(
+  disease = "mers",
+  epi_dist = "serial",
+  single_epidist = TRUE
+)
+
+# Plot the distribution over days 0 to 25
+plot(serial_interval, day_range = 0:25)
+```
+
+```{r,echo=FALSE}
+# Mean and standard deviation of the MERS serial interval,
+# reduced to a single row for the inline text
+serial_summary <- as_tibble(serial_interval$summary_stats) %>%
+  distinct(mean, sd)
+```
+
+The serial interval for MERS has a mean of `r serial_summary$mean` days and a standard deviation of `r serial_summary$sd` days.
+
+:::::::::::::::::::::::::::: callout
+
+### generation time for epichains
+
+In `{epichains}`, we need to specify the generation time as a function that generates random numbers. Using `{epiparameter}` has the advantage of using the distribution function `epiparameter::generate()` for this input. This will look like this:
+
+```r
+function(x) generate(x = serial_interval, times = x)
+```
+
+This interface is similar to the one `{cfr}` uses to link with `{epiparameter}`. Read the [work with delay distributions](https://epiverse-trace.github.io/cfr/articles/delay_distributions.html) vignette for further context.
+
+::::::::::::::::::::::::::::
+
+## Simulate a single chain
+
+Now we are prepared to use the `simulate_chains()` function from `{epichains}` to create **one** transmission chain:
+
+```{r,message=FALSE,warning=FALSE,eval=FALSE}
+# Simulate transmission chains from the MERS inputs defined above
+simulate_chains(
+  # simulation controls
+  # (number of index cases, and the chain statistic to track)
+  index_cases = 5,
+  statistic = "size",
+  # offspring
+  # (negative binomial random generator with its mu and size arguments)
+  offspring_dist = rnbinom,
+  mu = mers_offspring["mean"],
+  size = mers_offspring["dispersion"],
+  # generation
+  # (function that draws `x` random values from the serial interval)
+  generation_time = function(x) generate(x = serial_interval, times = x)
+)
+```
+
+`simulate_chains()` requires three sets of arguments as a minimum:
+
+- simulation controls,
+- offspring distribution, and
+- generation time.
+
+In the lines above, we described how to specify the offspring distribution and generation time. The **simulation controls** include at least two arguments:
+
+- `index_cases`, which defines the number of index cases to simulate transmission chains for and
+- `statistic`, which defines a chain statistic to track (either `"size"` or `"length"`) as the stopping criteria for each chain being simulated.
+
+::::::::::::::::::::::::::: callout
+
+### Stopping criteria
+
+This is a customisable feature of `{epichains}`. Branching process simulations end when they have gone extinct. No more offspring are being produced because of some stopping criterion.
+
+:::::::::::::::::::::::::::
+
+The `simulate_chains()` output creates a `<epichains>` class object, which we can then analyse further in R.
+
+## Simulate multiple chains
+
+We can use `simulate_chains()` to create multiple chains and increase the probability of simulating uncontrolled outbreak projections given an overdispersed offspring distribution.
+
+We need to specify three additional elements:
+
+- `set.seed(<integer>)`, which is a random number generator function with a specified seed value, the `<integer>` number, to ensure consistent results across different runs of the code.
+- `number_chains`, which defines the number of simulations to run.
+- `initial_cases` defines the number of initial cases to input to the `index_cases` argument explained in the lines above.
+
+```{r}
+# Set seed for random number generator
+# (ensures consistent results across different runs of the code)
+set.seed(33)
+# Number of simulation runs
+# (stored in an object for reuse later in the workflow)
+number_chains <- 1000
+# Number of initial cases
+# (passed to the `index_cases` argument of simulate_chains())
+initial_cases <- 1
+```
+
+`number_chains` and `initial_cases` are conveniently stored in objects to facilitate downstream reuse in the workflow.
+
+:::::::::::::::::::::::::::::: checklist
+
+### Iteration using purrr
+
+[Iteration](https://r4ds.hadley.nz/iteration.html) aims to perform the same action on different objects repeatedly.
+
+Learn how to use the core `{purrr}` functions like `map()` from the YouTube tutorial on [How to purrr](https://www.youtube.com/watch?v=nd-Y8b22YaQ) by Equitable Equations.
+
+Or, if you previously used the `*apply` family of functions, visit the package vignette on [purrr base R](https://purrr.tidyverse.org/articles/base.html), which shares key differences, direct translations, and examples.
+
+::::::::::::::::::::::::::::::
+
+To get multiple chains, we must apply the `simulate_chains()` function to each chain defined by a sequence of numbers from 1 to `r number_chains`.
+
+::::::::::::::::::::::::::::::: callout
+
+### purrr and epichains
+
+First, let's sketch how we use `purrr::map()` with `epichains::simulate_chains()`. The `map()` function requires two arguments:
+
+- `.x`, with a vector of numbers, and
+- `.f`, a function to iterate to each vector value.
+
+```r
+map(
+  # vector of numbers (chain IDs)
+  .x = seq_len(number_chains),
+  # function to iterate to each chain ID number
+  .f = function(sim) {
+    simulate_chains(...) %>%
+      # creates a column with the chain ID number
+      mutate(chain_id = sim)
+  }
+) %>%
+  # combine list outputs (for each chain ID) into a single data frame
+  list_rbind()
+```
+
+The `sim` element is placed to register the iteration number (**chain ID**) as a new column in the `<epichains>` output. The `purrr::list_rbind()` function aims to combine all the list outputs from `map()`.
+
+**Why a dot (`.`) as a prefix?** In the [tidy design principles](https://design.tidyverse.org/dots-prefix.html) book we have a chapter on the dot prefix!
+
+:::::::::::::::::::::::::::::::
+
+Now, we are prepared to use `map()` to repeatedly call `simulate_chains()`, once for each chain ID from 1 to `r number_chains`, and combine the results into a single data frame:
+
+```{r}
+# Run simulate_chains() once per chain ID and stack the results;
+# each run is labelled with its chain ID in the `chain_id` column
+simulated_chains_map <-
+  # iterate one function across multiple numbers (chain IDs)
+  map(
+    # vector of numbers (chain IDs)
+    .x = seq_len(number_chains),
+    # function to iterate to each chain ID number
+    .f = function(sim) {
+      simulate_chains(
+        # simulation controls
+        index_cases = initial_cases,
+        statistic = "size",
+        # offspring
+        offspring_dist = rnbinom,
+        mu = mers_offspring["mean"],
+        size = mers_offspring["dispersion"],
+        # generation
+        generation_time = function(x) generate(x = serial_interval, times = x)
+      ) %>%
+        # creates a column with the chain ID number
+        mutate(chain_id = sim)
+    }
+  ) %>%
+  # combine list outputs (for each chain ID) into a single data frame
+  list_rbind()
+```
+
+```{r,echo=FALSE,eval=FALSE}
+# number of infectees (rows) in each simulated chain, largest first
+count(as_tibble(simulated_chains_map), chain_id, sort = TRUE)
+```
+
+::::::::::::::::::::::::::::::::: discussion
+
+### Read the epichains output
+
+```{r,echo=FALSE}
+#### pick a small chain with several infectors to showcase -------------------
+
+# chains with 3 or 4 distinct values in `infector_id` (NA included)
+chains_subgroup <- simulated_chains_map %>%
+  as_tibble() %>%
+  distinct(chain_id, infector_id) %>%
+  count(chain_id) %>%
+  filter(n >= 3, n < 5) %>%
+  pull(chain_id)
+
+# among those, keep chains with fewer than 10 rows, take the largest,
+# and resolve ties by the lowest chain ID
+chain_to_observe <- simulated_chains_map %>%
+  as_tibble() %>%
+  count(chain_id) %>%
+  filter(chain_id %in% chains_subgroup, n < 10) %>%
+  filter(n == max(n)) %>%
+  slice_min(chain_id) %>%
+  pull(chain_id)
+```
+
+To explore the output format of the `<epichains>` class object of name `simulated_chains_map`, let's look at the simulated `chain_id` number `r chain_to_observe`. 
+
+:::::::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::: solution
+
+### The epichains object
+
+Let's use `dplyr::filter()` for this:
+
+```r
+chain_to_observe <- 806
+```
+
+```{r}
+#### get epichain summary ----------------------------------------------------
+# keep only the rows of the chain stored in `chain_to_observe`
+simulated_chains_map %>%
+  filter(chain_id == chain_to_observe)
+```
+
+Key elements from this output are in the footer, the piece of text that appears at the bottom:
+
+```output
+Number of infectors (known): 3
+Number of generations: 3
+```
+
+The simulated `chain_id` number `r chain_to_observe` has three known infectors and three generations. These numbers are more visible when reading the `<epichains>` objects as a data frame.
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::: solution
+
+### The epichains data frame
+
+```{r}
+#### infector-infectee data frame --------------------------------------------
+# same subset of rows, converted to a tibble to read it as a data frame
+simulated_chains_map %>%
+  filter(chain_id == chain_to_observe) %>%
+  as_tibble()
+```
+
+Chain `r chain_to_observe` tells us a **story**: "In the first transmission generation at `time = 0`, one index case infected the first case with `sim_id = 1`. Then, in the second transmission generation (between `time` 10 and 16), `sim_id = 1` infected five cases. Later, in the third transmission generation (between `time` 26 and 30), `sim_id = 2` infected three new cases."
+
+:::::::::::::::::::::::::
+
+::::::::::::::::::::::::: solution
+
+### An infectee data frame
+
+The output data frame collects **infectees** as the observation unit: 
+
+- Each infectee has a `sim_id`. 
+- Each _infectee_ that behaved as an _infector_ is registered in the `infector_id` column using `sim_id` of that infectee. 
+- Each infectee got infected in a specific `generation` and (continuous) `time`. 
+- The simulation number is registered under the `chain_id` column.
+
+**Note:** The `Number of infectors (known)` includes the `NA` observation under the `infector_id` column. This refers to the infector specified as index case (in the `index_cases` argument), which started the transmission chain to the infectee of `sim_id = 1`, at `generation = 1`, and `time = 0`.
+
+:::::::::::::::::::::::::
+
+## Visualize multiple chains
+
+To visualize the simulated chains, we need some pre-processing:
+
+1. Let's use `{dplyr}` to round the infection times up to whole numbers, so that they resemble surveillance days.
+2. Count the daily cases in each simulation (by `chain_id`).
+3. Calculate the cumulative number of cases within a simulation.
+
+```{r}
+# cases per day and running total of cases, for every simulated chain
+simulated_chains_day <- simulated_chains_map %>%
+  as_tibble() %>%
+  # chain ID as a categorical variable; infection time rounded up to a day
+  mutate(
+    chain_id = as_factor(chain_id),
+    day = ceiling(time)
+  ) %>%
+  # number of cases on each day of each chain
+  count(chain_id, day, name = "cases") %>%
+  # running total of cases within each chain
+  group_by(chain_id) %>%
+  mutate(cases_cumsum = cumsum(cases)) %>%
+  ungroup()
+```
+
+Before the plot, let's create a summary table with the total time duration and size of each chain. We can use the `{dplyr}` "combo" of `group_by()`, `summarise()` and `ungroup()`:
+
+```{r}
+# One row per chain: how long it lasted and how many cases it produced
+sim_chains_max <- simulated_chains_day %>%
+  group_by(chain_id) %>%
+  summarise(
+    # duration: last day on which a case occurred
+    day_max = max(day),
+    # size: final value of the running case total
+    cases_total = max(cases_cumsum)
+  ) %>%
+  # completes the group_by() / summarise() / ungroup() pattern
+  ungroup()
+```
+
+Now, we are prepared for using the `{ggplot2}` package:
+
+```{r}
+# Cumulative case trajectories of all simulated chains
+ggplot() +
+  # one semi-transparent line per chain
+  geom_line(
+    data = simulated_chains_day,
+    mapping = aes(x = day, y = cases_cumsum, group = chain_id),
+    # constant appearance, set outside aes()
+    colour = "black",
+    alpha = 0.25,
+    show.legend = FALSE
+  ) +
+  # one point per chain, placed at its last day and final size
+  geom_point(
+    data = sim_chains_max,
+    mapping = aes(
+      x = day_max,
+      y = cases_total,
+      group = chain_id,
+      # chain_id is a factor, so every chain gets its own colour
+      colour = chain_id
+    ),
+    # no legend for the per-chain colours
+    show.legend = FALSE
+  ) +
+  # dashed horizontal line marking the 100-case threshold
+  geom_hline(mapping = aes(yintercept = 100), linetype = 2) +
+  # axis titles
+  labs(
+    x = "Day",
+    y = "Cumulative cases"
+  )
+```
+
+
+```{r,echo=FALSE,message=FALSE,warning=FALSE}
+# Summaries quoted in the text; none depends on row order, so no arrange()
+# chains that grew beyond 100 cases: their number and their percentage
+threshhold_summary <- sim_chains_max %>%
+  filter(cases_total > 100) %>%
+  count(name = "chains_theshold") %>%
+  mutate(
+    chains_number = number_chains,
+    chains_percentage = chains_theshold / chains_number * 100
+  )
+
+# median duration and size of the chains that lasted beyond day 0
+chains_extinct <- sim_chains_max %>%
+  filter(day_max > 0) %>%
+  summarise(
+    extinct_duration_median = median(day_max),
+    extinct_size_median = median(cases_total)
+  )
+# number of chains that never progressed past day 0
+chains_null <-
+  nrow(filter(sim_chains_max, day_max < 1))
+```
+
+Although most introductions of `r initial_cases` index case do not generate secondary cases (N = `r chains_null`) and most outbreaks rapidly become extinct (median duration of `r chains_extinct$extinct_duration_median` and median size of `r chains_extinct$extinct_size_median`), `r threshhold_summary$chains_theshold` epidemic trajectories among `r threshhold_summary$chains_number` simulations (`r threshhold_summary$chains_percentage`%) still reach more than 100 infected cases. This finding is particularly remarkable because the reproduction number $R$ is less than 1 (offspring distribution mean of `r mers_offspring[["mean"]]`), but, given an offspring distribution dispersion parameter of `r mers_offspring[["dispersion"]]`, it shows the potential for explosive outbreaks of MERS disease.
+
+::::::::::::::::::::::::::::::::::: spoiler
+
+### Observed cases vs simulated chains
+
+Let's overlay the cumulative number of observed cases using the linelist object from the `mers_korea_2015` dataset of the `{outbreaks}` R package. To prepare the dataset so we can plot daily total cases over time, we use `{incidence2}` to convert the linelist to an `<incidence2>` object and complete the missing dates of the time series with `complete_dates()`.
+
+```{r,warning=FALSE,message=FALSE}
+library(outbreaks)
+
+mers_cumcases <- mers_korea_2015$linelist %>%
+  # {incidence2}: count cases by `dt_onset`, then fill in the missing dates
+  incidence2::incidence(date_index = "dt_onset") %>%
+  incidence2::complete_dates() %>%
+  # cumulative count, plus the row names converted to a numeric `day` column
+  mutate(count_cumsum = cumsum(count)) %>%
+  rownames_to_column(var = "day") %>%
+  mutate(day = as.numeric(day))
+```
+
+Use `plot()` to make an incidence plot:
+
+```{r}
+# draw `mers_cumcases` with the plot() method available for its class
+plot(mers_cumcases)
+```
+
+:::::::::::::::::::::::::::::::::::
+
+When plotting the observed number of cumulative cases from the Middle East respiratory syndrome (MERS) outbreak in South Korea in 2015 alongside the previously simulated chains, we see that the observed cases followed a trajectory that is consistent with the simulated explosive outbreak dynamics (which makes sense, given the simulation uses parameters based on this specific outbreak).
+
+```{r,echo=FALSE}
+# Observed outbreak overlaid on the simulated transmission chains
+# tag each data set so that colour (and the legend) tells them apart
+mers_cumcases_type <- mutate(mers_cumcases, type = "Observed")
+simulated_chains_day_type <- mutate(simulated_chains_day, type = "Simulated")
+
+ggplot() +
+  # simulated trajectories: one line per chain
+  geom_line(
+    data = simulated_chains_day_type,
+    mapping = aes(
+      x = day,
+      y = cases_cumsum,
+      group = chain_id,
+      colour = type
+    )
+  ) +
+  # dashed horizontal line marking the 100-case threshold
+  geom_hline(mapping = aes(yintercept = 100), linetype = 2) +
+  # observed cumulative cases, drawn thicker and on top
+  geom_line(
+    data = mers_cumcases_type,
+    mapping = aes(
+      x = day,
+      y = count_cumsum,
+      colour = type
+    ),
+    linewidth = 1.5
+  ) +
+  labs(
+    x = "Day since first report",
+    y = "Cumulative cases",
+    colour = "Type"
+  )
+```
+
+When we increase the dispersion parameter from $k = 0.01$ to $k = \infty$ - and hence reduce individual-level variation in transmission - and assume a fixed reproduction number $R = 1.5$, the proportion of simulated outbreaks that reached the 100-case threshold increases. This is because the simulated outbreaks now have more of a consistent, clockwork-like dynamic, rather than the high level of variability seen previously.
+
+![**Growth of simulated outbreaks with R = 1.5 and one initial case, conditional on non-extinction.** Boxes show the median and interquartile range (IQR) of the first disease generation with 100 cases; whiskers show the most extreme values within 1.5 × IQR of the boxes, and crosses show outliers. Percentages show the proportion of 10,000 simulated outbreaks that reached the 100-case threshold ([Lloyd-Smith et al., 2005](https://www.nature.com/articles/nature04153)).](fig/see-nature04153_Fig2-c.jpg)
+
+:::::::::::::::::::::: testimonial
+
+### Early spread projections
+
+In the epidemic's initial phase, you can use `{epichains}` to apply a branching process model to project the number of future cases. Even though the model accounts for randomness in transmission and variation in the number of secondary cases, there may be additional local features we have not considered. Analysis of early forecasts made for COVID in different countries using this model structure found that predictions were often overconfident ([Pearson et al., 2020](https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.18.2000543#validationofthemodel-1)). This is likely because the real-time model did not include all the changes in the offspring distribution that were happening at the local level as a result of behaviour change and control measures. You can read more about the importance of local context in COVID-19 models in [Eggo et al. (2020)](https://www.nature.com/articles/s43588-020-00014-7).
+
+We invite you to read the vignette on [Projecting infectious disease incidence: a COVID-19 example](https://epiverse-trace.github.io/epichains/articles/projecting_incidence.html) for more on making predictions using `{epichains}`!
+
+::::::::::::::::::::::
+
+## Challenges
+
+:::::::::::::::::::::::::: challenge
+
+### Monkeypox large outbreak potential
+
+Evaluate the potential for a new monkeypox (Mpox) case to generate an explosive large outbreak.
+
+- Simulate 1000 transmission chains with 1 initial case each.
+- Use the appropriate package to access delay data from previous outbreaks.
+- How many simulated trajectories reach more than 100 infected cases?
+
+:::::::::::::: hint
+
+With `{epiparameter}`, you can access and use offspring and delay distributions from previous outbreaks.
+
+```{r,warning=FALSE,message=FALSE}
+library(epiparameter)
+library(tidyverse)
+# number of offspring distribution entries stored for each disease
+epidist_db(epi_dist = "offspring") %>%
+  list_distributions() %>%
+  count(disease, epi_distribution)
+# number of serial interval entries stored for each disease
+epidist_db(epi_dist = "serial interval") %>%
+  list_distributions() %>%
+  count(disease, epi_distribution)
+```
+
+::::::::::::::
+
+:::::::::::::: solution
+
+```{r,message=FALSE,warning=FALSE}
+# load packages -----------------------------------------------------------
+
+library(epiparameter)
+library(epichains)
+library(tidyverse)
+
+# epidemiological parameters ----------------------------------------------
+
+# offspring distribution for mpox: one <epidist> entry from the library
+mpox_offspring_epidist <- epidist_db(
+  disease = "mpox",
+  epi_dist = "offspring",
+  single_epidist = TRUE
+)
+# its parameters, indexed further down by "mean" and "dispersion"
+mpox_offspring <- get_parameters(mpox_offspring_epidist)
+
+# serial interval for mpox, used further down as the generation time
+mpox_serialint <- epidist_db(
+  disease = "mpox", epi_dist = "serial interval", single_epidist = TRUE
+)
+
+# iterate -----------------------------------------------------------------
+
+# Fix the random seed so that the simulated chains are reproducible
+set.seed(33)
+# How many chains to simulate
+number_chains <- 1000
+# How many index cases start each chain
+initial_cases <- 1
+
+simulated_chains_mpox <-
+  # run the same simulation once for every chain ID
+  map(
+    # chain IDs: 1, 2, ..., number_chains
+    .x = seq_len(number_chains),
+    # simulation applied to each chain ID
+    .f = function(chain_number) {
+      one_chain <- simulate_chains(
+        # simulation controls
+        index_cases = initial_cases,
+        statistic = "size",
+        # offspring distribution and its parameters
+        offspring_dist = rnbinom,
+        mu = mpox_offspring["mean"],
+        size = mpox_offspring["dispersion"],
+        # generation time sampled from the serial interval
+        generation_time = function(x) generate(x = mpox_serialint, times = x)
+      )
+      # label every row with the ID of the chain it belongs to
+      mutate(one_chain, chain_id = chain_number)
+    }
+  ) %>%
+  # stack the per-chain outputs into a single data frame
+  list_rbind()
+
+# visualize ---------------------------------------------------------------
+
+# cases per day and running total of cases, for every simulated chain
+simulated_chains_mpox_day <- simulated_chains_mpox %>%
+  as_tibble() %>%
+  # chain ID as a categorical variable; infection time rounded up to a day
+  mutate(
+    chain_id = as_factor(chain_id),
+    day = ceiling(time)
+  ) %>%
+  # number of cases on each day of each chain
+  count(chain_id, day, name = "cases") %>%
+  # running total of cases within each chain
+  group_by(chain_id) %>%
+  mutate(cases_cumsum = cumsum(cases)) %>%
+  ungroup()
+
+# Cumulative case trajectories of all simulated Mpox chains,
+# compared against a threshold of 100 cases
+ggplot() +
+  # one semi-transparent line per chain
+  geom_line(
+    data = simulated_chains_mpox_day,
+    mapping = aes(x = day, y = cases_cumsum, group = chain_id),
+    # constant appearance, set outside aes()
+    colour = "black",
+    alpha = 0.25,
+    # nothing is mapped to a legend in this layer
+    show.legend = FALSE
+  ) +
+  # dashed horizontal line marking the 100-case threshold
+  geom_hline(mapping = aes(yintercept = 100), linetype = 2) +
+  # axis titles
+  labs(
+    x = "Day",
+    y = "Cumulative cases"
+  )
+```
+
+Assuming an Mpox outbreak with $R$ = 0.32 and $k$ = 0.58, no trajectory among the 1000 simulations reaches more than 100 infected cases, in contrast to the MERS simulations ($R$ = 0.6 and $k$ = 0.02).
+
+::::::::::::::
+
+::::::::::::::::::: hint
+
+### Epidemic Risk assessment accounting for superspreading
+
+With `{superspreading}`, you can get numerical solutions to processes that `{epichains}` solves using branching processes. We invite you to read the `{superspreading}` vignette on [Epidemic risk](https://epiverse-trace.github.io/superspreading/articles/epidemic_risk.html) and answer the following questions:
+
+- What is the probability that a newly introduced pathogen will cause a large outbreak?
+- What is the probability that an infection will, by chance, fail to establish following initial introduction(s)?
+- What is the probability the outbreak will be contained?
+
+Check how these estimates vary non-linearly with respect to the mean reproduction number $R$ and dispersion $k$ of a given disease.
+
+<!-- draft challenge -->
+
+<!-- Calculate the probability a new Mpox case will lead to a large outbreak in the absence of control measures. Use the appropriate package to access delay data from previous outbreaks. -->
+
+```{r,message=FALSE,warning=FALSE,echo=FALSE,eval=FALSE}
+library(superspreading)
+
+# Probability that a MERS outbreak started by one case is contained
+probability_contain(
+  R = mers_offspring["mean"],
+  k = mers_offspring["dispersion"],
+  num_init_infect = 1,
+  case_threshold = 100
+)
+
+# Probability of a large MERS outbreak
+# when 5 cases are imported independently
+probability_epidemic(
+  R = mers_offspring["mean"],
+  k = mers_offspring["dispersion"],
+  num_init_infect = 5
+)
+```
+
+```{r,message=FALSE,warning=FALSE,echo=FALSE,eval=FALSE}
+library(superspreading)
+
+# estimate probability to contain; R and k come from `mpox_offspring`
+probability_contain(
+  R = mpox_offspring["mean"],
+  k = mpox_offspring["dispersion"],
+  num_init_infect = 1,
+  case_threshold = 100
+)
+
+# Estimate the probability of a large outbreak
+# with 5 independent imported cases
+probability_epidemic(
+  R = mpox_offspring["mean"],
+  k = mpox_offspring["dispersion"],
+  num_init_infect = 5
+)
+```
+
+::::::::::::::::::
+
+:::::::::::::::::::::::::
+
+:::::::::::::::::::::::::: challenge
+
+### From a distribution of secondary cases
+
+[Christian Althaus, 2015](https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(15)70135-0/fulltext) reused data published by [Faye et al., 2015 (Figure 2)](https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(14)71075-8/fulltext#gr2) on the transmission tree of Ebola virus disease in Conakry, Guinea, 2014.
+
+Using the data under the **hint** tab, estimate the offspring distribution from the distribution of secondary cases. Then estimate the large outbreak potential from this data.
+
+::::::::::: hint
+
+Code with the transmission tree data written by [Christian Althaus, 2015](https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(15)70135-0/fulltext):
+
+```{r,message=FALSE,warning=FALSE}
+# Number of individuals in the trees
+n <- 152
+# Secondary cases caused by the individuals who infected at least one person
+c1 <- c(
+  1, 2, 2, 5, 14, 1, 4, 4, 1, 3, 3, 8, 2, 1, 1,
+  4, 9, 9, 1, 1, 17, 2, 1, 1, 1, 4, 3, 3, 4, 2,
+  5, 1, 2, 2, 1, 9, 1, 3, 1, 2, 1, 1, 2
+)
+# Secondary cases for all n individuals: the remaining ones infected nobody
+c0 <- c(c1, rep(0, times = n - length(c1)))
+# Histogram of the number of secondary cases per individual
+ggplot(enframe(c0), aes(x = value)) + geom_histogram()
+```
+
+:::::::::::
+
+::::::::::: solution
+
+```{r,message=FALSE,warning=FALSE}
+# load packages ---------------------------
+library(fitdistrplus)
+library(epiparameter)
+library(epichains)
+library(tidyverse)
+
+# fit a negative binomial distribution ------------------------------------
+
+# Offspring distribution: negative binomial fitted to the secondary cases
+fit.cases <- fitdist(data = c0, distr = "nbinom")
+# print the fit
+fit.cases
+
+# serial interval parameters ----------------------------------------------
+
+# one Ebola serial interval entry from the {epiparameter} library
+ebola_serialinter <- epidist_db(
+  disease = "ebola", epi_dist = "serial interval", single_epidist = TRUE
+)
+
+# simulate outbreak trajectories ------------------------------------------
+
+# Fix the random seed so that the simulated chains are reproducible
+set.seed(645)
+# How many chains to simulate
+number_chains <- 1e2
+# How many index cases start each chain
+initial_cases <- 1
+
+sim_multiple_chains <-
+  map(
+    .x = seq_len(number_chains),
+    .f = function(chain_number) {
+      one_chain <- simulate_chains(
+        # simulation controls
+        index_cases = initial_cases,
+        statistic = "size",
+        # offspring distribution with the fitted parameters
+        offspring_dist = rnbinom,
+        mu = fit.cases$estimate["mu"],
+        size = fit.cases$estimate["size"],
+        # generation time sampled from the serial interval
+        generation_time = function(x) generate(x = ebola_serialinter, times = x)
+      )
+      mutate(one_chain, simulation_n = chain_number)
+    }
+  ) %>%
+  # stack the per-simulation outputs into a single data frame
+  list_rbind()
+
+# visualize ----------------------------------------
+
+# cases per day and running total of cases, for every simulation
+sim_chains_aggregate <- as_tibble(sim_multiple_chains) %>%
+  # simulation number as a factor; infection time rounded up to a day
+  mutate(simulation_n = as_factor(simulation_n), day = ceiling(time)) %>%
+  # number of cases on each day of each simulation
+  count(simulation_n, day, name = "cases") %>%
+  group_by(simulation_n) %>%
+  mutate(cases_cumsum = cumsum(cases)) %>%
+  ungroup()
+
+# Cumulative case trajectories of all simulated Ebola chains,
+# compared against a threshold of 100 cases
+ggplot() +
+  # one line per simulation
+  geom_line(
+    data = sim_chains_aggregate,
+    mapping = aes(x = day, y = cases_cumsum, group = simulation_n),
+    # nothing is mapped to a legend in this layer
+    show.legend = FALSE
+  ) +
+  # dashed horizontal line marking the 100-case threshold
+  geom_hline(mapping = aes(yintercept = 100), linetype = 2)
+```
+
+
+Remarkably, even with a reproduction number less than 1 ($R$ = 0.95), we can have potentially explosive outbreaks. The observed variation in individual infectiousness in Ebola means that although the probability of extinction is high, new index cases also have the potential for explosive regrowth of the epidemic.
+
+:::::::::::
+
+:::::::::::::::::::::::::::::
+
+::::::::::::::::::::::::::::::::::::: keypoints 
+
+- Use `{epichains}` to simulate the large outbreak potential of diseases with overdispersed offspring distributions.
+
+::::::::::::::::::::::::::::::::::::::::::::::::
+
diff --git a/learners/setup.md b/learners/setup.md
index b2e982a8..a88922f6 100644
--- a/learners/setup.md
+++ b/learners/setup.md
@@ -213,7 +213,6 @@ library(EpiNow2)
 library(cfr)
 library(epiparameter)
 library(incidence2)
-library(covidregionaldata)
 library(outbreaks)
 library(tidyverse)
 ```
diff --git a/renv/profiles/lesson-requirements/renv.lock b/renv/profiles/lesson-requirements/renv.lock
index 77d08829..017591d6 100644
--- a/renv/profiles/lesson-requirements/renv.lock
+++ b/renv/profiles/lesson-requirements/renv.lock
@@ -289,6 +289,23 @@
       ],
       "Hash": "40415719b5a479b87949f3aa0aee737c"
     },
+    "bpmodels": {
+      "Package": "bpmodels",
+      "Version": "0.3.1",
+      "Source": "GitHub",
+      "RemoteType": "github",
+      "RemoteHost": "api.github.com",
+      "RemoteRepo": "bpmodels",
+      "RemoteUsername": "epiverse-trace",
+      "RemotePkgRef": "epiverse-trace/bpmodels",
+      "RemoteRef": "HEAD",
+      "RemoteSha": "07c054090f2b07367855b73a0cc2c43e24bc32d3",
+      "Requirements": [
+        "R",
+        "checkmate"
+      ],
+      "Hash": "4374941c8a9cf7a8991072466918c3b2"
+    },
     "broom": {
       "Package": "broom",
       "Version": "1.0.5",
@@ -513,6 +530,19 @@
       ],
       "Hash": "e8a1e41acf02548751f45c718d55aa6a"
     },
+    "crosstalk": {
+      "Package": "crosstalk",
+      "Version": "1.2.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R6",
+        "htmltools",
+        "jsonlite",
+        "lazyeval"
+      ],
+      "Hash": "ab12c7b080a57475248a30f4db6298c0"
+    },
     "curl": {
       "Package": "curl",
       "Version": "5.2.0",
@@ -676,6 +706,42 @@
       ],
       "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077"
     },
+    "epichains": {
+      "Package": "epichains",
+      "Version": "0.0.0.9999",
+      "Source": "GitHub",
+      "RemoteType": "github",
+      "RemoteHost": "api.github.com",
+      "RemoteRepo": "epichains",
+      "RemoteUsername": "epiverse-trace",
+      "RemotePkgRef": "epiverse-trace/epichains",
+      "RemoteRef": "HEAD",
+      "RemoteSha": "7b5b80b7b8531fba18e827636a45b0d064e73c76",
+      "Requirements": [
+        "R",
+        "checkmate",
+        "stats",
+        "utils"
+      ],
+      "Hash": "e4d58df3ff7111aa1c89356cad126c3b"
+    },
+    "epicontacts": {
+      "Package": "epicontacts",
+      "Version": "1.1.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "colorspace",
+        "dplyr",
+        "grDevices",
+        "igraph",
+        "methods",
+        "threejs",
+        "visNetwork"
+      ],
+      "Hash": "ce65f8ac65b26b2a6c17497a4cd5d3a0"
+    },
     "epiparameter": {
       "Package": "epiparameter",
       "Version": "0.0.0.9000",
@@ -747,6 +813,21 @@
       ],
       "Hash": "66fa5a16464666772f4929f8f5b2fc71"
     },
+    "fitdistrplus": {
+      "Package": "fitdistrplus",
+      "Version": "1.1-11",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "MASS",
+        "R",
+        "grDevices",
+        "methods",
+        "stats",
+        "survival"
+      ],
+      "Hash": "f40ef9686e85681a1ccbf33d9236aeb9"
+    },
     "fontawesome": {
       "Package": "fontawesome",
       "Version": "0.5.2",
@@ -1080,6 +1161,21 @@
       ],
       "Hash": "2d7b3857980e0e0d0a1fd6f11928ab0f"
     },
+    "htmlwidgets": {
+      "Package": "htmlwidgets",
+      "Version": "1.6.4",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "grDevices",
+        "htmltools",
+        "jsonlite",
+        "knitr",
+        "rmarkdown",
+        "yaml"
+      ],
+      "Hash": "04291cc45198225444a397606810ac37"
+    },
     "httr": {
       "Package": "httr",
       "Version": "1.4.7",
@@ -1106,6 +1202,29 @@
       ],
       "Hash": "99df65cfef20e525ed38c3d2577f7190"
     },
+    "igraph": {
+      "Package": "igraph",
+      "Version": "2.0.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "Matrix",
+        "R",
+        "cli",
+        "cpp11",
+        "grDevices",
+        "graphics",
+        "lifecycle",
+        "magrittr",
+        "methods",
+        "pkgconfig",
+        "rlang",
+        "stats",
+        "utils",
+        "vctrs"
+      ],
+      "Hash": "c3b7d801d722e26e4cd888e042bf9af5"
+    },
     "incidence2": {
       "Package": "incidence2",
       "Version": "2.2.3",
@@ -1215,6 +1334,16 @@
       ],
       "Hash": "7c5e89f04e72d6611c77451f6331a091"
     },
+    "lazyeval": {
+      "Package": "lazyeval",
+      "Version": "0.2.2",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R"
+      ],
+      "Hash": "d908914ae53b04d4c0c0fd72ecc35370"
+    },
     "lifecycle": {
       "Package": "lifecycle",
       "Version": "1.0.4",
@@ -1902,6 +2031,42 @@
       ],
       "Hash": "960e2ae9e09656611e0b8214ad543207"
     },
+    "superspreading": {
+      "Package": "superspreading",
+      "Version": "0.2.0.9000",
+      "Source": "GitHub",
+      "Remotes": "epiverse-trace/epiparameter, epiverse-trace/bpmodels",
+      "RemoteType": "github",
+      "RemoteHost": "api.github.com",
+      "RemoteRepo": "superspreading",
+      "RemoteUsername": "epiverse-trace",
+      "RemotePkgRef": "epiverse-trace/superspreading",
+      "RemoteRef": "HEAD",
+      "RemoteSha": "4549ff929acb90aee04dc3b66a7fb30503affbc5",
+      "Requirements": [
+        "bpmodels",
+        "checkmate",
+        "rlang",
+        "stats"
+      ],
+      "Hash": "b6902d57c67ae90e59f588c01319aea7"
+    },
+    "survival": {
+      "Package": "survival",
+      "Version": "3.5-7",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "Matrix",
+        "R",
+        "graphics",
+        "methods",
+        "splines",
+        "stats",
+        "utils"
+      ],
+      "Hash": "b8e943d262c3da0b0febd3e04517c197"
+    },
     "sys": {
       "Package": "sys",
       "Version": "3.4.2",
@@ -1932,6 +2097,22 @@
       ],
       "Hash": "997aac9ad649e0ef3b97f96cddd5622b"
     },
+    "threejs": {
+      "Package": "threejs",
+      "Version": "0.3.3",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "base64enc",
+        "crosstalk",
+        "htmlwidgets",
+        "igraph",
+        "methods",
+        "stats"
+      ],
+      "Hash": "2ad32c3a8745e827977f394bc387e3b0"
+    },
     "tibble": {
       "Package": "tibble",
       "Version": "3.2.1",
@@ -2116,6 +2297,24 @@
       ],
       "Hash": "c826c7c4241b6fc89ff55aaea3fa7491"
     },
+    "visNetwork": {
+      "Package": "visNetwork",
+      "Version": "2.1.2",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "grDevices",
+        "htmltools",
+        "htmlwidgets",
+        "jsonlite",
+        "magrittr",
+        "methods",
+        "stats",
+        "utils"
+      ],
+      "Hash": "3e48b097e8d9a91ecced2ed4817a678d"
+    },
     "vroom": {
       "Package": "vroom",
       "Version": "1.6.5",
@@ -2142,6 +2341,19 @@
       ],
       "Hash": "390f9315bc0025be03012054103d227c"
     },
+    "webshot": {
+      "Package": "webshot",
+      "Version": "0.5.5",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "callr",
+        "jsonlite",
+        "magrittr"
+      ],
+      "Hash": "16858ee1aba97f902d24049d4a44ef16"
+    },
     "withr": {
       "Package": "withr",
       "Version": "3.0.0",