Skip to content

Commit

Permalink
differences for PR #147
Browse files Browse the repository at this point in the history
  • Loading branch information
actions-user committed Oct 8, 2024
1 parent 300cab7 commit 9733352
Show file tree
Hide file tree
Showing 3 changed files with 426 additions and 0 deletions.
213 changes: 213 additions & 0 deletions epikinetics-descriptive.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
---
title: Solutions using epikinetics data and package
---

# Descriptive analysis

```r

#' goal:
#' vaccination incidence stratified by vaccine type
#' observation incidence stratified by censoring status

# load packages -----------------------------------------------------------

library(tidyverse)
library(cleanepi)
library(datatagr) # a generalization of {linelist}
library(incidence2)

# read data ---------------------------------------------------------------

# The source is a serialized R object (.rds), not a delimited text file, so it
# must be deserialized with readRDS(); read_csv() cannot parse it.
# rawdata <- "data-raw/delta.csv"
rawdata <- "https://raw.githubusercontent.com/seroanalytics/epikinetics/refs/heads/main/inst/delta_full.rds"

# Remote .rds files are compressed: wrap the url connection in gzcon() so that
# readRDS() can decompress the stream, and close the connection afterwards.
rawdata_con <- gzcon(url(rawdata))
dat <- tibble::as_tibble(readRDS(rawdata_con))
close(rawdata_con)

# NOTE(review): column types now come from the stored object instead of being
# guessed by readr -- confirm they match the types validated further below.
dat %>% glimpse()

# what do these columns mean? ---------------------------------------------

#' data dictionary: https://seroanalytics.org/epikinetics/articles/data.html
#' reference paper: https://www.thelancet.com/journals/laninf/article/PIIS1473-3099(24)00484-5/fulltext
#' location: https://github.com/seroanalytics/epikinetics/tree/main/inst

# one row per subject id with its number of records
# (335 subjects were followed up)
dplyr::count(dat, pid)

## what does "titre type" mean? ---------------------------------------------

#' In the time series,
#' each subject had monthly serum measurements
#' for three types of antigens ("titre_type").
#'
#' Serum samples were challenged against Ancestral, Alpha and Delta antigens.
#'
#' The column "value" measures the titre of
#' the neutralizing effect of each sample against each antigen.

# inspect the full record of a single subject (pid 2) in chronological order,
# with the time-invariant columns moved to the front
dat %>%
  dplyr::select(
    pid,
    infection_history,
    exp_num,
    last_exp_date,
    last_vax_type,
    dplyr::everything()
  ) %>%
  dplyr::filter(pid == 2) %>%
  dplyr::arrange(date)

## what does "censored" mean? -------------------------------------------------

# context: censored regression model
# the "value" outcome is censored above or below
# because it was measured outside the limits of detection
# lower detection limit: 5
# upper detection limit: 2560

# histogram of titre values, coloured by censoring status
ggplot(
  data = dat,
  mapping = aes(x = value, fill = as.factor(censored))
) +
  geom_histogram()

# datatagr ----------------------------------------------------------------

# Configure, then print, how {datatagr} reacts when an operation drops labelled
# columns. Called without arguments, so the package default applies.
# NOTE(review): the default is presumably "warning" -- confirm in the
# {datatagr} documentation.
datatagr::lost_labels_action()
datatagr::get_lost_labels_action()
# stricter alternative: uncomment to stop with an error when labels are lost
# datatagr::lost_labels_action(action = "error")

# cleanepi ----------------------------------------------------------------

# Build the analysis table in one pipeline:
#   1. reorder columns and rows,
#   2. check that the last exposure precedes the observation date,
#   3. compute the days elapsed between both dates,
#   4. recode categorical columns as factors,
#   5. label and type-validate every column with {datatagr}.

dat_clean <- dat %>%
  # time-invariant subject columns first, everything else after
  dplyr::select(
    pid,
    infection_history,
    exp_num,
    last_exp_date,
    last_vax_type,
    dplyr::everything()
  ) %>%
  # sort rows by subject attributes, then chronologically
  dplyr::arrange(
    pid,
    infection_history,
    exp_num,
    last_exp_date,
    last_vax_type,
    date
  ) %>%
  # {cleanepi}: check the sequence of events
  cleanepi::check_date_sequence(target_columns = c("last_exp_date", "date")) %>%
  # cleanepi::print_report()
  # {cleanepi}: time from last exposure to observation, in days
  cleanepi::timespan(
    target_column = "last_exp_date",
    end_date = "date",
    span_unit = "days",
    span_column_name = "t_since_last_exp",
    span_remainder_unit = "days"
  ) %>%
  # extra wrangling: categorical columns become factors
  dplyr::mutate(
    exp_num = forcats::as_factor(exp_num),
    censored = forcats::as_factor(censored),
    # vaccine types ordered by frequency
    last_vax_type = forcats::fct_infreq(last_vax_type),
    # antigen levels start with Ancestral, then Alpha
    titre_type = forcats::fct_relevel(titre_type, "Ancestral", "Alpha")
  ) %>%
  # attach a human-readable label to each column with {datatagr}
  datatagr::make_datatagr(
    pid = "subject id",
    infection_history = "subject infection history",
    exp_num = "number of vaccine exposures",
    last_exp_date = "date of last exposure",
    last_vax_type = "type of vaccine in the last exposure",
    date = "date of observation of titre in serum sample",
    titre_type = "type of antigen challenged against serum sample",
    value = "titre value",
    censored = "censored titre value out of limit of detection [5 - 2560] bellow (-1) or above (+1)",
    t_since_last_exp = "time interval between last vaccine exposure and observed serum sample titre"
  ) %>%
  # check that each labelled column has the expected class with {datatagr}
  datatagr::validate_datatagr(
    pid = "numeric",
    infection_history = "character",
    exp_num = "factor",
    last_exp_date = "Date",
    last_vax_type = "factor",
    date = "Date",
    titre_type = "factor",
    value = "numeric",
    censored = "factor",
    t_since_last_exp = "numeric"
  )
# optional last step: append `%>% datatagr::labels_df()` to the pipeline;
# this extracts labels as column names [affects downstream]

dat_clean

# distribution of the time from the last vaccine to the first observation:
# keep, for each subject, a single row at its earliest observation date
dat_clean %>%
  dplyr::group_by(pid) %>%
  dplyr::filter(date == min(date)) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  ggplot(mapping = aes(x = t_since_last_exp)) +
  geom_histogram()

## subject table -----------------------------------------------------------

# One row per distinct combination of the time-invariant subject columns,
# with `n` counting the observation rows behind each combination.
# {datatagr} reacts to dplyr::select() but not to dplyr::count() when labels
# are lost, so the columns are selected explicitly before counting.
dat_subject <- dplyr::count(
  dplyr::select(
    dat_clean,
    pid, infection_history, exp_num, last_exp_date, last_vax_type
  ),
  pid, infection_history, exp_num, last_exp_date, last_vax_type
)

# table 1: summary of the time-invariant columns
dat_subject %>%
  compareGroups::compareGroups(
    formula = ~ infection_history + exp_num + last_exp_date + last_vax_type,
    data = .
  ) %>%
  compareGroups::createTable()

# table 2: were vaccine types applied differently between naive and
# non-naive subjects?
dat_subject %>%
  compareGroups::compareGroups(
    formula = last_vax_type ~ infection_history,
    data = .,
    byrow = TRUE
  ) %>%
  compareGroups::createTable(show.all = TRUE)

# vaccinations ------------------------------------------------------------

## by vaccine type ---------------------------------------------

# Monthly count of last vaccine exposures, stratified by vaccine type.
# Options to try:
#   date_index: "date" or "last_exp_date"
#   groups:     "titre_type", "infection_history", "last_vax_type"
#               or c("infection_history", "titre_type")
#   interval:   "day", "week", "epiweek" or "month"
dat_subject %>%
  # aggregate
  incidence2::incidence(
    date_index = "last_exp_date",
    groups = "last_vax_type",
    # the last active argument must not end with a comma, otherwise an empty
    # argument is passed to incidence()
    interval = "month"
    # , complete_dates = TRUE # relevant to downstream analysis [time-series data]
  ) %>%
  # transform to cumulative per group (optional display)
  # incidence2::cumulate() %>%
  # plot: the generic dispatches to the <incidence2> method, so the
  # unexported `incidence2:::plot.incidence2` is not needed
  plot(fill = "last_vax_type")

# observations ------------------------------------------------------------

# by history-variants
# not required, this reflects the proportion of "infection_history" in the cohort

## by censored -----------------------------------------------

# number of titre observations per censoring status
dat_clean %>% count(censored)

# Monthly count of titre observations, stratified by censoring status.
# Options to try:
#   date_index: "date" or "last_exp_date"
#   groups:     "censored", "titre_type", "infection_history", "last_vax_type"
#               or c("infection_history", "titre_type")
#   interval:   "day", "week" or "month"
dat_clean %>%
  incidence2::incidence(
    date_index = "date",
    groups = "censored",
    # the last active argument must not end with a comma, otherwise an empty
    # argument is passed to incidence()
    interval = "month"
    # , complete_dates = TRUE # relevant to downstream analysis [time-series data]
  ) %>%
  # the generic dispatches to the <incidence2> method, so the unexported
  # `incidence2:::plot.incidence2` is not needed
  plot(fill = "censored")

```
Loading

0 comments on commit 9733352

Please sign in to comment.