differences for PR #121

epiverse-trace · Sep 16, 2024 · 33ea88d · 33ea88d
1 parent 06ae110
commit 33ea88d
Show file tree

Hide file tree

Showing 4 changed files with 2,920 additions and 71 deletions.
diff --git a/clean-data.md b/clean-data.md
@@ -63,24 +63,33 @@ The first step is to import the dataset following the guidelines outlined in the
 # e.g.: if path to file is data/simulated_ebola_2.csv then:
 raw_ebola_data <- rio::import(
   here::here("data", "simulated_ebola_2.csv")
-)
+) %>% 
+  dplyr::as_tibble() # for a simple data frame output
 ```
 
 
 
 
 ``` r
 # Print the tibble (shows the first ten rows by default)
-utils::head(raw_ebola_data, 5)
+raw_ebola_data
 ```
 
 ``` output
-  V1 case id         age gender    status date onset date sample
-1  1   14905          90      1 confirmed 03/15/2015  06/04/2015
-2  2   13043 twenty-five      2            Sep /11/Y  03/01/2014
-3  3   14364          54      f      <NA> 09/02/2014  03/03/2015
-4  4   14675      ninety   <NA>           10/19/2014  31/ 12 /14
-5  5   12648          74      F           08/06/2014  10/10/2016
+# A tibble: 15,000 × 7
+      V1 `case id` age         gender status      `date onset` `date sample`
+   <int>     <int> <chr>       <chr>  <chr>       <chr>        <chr>        
+ 1     1     14905 90          1      "confirmed" 03/15/2015   06/04/2015   
+ 2     2     13043 twenty-five 2      ""          Sep /11/Y    03/01/2014   
+ 3     3     14364 54          f       <NA>       09/02/2014   03/03/2015   
+ 4     4     14675 ninety      <NA>   ""          10/19/2014   31/ 12 /14   
+ 5     5     12648 74          F      ""          08/06/2014   10/10/2016   
+ 6     6     14274 seventy-six female ""          Apr /05/Y    01/23/2016   
+ 7     7     14132 sixteen     male   "confirmed" Dec /29/Y    05/10/2015   
+ 8     8     14715 44          f      "confirmed" Apr /06/Y    04/24/2016   
+ 9     9     13435 26          1      ""          09/07/2014   20/ 09 /14   
+10    10     14816 thirty      f      ""          06/29/2015   06/02/2015   
+# ℹ 14,990 more rows
 ```
 
 ##  A quick inspection
@@ -205,17 +214,24 @@ sim_ebola_data <- cleanepi::standardize_dates(
   )
 )
 
-utils::head(sim_ebola_data)
+sim_ebola_data
 ```
 
 ``` output
-  v_1 case_id         age gender    status date_onset date_sample
-1   1   14905          90      1 confirmed 2015-03-15  2015-04-06
-2   2   13043 twenty-five      2      <NA>       <NA>  2014-01-03
-3   3   14364          54      f      <NA> 2014-02-09  2015-03-03
-4   4   14675      ninety   <NA>      <NA> 2014-10-19  2014-12-31
-5   5   12648          74      F      <NA> 2014-06-08  2016-10-10
-6   6   14274 seventy-six female      <NA>       <NA>  2016-01-23
+# A tibble: 15,000 × 7
+     v_1 case_id age         gender status    date_onset date_sample
+   <int> <chr>   <chr>       <chr>  <chr>     <date>     <date>     
+ 1     1 14905   90          1      confirmed 2015-03-15 2015-04-06 
+ 2     2 13043   twenty-five 2      <NA>      NA         2014-01-03 
+ 3     3 14364   54          f      <NA>      2014-02-09 2015-03-03 
+ 4     4 14675   ninety      <NA>   <NA>      2014-10-19 2014-12-31 
+ 5     5 12648   74          F      <NA>      2014-06-08 2016-10-10 
+ 6     6 14274   seventy-six female <NA>      NA         2016-01-23 
+ 7     7 14132   sixteen     male   confirmed NA         2015-10-05 
+ 8     8 14715   44          f      confirmed NA         2016-04-24 
+ 9     9 13435   26          1      <NA>      2014-07-09 2014-09-20 
+10    10 14816   thirty      f      <NA>      2015-06-29 2015-02-06 
+# ℹ 14,990 more rows
 ```
 
 This function converts the values in the target columns into the **Ymd** format; if `target_columns = NULL`, it automatically detects the date columns within the dataset and converts them.
@@ -229,17 +245,25 @@ The `convert_to_numeric()` function in `{cleanepi}` does such conversion as illu
 sim_ebola_data <- cleanepi::convert_to_numeric(sim_ebola_data,
   target_columns = "age"
 )
-utils::head(sim_ebola_data)
+
+sim_ebola_data
 ```
 
 ``` output
-  v_1 case_id age gender    status date_onset date_sample
-1   1   14905  90      1 confirmed 2015-03-15  2015-04-06
-2   2   13043  25      2      <NA>       <NA>  2014-01-03
-3   3   14364  54      f      <NA> 2014-02-09  2015-03-03
-4   4   14675  90   <NA>      <NA> 2014-10-19  2014-12-31
-5   5   12648  74      F      <NA> 2014-06-08  2016-10-10
-6   6   14274  76 female      <NA>       <NA>  2016-01-23
+# A tibble: 15,000 × 7
+     v_1 case_id   age gender status    date_onset date_sample
+   <int> <chr>   <dbl> <chr>  <chr>     <date>     <date>     
+ 1     1 14905      90 1      confirmed 2015-03-15 2015-04-06 
+ 2     2 13043      25 2      <NA>      NA         2014-01-03 
+ 3     3 14364      54 f      <NA>      2014-02-09 2015-03-03 
+ 4     4 14675      90 <NA>   <NA>      2014-10-19 2014-12-31 
+ 5     5 12648      74 F      <NA>      2014-06-08 2016-10-10 
+ 6     6 14274      76 female <NA>      NA         2016-01-23 
+ 7     7 14132      16 male   confirmed NA         2015-10-05 
+ 8     8 14715      44 f      confirmed NA         2016-04-24 
+ 9     9 13435      26 1      <NA>      2014-07-09 2014-09-20 
+10    10 14816      30 f      <NA>      2015-06-29 2015-02-06 
+# ℹ 14,990 more rows
 ```
 
 ## Epidemiology related operations
@@ -301,17 +325,25 @@ sim_ebola_data <- cleanepi::clean_using_dictionary(
   sim_ebola_data,
   dictionary = test_dict
 )
-utils::head(sim_ebola_data)
+
+sim_ebola_data
 ```
 
 ``` output
-  v_1 case_id age gender    status date_onset date_sample
-1   1   14905  90   male confirmed 2015-03-15  2015-04-06
-2   2   13043  25 female      <NA>       <NA>  2014-01-03
-3   3   14364  54 female      <NA> 2014-02-09  2015-03-03
-4   4   14675  90   <NA>      <NA> 2014-10-19  2014-12-31
-5   5   12648  74 female      <NA> 2014-06-08  2016-10-10
-6   6   14274  76 female      <NA>       <NA>  2016-01-23
+# A tibble: 15,000 × 7
+     v_1 case_id   age gender status    date_onset date_sample
+   <int> <chr>   <dbl> <chr>  <chr>     <date>     <date>     
+ 1     1 14905      90 male   confirmed 2015-03-15 2015-04-06 
+ 2     2 13043      25 female <NA>      NA         2014-01-03 
+ 3     3 14364      54 female <NA>      2014-02-09 2015-03-03 
+ 4     4 14675      90 <NA>   <NA>      2014-10-19 2014-12-31 
+ 5     5 12648      74 female <NA>      2014-06-08 2016-10-10 
+ 6     6 14274      76 female <NA>      NA         2016-01-23 
+ 7     7 14132      16 male   confirmed NA         2015-10-05 
+ 8     8 14715      44 female confirmed NA         2016-04-24 
+ 9     9 13435      26 male   <NA>      2014-07-09 2014-09-20 
+10    10 14816      30 female <NA>      2015-06-29 2015-02-06 
+# ℹ 14,990 more rows
 ```
 
 This approach simplifies the data cleaning process, ensuring that categorical data in epidemiological datasets is accurately categorized and ready for further analysis.
@@ -322,7 +354,7 @@ This approach simplifies the data cleaning process, ensuring that categorical da
 
 In epidemiological data analysis it is also useful to track and analyze time-dependent events, such as the progression of a disease outbreak or the duration between sample collection and analysis.
 The `{cleanepi}` package offers a convenient function for calculating the time elapsed between two dated events at different time scales. For example, the code snippet below uses the `timespan()` function to compute the time elapsed since the sampling date of each case
- until the date this document was generated (2024-09-13).
+ until the date this document was generated (2024-09-16).
 
 
 ``` r
@@ -334,24 +366,26 @@ sim_ebola_data <- cleanepi::timespan(
   span_column_name = "time_since_sampling_date",
   span_remainder_unit = "months"
 )
-utils::head(sim_ebola_data)
+
+sim_ebola_data
 ```
 
 ``` output
-  v_1 case_id age gender    status date_onset date_sample
-1   1   14905  90   male confirmed 2015-03-15  2015-04-06
-2   2   13043  25 female      <NA>       <NA>  2014-01-03
-3   3   14364  54 female      <NA> 2014-02-09  2015-03-03
-4   4   14675  90   <NA>      <NA> 2014-10-19  2014-12-31
-5   5   12648  74 female      <NA> 2014-06-08  2016-10-10
-6   6   14274  76 female      <NA>       <NA>  2016-01-23
-  time_since_sampling_date remainder_months
-1                        9                5
-2                       10                8
-3                        9                6
-4                        9                8
-5                        7               11
-6                        8                7
+# A tibble: 15,000 × 9
+     v_1 case_id   age gender status    date_onset date_sample
+   <int> <chr>   <dbl> <chr>  <chr>     <date>     <date>     
+ 1     1 14905      90 male   confirmed 2015-03-15 2015-04-06 
+ 2     2 13043      25 female <NA>      NA         2014-01-03 
+ 3     3 14364      54 female <NA>      2014-02-09 2015-03-03 
+ 4     4 14675      90 <NA>   <NA>      2014-10-19 2014-12-31 
+ 5     5 12648      74 female <NA>      2014-06-08 2016-10-10 
+ 6     6 14274      76 female <NA>      NA         2016-01-23 
+ 7     7 14132      16 male   confirmed NA         2015-10-05 
+ 8     8 14715      44 female confirmed NA         2016-04-24 
+ 9     9 13435      26 male   <NA>      2014-07-09 2014-09-20 
+10    10 14816      30 female <NA>      2015-06-29 2015-02-06 
+# ℹ 14,990 more rows
+# ℹ 2 more variables: time_since_sampling_date <dbl>, remainder_months <dbl>
 ```
 
 After executing the `timespan()` function, two new columns named `time_since_sampling_date` and `remainder_months` are added to the **sim_ebola_data** dataset, containing the calculated time elapsed since the date of sampling for each case, measured in years, and the remaining time measured in months.
@@ -428,26 +462,34 @@ it's essential to establish an additional foundational layer to ensure the integ
 
 ``` r
 library(linelist)
-data <- linelist::make_linelist(
+
+linelist_data <- linelist::make_linelist(
   x = cleaned_data,
   id = "case_id",
   date_onset = "date_onset",
   gender = "gender"
 )
-utils::head(data, 7)
+
+linelist_data
 ```
 
 ``` output
 
 // linelist object
-  v_1 case_id age gender    status date_onset date_sample
-1   1   14905  90   male confirmed 2015-03-15  2015-04-06
-2   2   13043  25 female      <NA>       <NA>  2014-01-03
-3   3   14364  54 female      <NA> 2014-02-09  2015-03-03
-4   4   14675  90   <NA>      <NA> 2014-10-19  2014-12-31
-5   5   12648  74 female      <NA> 2014-06-08  2016-10-10
-6   6   14274  76 female      <NA>       <NA>  2016-01-23
-7   7   14132  16   male confirmed       <NA>  2015-10-05
+# A tibble: 15,000 × 7
+     v_1 case_id   age gender status    date_onset date_sample
+   <int> <chr>   <dbl> <chr>  <chr>     <date>     <date>     
+ 1     1 14905      90 male   confirmed 2015-03-15 2015-04-06 
+ 2     2 13043      25 female <NA>      NA         2014-01-03 
+ 3     3 14364      54 female <NA>      2014-02-09 2015-03-03 
+ 4     4 14675      90 <NA>   <NA>      2014-10-19 2014-12-31 
+ 5     5 12648      74 female <NA>      2014-06-08 2016-10-10 
+ 6     6 14274      76 female <NA>      NA         2016-01-23 
+ 7     7 14132      16 male   confirmed NA         2015-10-05 
+ 8     8 14715      44 female confirmed NA         2016-04-24 
+ 9     9 13435      26 male   <NA>      2014-07-09 2014-09-20 
+10    10 14816      30 female <NA>      2015-06-29 2015-02-06 
+# ℹ 14,990 more rows
 
 // tags: id:case_id, date_onset:date_onset, gender:gender 
 ```
@@ -506,7 +548,7 @@ columns, you will receive an error or warning message, as shown in the example b
 
 
 ``` r
-new_df <- data %>%
+new_df <- linelist_data %>%
   dplyr::select(case_id, gender)
 ```
 
@@ -525,7 +567,7 @@ Let's test the implications of changing the **safeguarding** configuration from
 
 
 ``` r
-data %>%
+linelist_data %>%
   dplyr::select(case_id, gender) %>%
   dplyr::count(gender)
 ```
@@ -534,7 +576,7 @@ data %>%
 
 
 ``` r
-# set behavior to default "warning"
+# set behavior to the default option: "warning"
 linelist::lost_tags_action()
 
 # set behavior to "error"
@@ -555,7 +597,7 @@ types, use the `linelist::validate_linelist()`, as
 shown in the example below:
 
 ```r
-linelist::validate_linelist(data)
+linelist::validate_linelist(linelist_data)
 ```
 <!-- If your dataset requires a new tag, set the argument -->
 <!-- `allow_extra = TRUE` when creating the linelist object with its corresponding-->
@@ -599,7 +641,7 @@ Error: Some tags have the wrong class:
   - age: Must inherit from class 'numeric'/'integer', but has class 'character'
 ```
 
-Why are we getting this error message?
+Why are we getting this `Error` message? Should we have a `Warning` message instead? 
 
 ::::::::::::::::::::::::::
 
@@ -656,16 +698,24 @@ features. For example, you can extract a dataframe of only the tagged columns
 using the `linelist::tags_df()` function, as shown below:
 
 ``` r
-head(linelist::tags_df(data), 5)
+linelist::tags_df(linelist_data)
 ```
 
 ``` output
-     id date_onset gender
-1 14905 2015-03-15   male
-2 13043       <NA> female
-3 14364 2014-02-09 female
-4 14675 2014-10-19   <NA>
-5 12648 2014-06-08 female
+# A tibble: 15,000 × 3
+   id    date_onset gender
+   <chr> <date>     <chr> 
+ 1 14905 2015-03-15 male  
+ 2 13043 NA         female
+ 3 14364 2014-02-09 female
+ 4 14675 2014-10-19 <NA>  
+ 5 12648 2014-06-08 female
+ 6 14274 NA         female
+ 7 14132 NA         male  
+ 8 14715 NA         female
+ 9 13435 2014-07-09 male  
+10 14816 2015-06-29 female
+# ℹ 14,990 more rows
 ```
 
 This allows the extraction of tagged-only columns for use in downstream analysis, which will be useful for the next episode!

diff --git a/config.yaml b/config.yaml
@@ -0,0 +1,83 @@
+#------------------------------------------------------------
+# Values for this lesson.
+#------------------------------------------------------------
+
+# Which carpentry is this (swc, dc, lc, cp, or incubator)?
+# swc: Software Carpentry
+# dc: Data Carpentry
+# lc: Library Carpentry
+# cp: Carpentries (to use for instructor training for instance)
+# incubator: The Carpentries Incubator
+carpentry: 'incubator'
+
+# Overall title for pages.
+title: 'Read and clean case data, and make linelist for outbreak analytics with R'
+
+# Date the lesson was created (YYYY-MM-DD, this is empty by default)
+created:
+
+# Comma-separated list of keywords for the lesson
+keywords:
+
+# Life cycle stage of the lesson
+# possible values: pre-alpha, alpha, beta, stable
+life_cycle: 'pre-alpha'
+
+# License of the lesson materials (recommended CC-BY 4.0)
+license: 'CC-BY 4.0'
+
+# Link to the source repository for this lesson
+source: 'https://github.com/epiverse-trace/tutorials-early'
+
+# Default branch of your lesson
+branch: 'main'
+
+# Who to contact if there are any issues
+contact: '[email protected]'
+
+# Navigation ------------------------------------------------
+#
+# Use the following menu items to specify the order of
+# individual pages in each dropdown section. Leave blank to
+# include all pages in the folder.
+#
+# Example -------------
+#
+# episodes:
+# - introduction.md
+# - first-steps.md
+#
+# learners:
+# - setup.md
+#
+# instructors:
+# - instructor-notes.md
+#
+# profiles:
+# - one-learner.md
+# - another-learner.md
+
+# Order of episodes in your lesson
+episodes:
+- read-cases.Rmd
+- clean-data.Rmd
+- describe-cases.Rmd
+#- simple-analysis.Rmd
+
+# Information for Learners
+learners:
+
+# Information for Instructors
+instructors:
+
+# Learner Profiles
+profiles:
+
+# Customisation ---------------------------------------------
+#
+# This space below is where custom yaml items (e.g. pinning
+# sandpaper and varnish versions) should live
+
+
+varnish: epiverse-trace/varnish@epiversetheme
+sandpaper: epiverse-trace/sandpaper@patch-renv-github-bug