Merge pull request #71 from nhs-r-community/tidy-package

Tidy package
nhs-r-community · Sep 28, 2024 · defefde · defefde
2 parents 8dbff91 + acfcbf9
commit defefde
Show file tree

Hide file tree

Showing 177 changed files with 631 additions and 31,528 deletions.
diff --git a/.all-contributorsrc b/.all-contributorsrc
@@ -0,0 +1,4 @@
+{
+  "projectName": "NHSRdatasets",
+  "projectOwner": "nhs-r-community"
+}
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,11 +2,12 @@ Package: NHSRdatasets
 Type: Package
 Title: NHS and Healthcare-Related Data for Education and Training
 Date: 2022-11-08
-Version: 0.3.2
+Version: 0.3.3
 Authors@R: c(
-          person("Gary", "Hutson", , "[email protected]", "aut", comment = c(ORCID="0000-0003-3534-6143")),
+          person("Gary", "Hutson", , , "aut", comment = c(ORCID="0000-0003-3534-6143")),
           person("Tom", "Jemmett", ,"[email protected]", "aut", comment = c(ORCID="0000-0002-6943-2990")),
           person("Chris", "Mainey", ,"[email protected]", c("aut", "cre"), comment = c(ORCID ="0000-0002-3018-6171")),
+          person("Fran", "Barton", ,"[email protected]", "aut", comment = c(ORCID = "0000-0002-5650-1176")),
           person("Zoë", "Turner", role = "aut", email = "[email protected]", comment = c(ORCID = "0000-0003-1033-9158")),
           person("NHS-R community", role = "cph")
           )
@@ -16,11 +17,9 @@ License: CC0
 Language: en-GB
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Depends: R (>= 3.5.0)
 BugReports: https://github.com/nhs-r-community/NHSRdatasets/issues
-Imports: 
-    tibble
 Suggests: 
     caret,
     dplyr,
@@ -29,24 +28,21 @@ Suggests:
     ggplot2,
     ggrepel,
     httr2,
-    janitor,
     knitr,
-    labelled,
     lattice,
     lme4,
     lmtest,
     lubridate,
     magrittr,
     MASS,
     ModelMetrics,
-    openxlsx2,
     rcmdcheck,
     readr,
     rmarkdown,
     rsample,
     scales,
-    stringi,
     synthpop,
+    tibble,
     tidyr,
     varhandle
 VignetteBuilder: knitr

diff --git a/NEWS.md b/NEWS.md
@@ -1,23 +1,40 @@
+# NHSRdatasets 0.3.3
+
+- Moved the vignette showing how the ONS provisionally recorded deaths dataset
+was built to an R script available in the data-raw folder.
+This is also available on the Quarto NHS-R Community website and
+[GitHub](https://github.com/nhs-r-community/nhs-r-community/blob/main/blog/building-the-ons-mortality-dataset.qmd).
+- Noted in this version the previous addition of the dataset from AphA (with kind 
+permission) from their CPD Survey, thanks to Fran Barton. Code detailing how the
+data was extracted using the httr2 package and tidied can be found in the 
+[data-raw folder](https://github.com/nhs-r-community/NHSRdatasets/blob/main/data-raw/apha_cpd_survey.R).
+- Noted in this version the previous addition of the dataset from European 
+Centre for Disease Prevention and Control for reported COVID19 infections and 
+deaths by day and country collected on the 14th December 2020, thanks to Chris
+Mainey.
+
 # NHSRdatasets 0.3.0
 
-- Added two new datasets for 'stranded patients' and synthetic early warning scores (NEWS), including vignettes, thanks to Gary Hutson.
+- Added two new datasets for 'stranded patients' and synthetic early warning 
+scores (NEWS), including vignettes, thanks to Gary Hutson.
 - Removed Travis and added GitHub actions
 
 # NHSRdatasets 0.2.0
 
-- Added a new ONS Mortality data set, and vignette showing it's construction, thanks to Zoë Turner.
+- Added a new ONS Mortality data set, and vignette showing its construction, 
+thanks to Zoë Turner.
 - Resaved .Rdata files as rda, using "gzip" compression.
 - Other minor documentation tweaks.
 
-
 # NHSRdatasets 0.1.2
 
-- Added a new NHS Accident and Emergency (A&E) dataset with vignette, thanks to Tom Jemmett.
+- Added a new NHS Accident and Emergency (A&E) dataset with vignette, thanks to 
+Tom Jemmett.
 - Typos resolved and cleaned some files.
 - Added pkgdown site.
 
-
 # NHSRdatasets 0.1.1
 
-- This is the first release of this collaborative package for NHS and healthcare analysts to learn or teach R.
+- This is the first release of this collaborative package for NHS and healthcare 
+analysts to learn or teach R.
 It will evolve over time as new contributions are released.
diff --git a/R/LOS_model.R b/R/LOS_model.R
@@ -10,7 +10,7 @@
 #' @format Data frame with five columns
 #' \describe{
 #' \item{ID}{A fictional patient ID number}
-#' \item{Organisation}{A factor representing one of ten fictional hospital trusts, e.g. Trust1}
+#' \item{Organisation}{A factor representing one of ten fictional hospital trusts, for example Trust1}
 #' \item{Age}{Age in years of each fictional patient}
 #' \item{LOS}{In-hospital length of stay in days.  The difference between admission and discharge dates, in days}
 #' \item{Death}{Binary for death status: 0 = survived, 1= died in hospital}
@@ -23,12 +23,11 @@
 #' @examples
 #' data(LOS_model)
 #'
-#' model1 <- glm(Death ~ Age + LOS, data=LOS_model, family="binomial")
+#' model1 <- glm(Death ~ Age + LOS, data = LOS_model, family = "binomial")
 #' summary(model1)
 #'
 #' # Now with an Age, LOS, and Age*LOS interaction.
-#' model2<- glm(Death ~ Age * LOS, data=LOS_model, family="binomial")
+#' model2 <- glm(Death ~ Age * LOS, data = LOS_model, family = "binomial")
 #' summary(model2)
 #'
 "LOS_model"
-
diff --git a/R/ae_attendances.R b/R/ae_attendances.R
@@ -51,21 +51,26 @@
 #'   filter(any(type == "1")) %>%
 #'   summarise_at(vars(attendances, breaches), sum) %>%
 #'   arrange(desc(attendances)) %>%
-#'   mutate(performance = 1 - breaches / attendances,
-#'          overall_performance = 1 - sum(breaches) / sum(attendances),
-#'          rank = rank(-performance, ties.method = "first") / n()) %>%
+#'   mutate(
+#'     performance = 1 - breaches / attendances,
+#'     overall_performance = 1 - sum(breaches) / sum(attendances),
+#'     rank = rank(-performance, ties.method = "first") / n()
+#'   ) %>%
 #'   ggplot(aes(rank, performance)) +
 #'   geom_vline(xintercept = c(0.25, 0.5, 0.75), linetype = "dotted") +
 #'   geom_hline(yintercept = 0.95, colour = "red") +
 #'   geom_hline(aes(yintercept = overall_performance), linetype = "dotted") +
 #'   geom_point() +
 #'   scale_y_continuous(labels = percent) +
 #'   theme_minimal() +
-#'   theme(panel.grid = element_blank(),
-#'         axis.text.x = element_blank()) +
-#'   labs(title = "4 Hour performance by trust",
-#'        subtitle = "Apr-16 through Mar-19",
-#'        x = "", y = "")
+#'   theme(
+#'     panel.grid = element_blank(),
+#'     axis.text.x = element_blank()
+#'   ) +
+#'   labs(
+#'     title = "4 Hour performance by trust",
+#'     subtitle = "Apr-16 through Mar-19",
+#'     x = "", y = ""
+#'   )
 #'
 "ae_attendances"
-
diff --git a/R/apha_cpd_survey.R b/R/apha_cpd_survey.R
@@ -1,21 +1,21 @@
 #' AphA (Association of Professional Healthcare Analysts) CPD Survey Responses
-#' 
+#'
 #' Full raw data from the AphA CPD Survey
-#' 
+#'
 #' @source \url{https://www.aphanalysts.org/documents/cpd-survey-results-raw-data/}
-#' 
+#'
 #' The survey of NHS and other healthcare data analysts was conducted in July
 #'  2022. The results data is made available in this package with the permission
 #'  of AphA.
-#' 
+#'
 #' @format This tidied raw data is available here as a tibble with 38 columns
 #'  (blank or superfluous columns from the raw data were removed) and 237 rows
 #'  (1 per respondent ID).
-#' 
+#'
 #' Variables have been named using a "controlled language" approach informed
 #'  by Emily Riederer's "Column Names as Contracts"
 #'  \url{https://emilyriederer.netlify.app/post/column-name-contracts/}.
-#' 
+#'
 #' \describe{
 #' \item{*_id}{Columns ending in \code{"_id"} are numeric and represent a
 #'   unique ID for that response.}
@@ -33,20 +33,20 @@
 #' \item{*_txt}{Columns ending in \code{"_txt"} contain free text responses and
 #'   are in character format.}
 #' }
-#' 
+#'
 #' Multi-part questions have column name stubs with sequential letters. For
 #'  example, \code{"q20a_"}, \code{"q20b_"} and so on.
 #'  For formatting consistency, questions with a single part still have a
 #'  column name stub with the letter a, for example \code{"q01a_"}.
-#' 
+#'
 #' Original survey questions (lightly edited) are provided as variable labels
 #'  using the \code{{labelled}} package
 #'  \url{https://larmarange.github.io/labelled/}.
 #'  These labels provide more descriptive context for the "clean" column names.
 #'  Variable labels can be viewed using \code{labelled::get_variable_labels
 #'  (apha_cpd_survey)}.
-#' 
-#' 
+#'
+#'
 #' Survey press release web page: \url{https://www.aphanalysts.org/ltnws/nhs-at-risk-of-losing-a-generation-of-data-analysts/}
-#' 
+#'
 "apha_cpd_survey"
diff --git a/R/covid19.R b/R/covid19.R
@@ -1,11 +1,15 @@
 #' International COVID-19 reported infection and death data
 #'
-#' Reported COVID-19 infections, and deaths, collected and collated by the European Centre for Disease Prevention
-#' and Control (ECDC, provided by day and country.
-#' Data were collated and published up to 14th December 2020, and have been tidied so they are easily usable within the `tidyverse` of packages.
+#' Reported COVID-19 infections, and deaths, collected and collated by the
+#' European Centre for Disease Prevention and Control (ECDC), provided by day
+#' and country.  Data were collated and published up to 14th December 2020,
+#' and have been tidied so they are easily usable within the `tidyverse` of
+#' packages.
 #'
 #' Data sourced from \href{https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide}{European Centre for Disease Prevention and Control}
-#' which is available under the open licence, compatible with the  CC BY 4.0 license, further details available at \href{https://www.ecdc.europa.eu/en/copyright}{ECDC}.
+#' which is available under the open licence, compatible with the CC BY 4.0
+#' licence; further details are available at
+#' \href{https://www.ecdc.europa.eu/en/copyright}{ECDC}.
 #'
 #' @docType data
 #'
@@ -14,10 +18,14 @@
 #' @format Tibble with seven columns
 #' \describe{
 #' \item{date_reported}{The date cases were reported}
-#' \item{contient}{A `factor` for the geographical continent in which the reporting country is located.}
-#' \item{countries_and_territories}{A `factor` for the country or territory reporting the data.}
-#' \item{countries_territory_code}{A `factor` for the a three-letter country or territory code.}
-#' \item{population_2019}{The reported population of the country for 2019, taken from Eurostat for Europe and the World Bank for the rest of the world.}
+#' \item{continent}{A `factor` for the geographical continent in which the
+#' reporting country is located.}
+#' \item{countries_and_territories}{A `factor` for the country or territory
+#' reporting the data.}
+#' \item{countries_territory_code}{A `factor` for the three-letter country
+#' or territory code.}
+#' \item{population_2019}{The reported population of the country for 2019,
+#' taken from Eurostat for Europe and the World Bank for the rest of the world.}
 #' \item{cases}{The reported number of positive cases.}
 #' \item{deaths}{The reported number of deaths.}
 #' }
@@ -36,13 +44,14 @@
 #' # Create a plot of reported COVID-19 cases over time for selected countries
 #' covid19 %>%
 #'   filter(countries_and_territories ==
-#'   c( "United_Kingdom", "Italy", "France", "Germany", "Spain")) %>%
-#'   ggplot(aes(x=date_reported, y= cases, col=countries_and_territories)) +
-#'   geom_line()+
+#'     c("United_Kingdom", "Italy", "France", "Germany", "Spain")) %>%
+#'   ggplot(aes(x = date_reported, y = cases, col = countries_and_territories)) +
+#'   geom_line() +
 #'   scale_color_discrete("Country") +
-#'   scale_y_continuous(labels=comma)+
-#'   labs(y="Cases", x="Date", title="Covid-19 cases for selected countries"
-#'        , alt="A plot of covid-19 cases in France, Germany, Italy, Spain & the UK")+
+#'   scale_y_continuous(labels = comma) +
+#'   labs(
+#'     y = "Cases", x = "Date", title = "Covid-19 cases for selected countries",
+#'     alt = "A plot of covid-19 cases in France, Germany, Italy, Spain & the UK"
+#'   ) +
 #'   theme_minimal()
 "covid19"
-
diff --git a/R/ons_mortality.R b/R/ons_mortality.R
@@ -13,27 +13,28 @@
 #'
 #' @format Data frame with five columns
 #' \describe{
-#' \item{category_1}{character, containing the names of the groups for counts, e.g. "Total deaths", "all ages".}
-#' \item{category_2}{character, subcategory of names of groups where necessary, e.g. details of region: "East", details of age bands "15-44".}
+#' \item{category_1}{character, containing the names of the groups for counts, for example "Total deaths", "all ages".}
+#' \item{category_2}{character, subcategory of names of groups where necessary, for example details of region: "East", details of age bands "15-44".}
 #' \item{counts}{numeric, numbers of deaths in whole numbers and average numbers with decimal points. To retain the integrity of the format this column data is left as character.}
 #' \item{date}{date, format is yyyy-mm-dd; all dates are a Friday.}
 #' \item{week_no}{integer, each week in a year is numbered sequentially.}
 #' }
 #'
-#' @source Collected by Zoë Turner \email{zoe.turner2@nottshc.nhs.uk}, Apr-2020 from \url{https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales}
+#' @source Collected by Zoë Turner \email{zoe.turner3@nhs.net}, Apr-2020 from \url{https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/weeklyprovisionalfiguresondeathsregisteredinenglandandwales}
 #'
 #' @usage data(ons_mortality)
 #'
 #' @examples
 #' data(ons_mortality)
 #'
-#'library(dplyr)
-#'library(tidyr)
+#' library(dplyr)
+#' library(tidyr)
 #'
-#'wideForm <- ons_mortality %>%
-#'  select(-week_no) %>%
-#'  pivot_wider(names_from = date,
-#'              values_from = counts
-#'  )
+#' wideForm <- ons_mortality %>%
+#'   select(-week_no) %>%
+#'   pivot_wider(
+#'     names_from = date,
+#'     values_from = counts
+#'   )
 #'
 "ons_mortality"
diff --git a/R/stranded_patient_model.R b/R/stranded_patient_model.R
@@ -20,7 +20,7 @@
 #' \item{frailty_index}{An initial index assessment to say if the patient is frail or not. This is needed for alignment of service provision.}
 #' }
 #'
-#' @source Synthetically generated by Gary Hutson \email{[email protected]}, Mar-2021.
+#' @source Synthetically generated by Gary Hutson, Mar-2021.
 #'
 #' @usage data(stranded_data)
 #'
@@ -29,6 +29,6 @@
 #' library(dplyr)
 #' data("stranded_data")
 #' stranded_data %>%
-#'  glimpse()
+#'   glimpse()
 #'
 "stranded_data"
diff --git a/R/synthetic_news_data.R b/R/synthetic_news_data.R
@@ -23,7 +23,7 @@
 #' \item{died}{Indicator to monitor patient death}
 #' }
 #'
-#' @source Generated by Dr. Muhammed Faisal and created by Gary Hutson \email{[email protected]}, Mar-2021
+#' @source Generated by Dr. Muhammed Faisal and created by Gary Hutson, Mar-2021
 #'
 #' @usage data(synthetic_news_data)
 #'
@@ -32,5 +32,5 @@
 #' library(dplyr)
 #' data("synthetic_news_data")
 #' synthetic_news_data %>%
-#'  glimpse()
+#'   glimpse()
 "synthetic_news_data"
diff --git a/README.Rmd b/README.Rmd
@@ -74,3 +74,14 @@ It's great to see the {NHSRdatasets} package and data used as it promotes the wo
 
 The data used to build the mortality dataset in this package is released under © Crown copyright and is free to use under the terms of the Open Government Licence. 
 Any subsequent use should include a source accreditation to ONS to help people find the original releases and any statistical corrections that may have occurred since this was included in this package - Source: Office for National Statistics licensed under the Open Government Licence.
+
+## Contributors
+
+<!-- ALL-CONTRIBUTORS-LIST:START - Do not remove or modify this section -->
+<!-- prettier-ignore-start -->
+<!-- markdownlint-disable -->
+
+<!-- markdownlint-restore -->
+<!-- prettier-ignore-end -->
+
+<!-- ALL-CONTRIBUTORS-LIST:END -->