rnabioco · jayhesselberth · Feb 9, 2025 · Feb 9, 2025 · Feb 9, 2025 · Feb 9, 2025
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # nihexporter (development version)
 
+* New `projects_min` table, which contains a minimal subset of projects data from 2006-2024,
+  with both direct and indirect costs (2006 was the first year these amounts were published).
+
+* Fixed date parsing in `projects`.
+
 # nihexporter 0.10.0
 
 * Update tables through FY 2024.

diff --git a/R/data.R b/R/data.R
@@ -5,6 +5,13 @@
 #' @source \url{https://reporter.nih.gov/exporter/projects}
 "projects"
 
+#' A minimal set of project information from 2006-2024.
+#'
+#' @format A data frame with 14 variables
+#'
+#' @source \url{https://reporter.nih.gov/exporter/projects}
+"projects_min"
+
 #' Principal investigators.
 #'
 #' @format A data frame with 2 variables: `project.num` and `pi.id`

diff --git a/README.Rmd b/README.Rmd
@@ -33,7 +33,9 @@ pak::pak("rnabioco/nihexporter")
 
 ## Tables
 
-* `projects`: provides data on funded projects by NIH.
+* `projects`: provides data on funded projects by NIH across all years.
+
+* `projects_min`: a minimal set of project data from 2006-2024. Contains both direct and indirect costs.
 
 * `project_pis`: links project numbers (`project.num`) to principal investigator IDs (`pi.id`).
 

diff --git a/README.md b/README.md
@@ -27,7 +27,10 @@ time to download and install. ⚠️
 
 ## Tables
 
-- `projects`: provides data on funded projects by NIH.
+- `projects`: provides data on funded projects by NIH across all years.
+
+- `projects_min`: a minimal set of project data from 2006-2024. Contains
+  both direct and indirect costs.
 
 - `project_pis`: links project numbers (`project.num`) to principal
   investigator IDs (`pi.id`).

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -24,6 +24,7 @@ reference:
 - title: Tables
   contents:
   - projects
+  - projects_min
   - project_pis
   - project_io
   - publications

diff --git a/data-raw/common.R b/data-raw/common.R
@@ -5,6 +5,7 @@ library(usethis)
 library(devtools)
 library(here)
 library(janitor)
+library(fs)
 
 # provides `nih.institutes`
 source("R/vars.R")
@@ -13,8 +14,29 @@ source("R/vars.R")
 #'
 load_tbl <- function(path, col_types = NULL) {
   csvfiles <- dir(path, pattern = "\\.csv", full.names = TRUE)
-  tables <- lapply(csvfiles, function(x) read_csv(x, col_types = col_types))
+  tables <- lapply(
+    csvfiles,
+    function(x) readr::read_csv(x, col_types = col_types)
+  )
   raw_tbl <- tibble(bind_rows(tables))
 
   janitor::clean_names(raw_tbl)
 }
+
+#' Load each CSV in a directory as one row of a nested tibble.
+#'
+#' @param path Directory to search for `*.csv` files.
+#'
+#' @return A tibble with one row per file: `fy` (first run of digits in the
+#'   file name, as an integer), `csv` (the file name) and `tbl` (a
+#'   list-column holding the parsed file).
+load_tbl_nested <- function(path) {
+  tibble(
+    # named `csv_path` so the column does not shadow the `path` argument
+    csv_path = fs::dir_ls(path, glob = "*.csv"),
+    csv = fs::path_file(csv_path),
+    fy = as.integer(stringr::str_extract(csv, "[0-9]+")),
+    tbl = purrr::map(csv_path, \(x) readr::read_csv(x))
+  ) |>
+    select(fy, csv, tbl)
+}
diff --git a/data-raw/projects.R b/data-raw/projects.R
@@ -20,8 +20,8 @@ col_types <- cols_only(
   ORG_STATE = col_character(),
   ORG_DISTRICT = col_integer(),
   PI_IDS = col_character(),
-  PROJECT_START = col_date(format = "%m/%d/%Y"),
-  PROJECT_END = col_date(format = "%m/%d/%Y"),
+  PROJECT_START = col_date(format = "%Y-%m-%d"),
+  PROJECT_END = col_date(format = "%Y-%m-%d"),
   STUDY_SECTION = col_character(),
   SUFFIX = col_character(),
   TOTAL_COST = col_double()

diff --git a/data-raw/projects_min.R b/data-raw/projects_min.R
@@ -0,0 +1,103 @@
+## projects_min table
+
+# where do the "PRJFUNDING" files come from? TBD
+
+# main data are in the "PRJ_C" files.
+#
+# the format of the projects tables is different for each year:
+#
+# - 1985 - 2005: 42 columns
+# - 2006 - 2024: 46 columns
+#
+# The later years had direct and indirect costs added.
+library(tidyverse)
+library(here)
+library(janitor)
+
+data_dir <- here("data-raw/downloads/projects")
+
+col_types <- cols_only(
+  APPLICATION_ID = col_double(),
+  ACTIVITY = col_character(),
+  ADMINISTERING_IC = col_character(),
+  APPLICATION_TYPE = col_double(),
+  ARRA_FUNDED = col_character(),
+  CORE_PROJECT_NUM = col_character(),
+  # FOA_NUMBER = col_character(),
+  FUNDING_MECHANISM = col_factor(),
+  FY = col_factor(),
+  ORG_CITY = col_character(),
+  ORG_DUNS = col_character(), # leading zeros so no int
+  ORG_NAME = col_character(),
+  ORG_STATE = col_character(),
+  ORG_DISTRICT = col_integer(),
+  PI_IDS = col_character(),
+  PROJECT_START = col_date(format = "%Y-%m-%d"),
+  PROJECT_END = col_date(format = "%Y-%m-%d"),
+  STUDY_SECTION = col_character(),
+  SUFFIX = col_character(),
+  TOTAL_COST = col_double(),
+  DIRECT_COST_AMT = col_double(),
+  INDIRECT_COST_AMT = col_double()
+)
+
+# Only the 2006+ files are read, since earlier years lack the direct and
+# indirect cost columns. The year is the first run of digits in each file
+# name; it is parsed to an integer so the cutoff below is a numeric
+# comparison rather than a string comparison against "2006".
+projects_min <-
+  tibble(
+    path = fs::dir_ls(data_dir, glob = "*PRJ_C_*.csv"),
+    csv = fs::path_file(path),
+    fy = as.integer(stringr::str_extract(csv, "[0-9]+"))
+  ) |>
+    filter(fy >= 2006L) |>
+    mutate(
+      tbl = purrr::map(path, \(x) read_csv(x, col_types = col_types))
+    ) |>
+    # the file-name `fy` is dropped here; the `fy` renamed below is the FY
+    # column read from the CSVs
+    select(tbl) |>
+    unnest(tbl) |>
+    janitor::clean_names() |>
+    select(-pi_ids) |>
+    rename(
+      project_num = core_project_num,
+      fiscal_year = fy,
+      institute = administering_ic
+    ) |>
+    filter(!is.na(project_num) & !is.na(total_cost)) |>
+    filter(!grepl("-", project_num)) |>
+    # NOTE(review): `nih_institutes` is not defined in this script; confirm
+    # it is in scope (e.g. from R/vars.R) before running.
+    filter(institute %in% nih_institutes) |>
+    mutate(
+      across(
+        c(
+          project_num,
+          institute,
+          activity,
+          application_type,
+          arra_funded,
+          study_section,
+          suffix,
+          org_state,
+          org_district
+        ),
+        as.factor
+      ),
+      fy_cost = as.double(total_cost)
+    ) |>
+    select(
+      activity,
+      institute,
+      project_num,
+      fiscal_year,
+      project_start,
+      project_end,
+      starts_with("org"),
+      ends_with("amt"),
+      fy_cost
+    )
+
+use_data(projects_min, compress = "xz", overwrite = TRUE)
diff --git a/data/projects.rda b/data/projects.rda
diff --git a/data/projects_min.rda b/data/projects_min.rda
diff --git a/man/projects_min.Rd b/man/projects_min.Rd
diff --git a/vignettes/nihexporter.Rmd b/vignettes/nihexporter.Rmd
@@ -476,7 +476,7 @@ ggplot(
 
 ## Duration
 
-`nihexporter` exposes `project.start` and `project.end`, which we can use to examine the duration of projects. For example, we can identify the longest running R01 grants.
+The `projects` table contains `project_start` and `project_end`, which we can use to examine the duration of projects. For example, we can identify the longest running R01 grants.
 
 ```{r}
 #| label: grant_stamina