Merge pull request #17 from kuriwaki/add-kos

Add kos and cd_info_long
kuriwaki · Oct 7, 2024 · ac78c94 · ac78c94
2 parents 6052f73 + c3590db
commit ac78c94
Show file tree

Hide file tree

Showing 15 changed files with 471 additions and 56 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,15 +1,18 @@
 Package: ccesMRPprep
 Type: Package
 Title: Functions and Data to Prepare CCES data for MRP
-Version: 0.1.12
+Version: 0.1.13
 Authors@R: 
    c(person(given = "Shiro",
              family = "Kuriwaki",
              role = c("aut", "cre", "cph"),
              email = "[email protected]",
              comment = c(ORCID = "0000-0002-5687-2647")),
      person(given = "Daily Kos Elections",
-             role = "dtc"))
+             role = "dtc"),
+     person(given = "Miranda",
+             family = "Selin",
+             role = "ctb"))
 Description: This provides data loading, processing, and formatting functions for
    a particular task: using CCES data for Multilevel Regression Post-stratification.
    Model fitting and visualization of MRP itself is handled elsewhere. This package

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,6 @@
+# ccesMRPprep 0.1.13
+
+* Add Daily Kos congressional district data for the 2006-2010, 2012-2014, 2022, and 2024 district lines
 
 # cccesMRPprep 0.1.12
 

diff --git a/R/datadoc_cd-info-long.R b/R/datadoc_cd-info-long.R
@@ -0,0 +1,38 @@
+#' Congressional District level information - long version
+#'
+#' `cd_info_long` provides a "long" version of the yearly `cd_info_20**` datasets.
+#'
+#'
+#' @format `cd_info_long` is a dataframe with `r nrow(cd_info_long)` rows,
+#'  covering the maps of `r length(unique(cd_info_long$lines))` election years
+#'  (`r paste0(unique(cd_info_long$lines), collapse = ', ')`) for each of the 435 congressional districts.
+#'   \describe{
+#'    \item{`lines`}{Is the year corresponding to the geography (district line). For example, `lines = 2008` and `cd = "AL-01"`
+#'     indicates that the row is representing AL-01's geography as used in the 2008 election.}
+#'    \item{`cd`}{Is the CD corresponding to the year of the geography (district line). Note that districts can change drastically
+#'     by redistricting; a state's "first congressional district" from one `lines` can cover a different area
+#'     than the same first congressional district for another.}
+#'    \item{`elec`}{Is the year of the election for the presidential election data that follows}
+#'    \item{`party`, `candidate`}{Define the presidential candidate that corresponds to the `elec`
+#'     (which may not be the same as `lines`). For example, `lines = 2012, cd = AL-01` combined with
+#'      `elec = 2008` represents the 2008 election results in the newly redistricted (2012) AL-01 geography}
+#'    \item{`pct`}{Is the two-party vote share of the candidate}
+#'    \item{`presvotes_total`}{Is the total number of votes for President in that CD}
+#'    }
+#'
+#' @examples
+#'  library(dplyr)
+#'
+#'  # get only data for proximate years
+#'  cd_info_long |> filter((elec == lines) | (elec + 2 == lines))
+#'
+#'  # this subset returns exactly 2 * 435 districts per cycle:
+#'  cd_info_long |> filter((elec == lines) | (elec + 2 == lines)) |> count(lines, party)
+#'
+#' # this will show where the districts lines changed between 2022 and 2024
+#' # (same election, same candidate, different map)
+#' cd_info_long |>
+#'  filter(lines %in% c(2022, 2024), elec == 2020, candidate == "biden") |>
+#'  arrange(cd, lines)
+#'
+"cd_info_long"
diff --git a/R/datadoc_cd-info.R b/R/datadoc_cd-info.R
@@ -1,70 +1,122 @@
-#' Congressional District level information by Daily Kos
+#' Congressional District level information by The Downballot
 #'
 #'
-#' Some of the most consequential variables to include in MRP are at the
+#' Some of the most consequential variables to include in MRP are measured at the
 #' district-level. We include one such data for congressional districts. All data
-#' is collected by Daily Kos. `cd_info_2018` is data on 2018 boundaries, `cd_info_2016`
-#' uses 2016 boundaries and `cd_info_2020` uses 2020 (but with place descriptions
-#' currently at 2016).
+#' is collected by The Downballot.
+#'
+#' @details `cd_info_2008` is data on boundaries used in 2006, 2008, and 2010;
+#' `cd_info_2012` is data on boundaries used in 2012 and 2014; `cd_info_2016`
+#' uses 2016 boundaries; `cd_info_2018` is data on 2018 boundaries;
+#' `cd_info_2020` uses 2020 boundaries; `cd_info_2022` is
+#' data on 2022 boundaries; `cd_info_2024` uses 2024 boundaries.
+#'
+#' District lines change before and after each decennial Census, e.g. 2010 vs. 2012 and
+#'  2020 vs. 2022. There is also change in district lines due to court interventions.
+#'
+#'  * Between the 2022 and 2024 data, 5 states changed their
+#'    districts: AL, GA, LA, NC, and NY. In these changes, AL-02, GA-06, and LA-06 each became
+#'    a majority-minority district; the NC state supreme court plan in 2022 expired; and
+#'    the NY court struck down the initial 2022 NY plan.
+#'   * Between the 2018 and 2020 data, NC changed their districts
+#'   * Between the 2016 and 2018 data, PA changed their districts
+#'   * Between the 2012-14 and 2016 data, FL, NC, VA changed their districts
+#'
+#' These can be seen by, for example, the following code:
+#'   `cd_info_2022 |> left_join(cd_info_2024, by = "cd") |> select(cd, matches("trump")) |> mutate(diff = abs(pct_trump - pct_trump20))`
 #'
 #' @format Each `cd_info_20**` is a dataframe with the `r nrow(cd_info_2018)` Congressional
 #'  Districts, one row per cd.
 #'  \describe{
-#'    \item{year}{The year for the district line. A congressional district's
+#'    \item{`year`}{The year for the district line. A congressional district's
 #'    actual geography can change year to year, and significantly so in different
 #'    redistricting cycles. Lines try to get the contemporaneous district map,
 #'    so that cd_info_2016 uses 2016 maps and cd_info_2020 uses 2020 maps.
 #'    However, this work relies on the hard work of assembling precinct results by Daily Kos.}
-#'    \item{cd}{District code. The formatting corresponds to the CCES cumulative
+#'    \item{`cd`}{District code. The formatting corresponds to the CCES cumulative
 #'    coding of \code{cd}: a two-letter abbreviation for the state followed by
 #'    a dash, and the district number padded with zeros to the left to be of length
-#'    2. At-large districts like Delaware are given a "-01" for the district number.}
-#'    \item{presvotes_total}{In presidential years, the total number of votes cast for
+#'    2. At-large districts like Delaware are given a "-01" for the district number. See `to_cd()`}
+#'    \item{`presvotes_total`}{In presidential years, the total number of votes cast for
 #'     the office of President that year. Taken from Daily Kos estimates from precinct results.}
-#'    \item{pct_trump, pct_romney, pct_mccain}{The two-party voteshare of Republican
+#'    \item{`pct_trump`, `pct_romney`, `pct_mccain`}{The two-party voteshare of Republican
 #'    presidential candidates in that district for the given year. E.g. the
-#'    \code{pct_mccain} data when \code{cd_year == 2018} represents the percent
-#'    of the vote by McCain in 2008 for that district _under 2018 lines._
-#'    The Trump value is for 2016 for `cd_info_2018` and ``cd_info_2020` but not
-#'    for 2020 where we use Trump's 2020 vote against Biden and denote as
-#'    `pct_trump16` the 2016 result.}
-#'    \item{dailykos_name}{The unique descriptive name for the district code in
-#'    2018 given by Daily Kos. Some edits are made for changing district. See
+#'    \code{pct_mccain} data for `cd_info_2018` represents the percent
+#'    of the vote by McCain in 2008 for that district _under 2018 lines._\cr
+#'    `pct_trump` denotes the 2016 election for `cd_info_2018` and `cd_info_2016`.\cr
+#'    `pct_trump` denotes the 2020 election for `cd_info_2020` and `cd_info_2022`;
+#'    `cd_info_2024` stores the 2020 result as `pct_trump20`.\cr
+#'    `pct_trump16` denotes the 2016 result for `cd_info_2020`.}
+#'    \item{`dailykos_name`}{The unique descriptive name for the district code in
+#'    2018 given by Daily Kos (later renamed to The Downballot). Some edits are made for changing district. See
 #'    Source for full citation.}
-#'    \item{largest_place}{The largest place in the district code in 2018 given by Daily Kos. Multiple districts may
-#'    have the largest place.}
+#'    \item{`largest_place`}{The largest place in the district code in 2018 given by Daily Kos. Multiple districts may
+#'    have the same largest place.}
 #'  }
 #'
+#' @seealso cd_info_long
 #'
 #' @source
+#'   The Downballot (formerly Daily Kos Elections), \url{https://www.the-downballot.com/p/data}
+#'
 #'   The Daily Kos Elections naming guide to the nation's congressional districts.
 #'   \url{https://bit.ly/2XsFI5W}
 #'
-#'   Daily Kos, "2008, 2012, & 2016 results for districts used in 2018."
-#'   \url{https://bit.ly/3DRhPcj}
-
-#'   DailyDaily Kos Elections 2012, 2016 & 2020 presidential election results for congressional districts in 2020"
-#'   \url{https://bit.ly/3bXtAPB}
+#'   Daily Kos, "2008 results for districts used in **2006, 2008, 2010**"
+#'   \url{https://bit.ly/4entUrV}
+#'
+#'   Daily Kos, "2008, 2012 results for districts used in **2012, 2014**"
+#'   \url{https://bit.ly/3N4PDZK}
+#'
+#'   Daily Kos, "2008, 2012, & 2016 results for districts used in **2018**." \url{https://bit.ly/3bXtAPB}
+#'
+#'   Daily Kos, "2012, 2016 & 2020 presidential election results for congressional districts in **2020**", \url{https://bit.ly/3DRhPcj}
+#'
+#'   Daily Kos, 2020 presidential election results by later congressional districts:
+#'
+#'   * __2022__ congressional districts: \url{https://bit.ly/4gLYnBK}
+#'   * __2024__ congressional districts: \url{https://bit.ly/47KTvZw}
+#'
+#'   Daily Kos, congressional district geography and most populous places:
+#'
+#'   * 119th Congress: \url{https://bit.ly/geography_119}\cr
+#'   * 118th Congress: \url{https://bit.ly/geography_118}
 #'
 #'   Pennsylvania 2016 CD names are named by Shiro Kuriwaki and Lara Putnam.
 #'
-#'   Also see Cha, Jeremiah; Kuriwaki, Shiro; Snyder, James M. Jr., 2021,
+#'   Also see Cha, Kuriwaki, and Snyder, 2021,
 #'    "Candidates in American General Elections", https://doi.org/10.7910/DVN/DGDRDT,
 #'    Harvard Dataverse.
 #'
-#' @importFrom tibble tibble
 #'
 #' @examples
 #' head(cd_info_2018)
 #' head(elec_NY)
 "cd_info_2018"
 
+#' @rdname cd_info_2018
+#' @format NULL
+"cd_info_2008"
+
+#' @rdname cd_info_2018
+#' @format NULL
+"cd_info_2012"
 
 #' @rdname cd_info_2018
+#' @format NULL
 "cd_info_2016"
 
 #' @rdname cd_info_2018
+#' @format NULL
 "cd_info_2020"
 
 #' @rdname cd_info_2018
+#' @format NULL
+"cd_info_2022"
+
+#' @rdname cd_info_2018
+#' @format NULL
+"cd_info_2024"
+
+#' @rdname cd_info_2018
+#' @format NULL
 "elec_NY"
diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -31,6 +31,7 @@ reference:
   - cc18_samp
   - cc18_NY
   - cd_info_2018
+  - cd_info_long
   - elec_NY
   - questions_samp
 - title: Data recoding and cleaning

diff --git a/data-raw/create_cd-info-long.R b/data-raw/create_cd-info-long.R
@@ -0,0 +1,104 @@
+library(tidyverse)
+
+# Lookup table: presidential candidate (lower-case surname) -> party ("D"/"R").
+pres_party <- tibble::tibble(
+  name = c(
+    "harris", "trump", "biden", "clinton",
+    "obama", "romney", "mccain"
+  ),
+  party = c(
+    "D", "R", "D", "D",
+    "D", "R", "R"
+  )
+)
+
+# Candidate-by-election lookup: two rows (one per nominee) for each
+# presidential election year `elec`.
+pres_names_long <- tibble(
+  name = c(
+    "harris", "trump",
+    "biden", "trump",
+    "trump", "clinton",
+    "obama", "romney",
+    "obama", "mccain"
+  ),
+  elec = c(
+    2024, 2024,
+    2020, 2020,
+    2016, 2016,
+    2012, 2012,
+    2008, 2008
+  )
+)
+
+# One row per election year, with that year's nominees spread into the
+# columns `party_D` and `party_R`.
+pres_names_wide <- pres_names_long |>
+  # Name the join key explicitly rather than relying on the implicit natural
+  # join, which prints a "Joining with ..." message and would silently change
+  # if the two lookup tables ever shared another column.
+  left_join(pres_party, by = "name") |>
+  pivot_wider(
+    id_cols = elec,
+    names_from = party,
+    names_prefix = "party_",
+    values_from = name
+  )
+
+# Stack every yearly district table. The 2008 and 2012 tables are stacked a
+# second time with `year` overwritten to 2010 and 2014, respectively.
+cd_info_all <- bind_rows(
+  cd_info_2008,
+  cd_info_2008 |> mutate(year = 2010),
+  cd_info_2012,
+  cd_info_2012 |> mutate(year = 2014),
+  cd_info_2016,
+  cd_info_2018,
+  cd_info_2020,
+  cd_info_2022,
+  cd_info_2024
+)
+
+# Republican vote shares in long form: one row per district line (`lines`),
+# district (`cd`), and presidential election (`elec`).
+R_pct <- cd_info_all |>
+  select(lines = year, cd, starts_with("pct_")) |>
+  pivot_longer(
+    matches("pct_"),
+    names_prefix = "pct_",
+    values_to = "pct",
+    values_drop_na = TRUE
+  ) |>
+  mutate(
+    # Translate the candidate label into the election year it refers to. A bare
+    # "trump" is resolved through the year of the district lines; labels that
+    # match no rule are left as NA.
+    elec = case_when(
+      name == "mccain" ~ 2008,
+      name == "romney" ~ 2012,
+      name == "trump16" ~ 2016,
+      name == "trump20" ~ 2020,
+      name == "trump" & lines <= 2018 ~ 2016,
+      name == "trump" & between(lines, 2020, 2022) ~ 2020
+    ),
+    .after = cd
+  ) |>
+  # Kept as a separate step so that `party` is appended as the last column.
+  mutate(
+    name = str_remove(name, "(16|20)"),
+    party = "R"
+  )
+
+
+# Democratic rows derived from the Republican ones: same lines/cd/elec, the
+# Democratic nominee for that election, and the complement of the R share.
+D_pct <- R_pct |>
+  left_join(pres_names_wide, by = "elec") |>
+  mutate(
+    party = "D",
+    name = party_D,
+    pct = 1 - pct
+  ) |>
+  select(lines, cd, elec, party, name, pct)
+
+# Total presidential votes, reshaped to the same lines/cd/elec keys as the
+# vote shares.
+Ns <- cd_info_all |>
+  select(lines = year, cd, matches("total")) |>
+  pivot_longer(
+    matches("presvotes_total"),
+    names_prefix = "presvotes_",
+    values_to = "presvotes_total",
+    values_drop_na = TRUE
+  ) |>
+  mutate(
+    elec = case_when(
+      # an explicit "20" suffix always means the 2020 election
+      name == "total20" ~ 2020,
+      # lines from a presidential year carry that year's total
+      lines %in% c(2008, 2012, 2016, 2020) ~ lines,
+      # lines from a midterm year carry the total from two years earlier
+      lines %in% c(2010, 2014, 2018, 2022) ~ lines - 2
+    ),
+    .after = cd
+  ) |>
+  select(!name)
+
+# Combine both parties' shares, attach the vote totals, sort, and save the
+# result as package data.
+cd_info_long <- bind_rows(D_pct, R_pct) |>
+  rename(candidate = name) |>
+  tidylog::left_join(Ns, by = c("lines", "cd", "elec")) |>
+  arrange(lines, elec, cd, party)
+
+usethis::use_data(cd_info_long, overwrite = TRUE)
diff --git a/data-raw/create_dist-level-data_2006_2012.R b/data-raw/create_dist-level-data_2006_2012.R
@@ -0,0 +1,50 @@
+library(tidyverse)
+library(googlesheets4)
+
+# Authenticate with Google Sheets if needed (the call is left disabled here)
+# gs4_auth()
+
+# Source Google Sheets: url_2008 is read for cd_info_2008, url_2012 for cd_info_2012
+url_2008 <- "https://docs.google.com/spreadsheets/d/1l7W130tPRF6dQ4JoqlQJSJexnEg4rZct-7kwDXqgoLE/edit?gid=429358610#gid=429358610"
+url_2012 <- "https://docs.google.com/spreadsheets/d/1xn6nCNM97oFDZ4M-HQgoUT3X4paOiSDsRMSuxbaOBdg/edit?gid=0#gid=0"
+
+# Read data from Google Sheets
+
+# 2008, sheet 1: district code and the McCain column
+cd_names_2008 <- read_sheet(url_2008, sheet = 1, range = "A2:E", col_names = TRUE) |>
+  mutate(year = 2008) |>
+  select(year, cd = CD, mccain = McCain)
+
+# 2008, sheet 2: the `Total` column, kept as `presvotes_total`.
+# The `McCain%` column of this sheet is not used; the McCain column from
+# sheet 1 replaces it.
+voting_info_2008 <- read_sheet(url_2008, sheet = 2, range = "A2:I", col_names = TRUE) |>
+  select(cd = CD, presvotes_total = "Total")
+
+# 2012, sheet 1: district codes only
+cd_names_2012 <- read_sheet(url_2012, sheet = 1, range = "A2:G", col_names = TRUE) |>
+  mutate(year = 2012) |>
+  select(year, cd = CD)
+
+# 2012, sheet 2: vote total plus the Romney and McCain shares
+voting_info_2012 <- read_sheet(url_2012, sheet = 2, range = "A2:O", col_names = TRUE) |>
+  select(
+    cd = CD,
+    # NOTE(review): chosen by position (7th column), not by name -- confirm
+    # against the sheet layout
+    presvotes_total = 7,
+    pct_romney = "Romney%",
+    pct_mccain = "McCain%"
+  )
+
+# Join sheet 1 and sheet 2 of each dataset on the district code.
+cd_info_2008 <- left_join(cd_names_2008, voting_info_2008, by = "cd") |>
+  mutate(
+    # at-large districts ("-AL") are recoded to district number "-01"
+    cd = str_replace(cd, "-AL$", "-01"),
+    # presumably the sheet reports McCain in percentage points; verify
+    pct_mccain = mccain * 0.01
+  ) |>
+  select(year, cd, pct_mccain, presvotes_total)
+
+cd_info_2012 <- left_join(cd_names_2012, voting_info_2012, by = "cd") |>
+  mutate(cd = str_replace(cd, "-AL$", "-01"))
+
+# Save both tables as package data
+usethis::use_data(cd_info_2008, overwrite = TRUE)
+usethis::use_data(cd_info_2012, overwrite = TRUE)