Skip to content

Commit 08a4bf2

Browse files
author
Alex Jakubow
committed
initial commit
0 parents  commit 08a4bf2

13 files changed

+2464
-0
lines changed

.Rprofile

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Run the script at renv/activate.R whenever R starts in this project.
# NOTE(review): presumably the renv-generated bootstrap that points the session
# at the project library; renv may expect this line verbatim -- confirm before
# editing it.
source("renv/activate.R")

.gitignore

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
.Rproj.user
2+
.Rhistory
3+
.RData
4+
.Ruserdata
5+
6+
# Directories
7+
data/**
8+
_targets/**

R/functions.R

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#' Download USSC Commission datafile archives
#'
#' Scrapes the U.S. Sentencing Commission datafiles page for `.zip` links that
#' match the requested file `type` and downloads each one into `outdir`.
#'
#' @param type Kind of datafile to fetch. Only `"individual"` (archives whose
#'   link contains `"opafy"`) is currently supported.
#' @param outdir Directory the archives are written to; created (recursively)
#'   when it does not exist.
#' @return A list with one element per link, each the `safely()` result
#'   (`result` / `error`) of the corresponding `download.file()` call. An empty
#'   list (with a warning) when no matching links are found.
download_source <- function(type = "individual",
                            outdir) {
  # Setup for scrape
  home <- "https://www.ussc.gov"
  url <- paste0(home, "/research/datafiles/commission-datafiles")
  all_links_css <- ".subContainer a"

  # Resolve the link pattern up front so an unsupported `type` fails with a
  # clear message instead of a late "object 'regex_str' not found".
  regex_str <- switch(type,
    individual = "opafy",
    stop("Unsupported `type`: ", type, call. = FALSE)
  )

  # Determine links
  hrefs <- read_html(url) %>%
    html_elements(all_links_css) %>%
    html_attr("href")
  hrefs <- hrefs[grepl("\\.zip$", hrefs) & grepl(regex_str, hrefs)]

  # paste0(home, character(0)) would yield the bare home URL, so stop here
  # rather than "downloading" the site's front page.
  if (length(hrefs) == 0) {
    warning("No matching .zip links found at ", url, call. = FALSE)
    return(list())
  }

  # Only prefix the host onto site-relative hrefs
  links <- ifelse(grepl("^https?://", hrefs), hrefs, paste0(home, hrefs))

  # Create output directory if needed
  if (!dir.exists(outdir)) {
    dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
  }

  # download.file() honours getOption("timeout") (60 s by default), which is
  # too short for these archives and leaves truncated files. Raise it for the
  # duration of this call and restore the caller's setting afterwards.
  old_opts <- options(timeout = max(600, getOption("timeout", 60)))
  on.exit(options(old_opts), add = TRUE)

  # Download
  safe_dl <- safely(download.file)
  map(
    .x = links,
    .f = function(link) {
      # Pause between requests to stay polite to the server
      Sys.sleep(10)
      safe_dl(
        link,
        destfile = file.path(outdir, basename(link)),
        mode = "wb" # zip archives are binary
      )
    },
    .progress = TRUE
  )
}

_setup.R

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# One-time project bootstrap: create the R/ folder with a functions stub,
# install the packages the pipeline uses, record them with renv, and scaffold
# the targets pipeline.

# Directory setup
dir.create("R", showWarnings = FALSE)
# file.create() truncates a file that already exists, so only create the stub
# when it is absent; otherwise re-running this script would wipe the functions.
if (!file.exists("R/functions.R")) {
  file.create("R/functions.R")
}

# Install
install.packages(c("targets", "usethis", "visNetwork"))
install.packages(c("tidyverse", "rvest"))
renv::snapshot()

# Initialize pipeline
targets::use_targets()

_targets.R

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Pipeline definition script for the targets package
library(targets)

# Scheduler for tar_make_clustermq() (okay to leave alone)
options(clustermq.scheduler = "multicore")

# Defaults shared by all targets
tar_option_set(
  format = "rds", # default storage format
  packages = c("tidyverse", "rvest") # packages the targets need to run
)

# Load the custom functions kept in the R/ folder
tar_source()

# Pipeline: the script must end with the list of targets
list(
  tar_target(source_files, download_source(outdir = "data/01_source"))
)

_targets/.gitignore

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
*
2+
!.gitignore
3+
!meta
4+
meta/*
5+
!meta/meta

_targets/meta/meta

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
name|type|data|command|depend|seed|path|time|size|bytes|format|repository|iteration|parent|children|seconds|warnings|error
2+
download_source|function|34e74197f3ce84c5|||||||||||||||
3+
source_files|stem|a1809566a143e50c|ac3c124c7b3257b9|318b40e636bcae4b|1238641790||t19411.732262596s|21cb808e5a67bfdb|302|rds|local|vector|||942.152|downloaded length 28277929 reported length 34311337. URL httpswww.ussc.govsitesdefaultfileszipopafy19nid.zip Timeout of 60 seconds was reached. downloaded length 30506153 reported length 37178129. URL httpswww.ussc.govsitesdefaultfileszipopafy18nid.zip Timeout of 60 seconds was reached. downloaded length 29965481 reported length 33885462. URL httpswww.ussc.govsitesdefaultfileszipopafy17nid.zip Timeout of 60 seconds was reached. downloaded length 27311273 reported length 34369031. URL httpswww.ussc.govsitesdefaultfileszipopafy16nid.zip Timeout of 60 seconds was reached. downloaded length 32914601 reported length 35341457. URL httpswww.ussc.govsitesdefaultfileszipopafy14nid.zip Timeout of 60 seconds was reached|

0 commit comments

Comments
 (0)