From c49764ca53a182b8a2622470946163c02c30f6e3 Mon Sep 17 00:00:00 2001 From: benjaminguinaudeau Date: Thu, 12 Mar 2020 09:16:25 -0400 Subject: [PATCH 1/2] =?UTF-8?q?verbesserungs=20vorschl=C3=A4ge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 01_Scrape_Austrian_ben.Rmd | 91 +++++++++++++++++++++++++++ utils.R | 122 +++++++++++++++++++++++++++++++++++++ 2 files changed, 213 insertions(+) create mode 100644 01_Scrape_Austrian_ben.Rmd create mode 100644 utils.R diff --git a/01_Scrape_Austrian_ben.Rmd b/01_Scrape_Austrian_ben.Rmd new file mode 100644 index 0000000..401dc10 --- /dev/null +++ b/01_Scrape_Austrian_ben.Rmd @@ -0,0 +1,91 @@ +--- +title: "Austrian Proposals and Bills" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +# Packages + +```{r} +pacman::p_load(tidyverse, furrr, rvest, ggthemes, lubridate, rio, haven) + +source("utils.R") + +``` + +# Getting Index pages for XX - XXV legislative period + +```{r} +# Index page for lp XX - XXV +index_urls("XX") + +# Scraping Metadata of indexpages and page URLs +# scrape_indexpage(index_urls("XXV") +# scrape_indexpage(index_urls("XXIV") +# scrape_indexpage(index_urls("XXIII") +# scrape_indexpage(index_urls("XXII") +# scrape_indexpage(index_urls("XXI") +# scrape_indexpage(index_urls("XX") + +all_indexpagedata <- dir("data", pattern = "_X", full.names = T) %>% + map_dfr(readr::read_csv) %>% + select(-X1) + +readr::write_csv(all_indexpagedata, "data/indexpagedata.csv") + +``` +# Main + +## Scraping from proposal pages + +```{r} +# reading in indexpagedata and preparing variables (as.character for strings) + +indexpagedata <- readr::read_csv(file = "data/indexpagedata.csv") %>% + mutate_all(as.character) + +proposalpagedata <- indexpagedata %>% + mutate(.id = 1:n()) %>% + # sample_n(10) %>% + split(1:nrow(.)) %>% + # I put the scraping code in a function, which is sourced from the script utils.R + # Map is more efficient than for loop + # It take the providedfunction and apply it to each element of the input + map_dfr(get_meta_info) + +data <- indexpagedata %>% + left_join(proposalpagedata) %>% + mutate(proposal_download = paste0("www.parlament.gv.at", proposal_download), + bill_download = bill_link %>% + str_replace("^\\/(?<=PAKT)", "https://www.parlament.gv.at/") %>% + map(~{ + if(str_detect(.x, "PAKT") != 1) {return(NA_character_)} + wait() + + .x %>% + read_html %>% + html_nodes("ul.fliesstext li a") %>% + html_attr("href") %>% + str_subset("\\d.html") %>% + .[1] %>% + paste0("www.parlament.gv.at", .) + })) + + + +data %>% + split(1:nrow(.)) %>% + walk(~{ + download_html(.x$proposal_download , file = paste0("data/proposal/", .x$proposal_filename, ".html")) + wait() + download_html(.x$bill_download , file = paste0("data/bill/", .x$bill_id, ".html")) + wait() + }) + + +``` + + diff --git a/utils.R b/utils.R new file mode 100644 index 0000000..4fbab3a --- /dev/null +++ b/utils.R @@ -0,0 +1,122 @@ +wait <- function(wait = T){ + if(wait){ + Sys.sleep(runif(1, 1, 1.2)) + cat(paste0(i, " - waited for ", round(t, 2), "s. ")) + } +} + +scrape_indexpage <- function(url) { # URL = XXV, XXIV, XXIII, ... + + #read in page + indexpage <- read_html(url) + + proposalpages <- indexpage %>% + html_nodes("a.link-indicator") %>% + html_attr("href") %>% + .[seq(1, length(.), by = 2)] %>% + paste0("https://www.parlament.gv.at", .) + + # get dates of proposals (website upload?), type (RV, A, VOLKBG, BRA), descriptive title, + indexpagedata <- indexpage %>% + html_nodes("span.table-responsive__inner") %>% + html_text(trim = T) %>% # get table inner text and trim + matrix(ncol = 5, byrow = T) %>% # put into matrix, then dataframe (tibble) + .[, -5] %>% + as_tibble(.name_repair = "minimal") %>% + setNames(c("web_date", "type", "desc_title", "proposal_id")) + + indexpagedata <- indexpagedata %>% + mutate(proposal_link = proposalpages, + period = url %>% + str_extract("GP=\\w+") %>% + str_sub(., start = 4), + status = indexpage %>% + html_nodes("img.status") %>% + html_attr("src") %>% + .[seq(1, length(.), by = 2)] %>% + str_extract("[1-9]") %>% + as.character(), + proposal_filename = proposal_id %>% + gsub("[^A-Za-z0-9]", "" , .) %>% + paste(period, ., sep = "_")) + + #exporting to appropriatly named .csv + write_csv(indexpagedata, file = paste0("data/df_", indexpagedata$period[1], ".csv")) + +} + +fix_obj <- function(obj, pattern = NA_character_){if(length(obj) == 0) return(pattern) else return(obj)} + +get_meta_info <- function(.x){ + page <- read_html(.x$proposal_link) + + bill_link <- page %>% + html_nodes("div.floatLeft p a") %>% + html_attr("href") %>% + str_subset("BNR_\\d+") %>% + str_replace("\\/pls.+[=]", "https://www.parlament.gv.at") %>% + fix_obj + + bill_id <- bill_link %>% + str_extract("XX.+BNR_[0-9]*") %>% + str_replace("\\/B.+_00", "_BNR") %>% + fix_obj + + parties <- page %>% + html_node("div.floatLeft p") %>% + html_text() %>% + str_to_lower() %>% + str_extract("(?<=daf.r\\:).*") %>% + str_split(", dagegen\\: ") %>% + .[[1]] %>% + str_split(", ") + + resolution_NR <- tibble(party = unlist(parties)) %>% + mutate(pro = party %in% parties[[1]]) %>% + list %>% + fix_obj(pattern = list()) + + + proposal_download <- case_when( + .x$type %in% c("A", "RV", "GABR") ~ {page %>% + html_nodes("ul.fliesstext li a") %>% + html_attr("href") %>% + str_subset("\\d.html") %>% + .[1]}, + .x$type %in% c("BUA") ~ {page %>% + html_nodes("ul.fliesstext li a") %>% + html_attr("href") %>% + str_subset("\\d.html") %>% .[2]} + ) %>% + fix_obj + + iniator <- page %>% + html_nodes("div.c_2 p") %>% + .[2] %>% + html_text(trim = T) %>% + str_squish %>% + str_extract("(?<=\\:)\\s*[A-Z].+") %>% + str_trim() %>% + str_subset("[0-9]{3}", negate = T) %>% + str_subset("\\s{3,}", negate = T) %>% + fix_obj + + dates <- page %>% + html_nodes("table.table-nonresponsive") %>% + html_nodes("tr.historyShowAlways") %>% + html_text(trim = T) %>% + str_extract("\\d{2}.\\d{2}.\\d{4}") %>% + discard(is.na) + + processes <- page %>% + html_nodes("a.historieOverviewToggle") %>% + html_text() + + parl_verfahren <- tibble(date = dates, + process = processes) %>% + list %>% + fix_obj(pattern = list()) + + return(tibble( .id = .x$.id, bill_link, bill_id, proposal_download, iniator, resolution_NR, parl_verfahren)) + +} \ No newline at end of file From c7bcb89831801f274a08348681288baf8cde24d6 Mon Sep 17 00:00:00 2001 From: flixi67 Date: Tue, 17 Mar 2020 14:54:53 +0100 Subject: [PATCH 2/2] Update utils.R Initiator scrape fixed, list-columns created differently to accomodate for unequal vectors, wait() --- utils.R | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/utils.R b/utils.R index 4fbab3a..efdcb1d 100644 --- a/utils.R +++ b/utils.R @@ -10,12 +10,6 @@ scrape_indexpage <- function(url) { # URL = XXV, XXIV, XXIII, ... #read in page indexpage <- read_html(url) - proposalpages <- indexpage %>% - html_nodes("a.link-indicator") %>% - html_attr("href") %>% - .[seq(1, length(.), by = 2)] %>% - paste0("https://www.parlament.gv.at", .) - # get dates of proposals (website upload?), type (RV, A, VOLKBG, BRA), descriptive title, indexpagedata <- indexpage %>% html_nodes("span.table-responsive__inner") %>% @@ -26,7 +20,11 @@ scrape_indexpage <- function(url) { # URL = XXV, XXIV, XXIII, ... setNames(c("web_date", "type", "desc_title", "proposal_id")) indexpagedata <- indexpagedata %>% - mutate(proposal_link = proposalpages, + mutate(proposal_link = indexpage %>% + html_nodes("a.link-indicator") %>% + html_attr("href") %>% + .[seq(1, length(.), by = 2)] %>% + paste0("https://www.parlament.gv.at", .), period = url %>% str_extract("GP=\\w+") %>% str_sub(., start = 4), @@ -86,35 +84,36 @@ get_meta_info <- function(.x){ .x$type %in% c("BUA") ~ {page %>% html_nodes("ul.fliesstext li a") %>% html_attr("href") %>% - str_subset("\\d.html") %>% .[2]} - ) %>% + str_subset("\\d.html") %>% + .[2]} + ) %>% fix_obj iniator <- page %>% html_nodes("div.c_2 p") %>% - .[2] %>% + .[-1] %>% html_text(trim = T) %>% str_squish %>% str_extract("(?<=\\:)\\s*[A-Z].+") %>% str_trim() %>% str_subset("[0-9]{3}", negate = T) %>% str_subset("\\s{3,}", negate = T) %>% - fix_obj + list() %>% + fix_obj(pattern = list()) dates <- page %>% html_nodes("table.table-nonresponsive") %>% html_nodes("tr.historyShowAlways") %>% html_text(trim = T) %>% str_extract("\\d{2}.\\d{2}.\\d{4}") %>% - discard(is.na) + discard(is.na) processes <- page %>% html_nodes("a.historieOverviewToggle") %>% html_text() - parl_verfahren <- tibble(date = dates, + parl_verfahren <- list(date = dates, process = processes) %>% - list %>% fix_obj(pattern = list()) return(tibble( .id = .x$.id, bill_link, bill_id, proposal_download, iniator, resolution_NR, parl_verfahren))