Improvement suggestions #1

Open · wants to merge 2 commits into base: master
91 changes: 91 additions & 0 deletions 01_Scrape_Austrian_ben.Rmd
@@ -0,0 +1,91 @@
---
title: "Austrian Proposals and Bills"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Packages

```{r}
pacman::p_load(tidyverse, furrr, rvest, ggthemes, lubridate, rio, haven)

source("utils.R")

```

# Getting index pages for legislative periods XX to XXV

```{r}
# Build the index-page URL for a legislative period (XX to XXV)
index_urls("XX")

# Scrape metadata and page URLs from each index page (uncomment to run):
# scrape_indexpage(index_urls("XXV"))
# scrape_indexpage(index_urls("XXIV"))
# scrape_indexpage(index_urls("XXIII"))
# scrape_indexpage(index_urls("XXII"))
# scrape_indexpage(index_urls("XXI"))
# scrape_indexpage(index_urls("XX"))

# Combine the per-period CSVs written by scrape_indexpage(); drop the row-index column X1
all_indexpagedata <- dir("data", pattern = "_X", full.names = TRUE) %>%
  map_dfr(readr::read_csv) %>%
  select(-X1)

readr::write_csv(all_indexpagedata, "data/indexpagedata.csv")

```
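
The chunk above calls `index_urls()`, which is sourced from `utils.R` but is not part of this diff. Below is a minimal sketch of what it might look like; only the `GP=<period>` query parameter is confirmed by the code (via `str_extract("GP=\\w+")` in `scrape_indexpage()`), so the base URL is an assumption.

```{r, eval=FALSE}
# Hypothetical sketch -- the real index_urls() lives in utils.R.
index_urls <- function(period) {
  # the base path is assumed; only the GP=<period> parameter is
  # confirmed by the parsing code in scrape_indexpage()
  paste0("https://www.parlament.gv.at/PAKT/RGES/index.shtml?GP=", period)
}

index_urls("XXV")
```
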
# Main

## Scraping from proposal pages

```{r}
# Read indexpagedata back in and coerce all columns to character

indexpagedata <- readr::read_csv(file = "data/indexpagedata.csv") %>%
  mutate_all(as.character)

proposalpagedata <- indexpagedata %>%
  mutate(.id = 1:n()) %>%
  # sample_n(10) %>%   # uncomment to test on a random subset first
  split(1:nrow(.)) %>%
  # The scraping code lives in get_meta_info(), which is sourced from utils.R.
  # map_dfr() is more concise than a for loop: it applies the provided function
  # to each one-row data frame and row-binds the results into a single tibble.
  map_dfr(get_meta_info)

data <- indexpagedata %>%
  mutate(.id = 1:n()) %>% # recreate the row id used during scraping, so the join key exists
  left_join(proposalpagedata, by = ".id") %>%
  mutate(proposal_download = paste0("https://www.parlament.gv.at", proposal_download),
         bill_download = bill_link %>%
           # prepend the domain to relative links starting with /PAKT
           str_replace("^/(?=PAKT)", "https://www.parlament.gv.at/") %>%
           map_chr(~{
             if (is.na(.x) || !str_detect(.x, "PAKT")) return(NA_character_)
             wait()

             .x %>%
               read_html() %>%
               html_nodes("ul.fliesstext li a") %>%
               html_attr("href") %>%
               str_subset("\\d\\.html") %>%
               .[1] %>%
               paste0("https://www.parlament.gv.at", .)
           }))



data %>%
  split(1:nrow(.)) %>%
  walk(~{
    download_html(.x$proposal_download, file = paste0("data/proposal/", .x$proposal_filename, ".html"))
    wait()
    # some proposals never became bills; skip missing bill links
    if (!is.na(.x$bill_download)) {
      download_html(.x$bill_download, file = paste0("data/bill/", .x$bill_id, ".html"))
      wait()
    }
  })


```
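
`download_html()` can fail on a flaky connection, so it is worth checking which files actually arrived. A small sketch (not part of the original code) that retries just the rows whose proposal HTML is missing:

```{r, eval=FALSE}
# Sketch: list rows whose proposal HTML never arrived, then retry those
missing <- data %>%
  filter(!file.exists(paste0("data/proposal/", proposal_filename, ".html")))

missing %>%
  split(1:nrow(.)) %>%
  walk(~{
    download_html(.x$proposal_download, file = paste0("data/proposal/", .x$proposal_filename, ".html"))
    wait()
  })
```
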


121 changes: 121 additions & 0 deletions utils.R
@@ -0,0 +1,121 @@
wait <- function(wait = TRUE){
  if(wait){
    t <- runif(1, 1, 1.2) # random pause between 1 and 1.2 seconds to be polite to the server
    Sys.sleep(t)
    cat(paste0("waited for ", round(t, 2), "s. "))
  }
}

scrape_indexpage <- function(url) { # url: index-page URL for one legislative period (XXV, XXIV, XXIII, ...)

#read in page
indexpage <- read_html(url)

  # get the date (presumably the website upload date), type (RV, A, VOLKBG, BRA),
  # descriptive title, and proposal id of each item
indexpagedata <- indexpage %>%
html_nodes("span.table-responsive__inner") %>%
html_text(trim = T) %>% # get table inner text and trim
    matrix(ncol = 5, byrow = T) %>% # reshape into a 5-column matrix, row by row
    .[, -5] %>%                     # drop the unused fifth column
as_tibble(.name_repair = "minimal") %>%
setNames(c("web_date", "type", "desc_title", "proposal_id"))

indexpagedata <- indexpagedata %>%
  mutate(proposal_link = indexpage %>%
           html_nodes("a.link-indicator") %>%
           html_attr("href") %>%
           .[seq(1, length(.), by = 2)] %>% # links are duplicated in the markup; keep every other one
           paste0("https://www.parlament.gv.at", .),
period = url %>%
str_extract("GP=\\w+") %>%
str_sub(., start = 4),
status = indexpage %>%
html_nodes("img.status") %>%
html_attr("src") %>%
.[seq(1, length(.), by = 2)] %>%
str_extract("[1-9]") %>%
as.character(),
proposal_filename = proposal_id %>%
gsub("[^A-Za-z0-9]", "" , .) %>%
paste(period, ., sep = "_"))

  # export to an appropriately named CSV, e.g. data/df_XXV.csv
  write_csv(indexpagedata, file = paste0("data/df_", indexpagedata$period[1], ".csv"))

}
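
# Example usage (a sketch; index_urls() is defined outside this file):
#   c("XX", "XXI", "XXII", "XXIII", "XXIV", "XXV") %>%
#     purrr::walk(~ scrape_indexpage(index_urls(.x)))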

# Replace zero-length results (e.g. html_attr() on zero matched nodes) with a default
fix_obj <- function(obj, pattern = NA_character_) {
  if (length(obj) == 0) pattern else obj
}
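# Examples:
#   fix_obj(character(0))              # -> NA_character_ (default pattern)
#   fix_obj("PAKT/VHG/XXV/...")        # -> unchanged (hypothetical input)
#   fix_obj(list(), pattern = list())  # -> list()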

get_meta_info <- function(.x){
page <- read_html(.x$proposal_link)

# link to the National Council resolution (BNR) page, if the proposal got that far
bill_link <- page %>%
html_nodes("div.floatLeft p a") %>%
html_attr("href") %>%
str_subset("BNR_\\d+") %>%
str_replace("\\/pls.+[=]", "https://www.parlament.gv.at") %>%
fix_obj

bill_id <- bill_link %>%
str_extract("XX.+BNR_[0-9]*") %>%
str_replace("\\/B.+_00", "_BNR") %>%
fix_obj

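# the voting paragraph has the form "dafür: A, B, dagegen: C, D";
# after the two splits, parties[[1]] holds the pro parties and
# parties[[2]] the contra parties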
parties <- page %>%
html_node("div.floatLeft p") %>%
html_text() %>%
str_to_lower() %>%
str_extract("(?<=daf.r\\:).*") %>%
str_split(", dagegen\\: ") %>%
.[[1]] %>%
str_split(", ")

resolution_NR <- tibble(party = unlist(parties)) %>%
mutate(pro = party %in% parties[[1]]) %>%
list %>%
fix_obj(pattern = list())


# the proposal document is the first .html link on the page
# (or the second for BUA pages)
proposal_download <- case_when(
  .x$type %in% c("A", "RV", "GABR") ~ {page %>%
      html_nodes("ul.fliesstext li a") %>%
      html_attr("href") %>%
      str_subset("\\d\\.html") %>%
      .[1]},
  .x$type %in% c("BUA") ~ {page %>%
      html_nodes("ul.fliesstext li a") %>%
      html_attr("href") %>%
      str_subset("\\d\\.html") %>%
      .[2]}
) %>%
  fix_obj

initiator <- page %>%
html_nodes("div.c_2 p") %>%
.[-1] %>%
html_text(trim = T) %>%
str_squish %>%
str_extract("(?<=\\:)\\s*[A-Z].+") %>%
str_trim() %>%
str_subset("[0-9]{3}", negate = T) %>%
str_subset("\\s{3,}", negate = T) %>%
list() %>%
fix_obj(pattern = list())

dates <- page %>%
html_nodes("table.table-nonresponsive") %>%
html_nodes("tr.historyShowAlways") %>%
html_text(trim = T) %>%
str_extract("\\d{2}.\\d{2}.\\d{4}") %>%
discard(is.na)

processes <- page %>%
html_nodes("a.historieOverviewToggle") %>%
html_text()

# wrap in an outer list() so this becomes a single list-column cell
# in the tibble below, instead of being recycled into two rows
parl_verfahren <- list(list(date = dates,
                            process = processes))

return(tibble(.id = .x$.id, bill_link, bill_id, proposal_download, initiator, resolution_NR, parl_verfahren))

}
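
# Example usage (sketch): get_meta_info() expects a one-row data frame with
# at least .id, proposal_link and type columns, as built in the .Rmd:
#   one_row <- tibble(.id = 1L,
#                     proposal_link = "https://www.parlament.gv.at/PAKT/...",  # hypothetical
#                     type = "RV")
#   get_meta_info(one_row)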