forked from CreatingData/Historical-Populations
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Second Parsing.Rmd
77 lines (59 loc) · 2.6 KB
/
Second Parsing.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
---
title: "Second Parsing"
output: html_notebook
---
I hate pandas, so some additional data cleaning is done here in dplyr.
```{r fig.width=12}
library(tidyverse)

# Load the merged place table and show its first rows for a quick look.
everything <- read_csv("merged.csv")
head(everything)

# Collapse the parallel columns into one preferred value per row.
# coalesce() returns the first non-missing argument, so argument order is
# the order of preference.
labelled <- everything %>%
  mutate(
    latitude = coalesce(lat, LAT, LAT_BING),
    # An extra mutate() used to overwrite longitude with only lon/LON, which
    # dropped the LON_BING fallback that latitude still used. Both coordinates
    # now follow the same lon -> LON -> LON_BING preference.
    longitude = coalesce(lon, LON, LON_BING),
    name = coalesce(title, CityST, Alperin_Place),
    state = coalesce(state, ST)
  ) %>%
  # Tag each row by which population columns are present. Branches are tested
  # in order; the first two key on alperin_pops, the next two on wiki_pops.
  mutate(source = case_when(
    is.na(cesta_pops) & !is.na(alperin_pops) ~ "Alperin-Sheriff Wikipedia set",
    is.na(cesta_pops) & is.na(alperin_pops) ~ "Wikipedia edits not in A-S or CESTA",
    !is.na(cesta_pops) & is.na(wiki_pops) ~ "CESTA only",
    !is.na(cesta_pops) & !is.na(wiki_pops) ~ "In both Wikipedia and CESTA",
    TRUE ~ "Other"
  ))

#ggplot(labelled) + geom_point(aes(x=umap_x,y=umap_y),alpha=.05)
# I like this orientation better
# (umap_y is left unchanged; the self-assignment that used to sit here had
# no effect and was removed.)

labelled %>%
  # Drop wiki places with only 2016 counts: rows with neither cesta_pops nor
  # alperin_pops whose wiki_pops contains no digit 1-9.
  filter(
    !(is.na(cesta_pops) & is.na(alperin_pops) & !grepl("[1-9]", wiki_pops))
  ) %>%
  select(
    name,
    starts_with("1"),
    starts_with("2"),
    ends_with("_pops"),
    STPLFIPS_2010,
    article_length,
    latitude,
    longitude,
    starts_with("umap_"),
    state
  ) %>%
  write_csv("~/Dropbox/project/mapviz/build/data/city_pops.csv")
```
```{r}
library(ggrepel)

# Pick 10 rows per source, sampled with probability weighted by maxpop, to use
# as text labels on the map. Rows without a latitude or maxpop, and rows whose
# settlement_type is "US State", are excluded first.
biggest <- labelled %>%
  filter(
    !is.na(latitude),
    is.na(settlement_type) | settlement_type != "US State"
  ) %>%
  group_by(source) %>%
  filter(!is.na(maxpop)) %>%
  arrange(-maxpop) %>%
  sample_n(10, weight = maxpop)

# One facet per source; sample_frac() shuffles the row order before plotting.
ggplot(
  labelled %>%
    filter(latitude < 50, longitude < -20) %>%
    sample_frac()
) +
  aes(x = longitude, y = latitude, label = paste(name, maxpop)) +
  geom_point(position = "jitter", size = .5, alpha = .1) +
  facet_wrap(~source) +
  coord_map(projection = "albers", y0 = 30, y1 = 45) +
  labs(title = "Dataset sources") +
  geom_text_repel(data = biggest, size = 3, color = scales::muted("blue"))
#head(labelled)
biggest
```
```{r}
# Spot-check the rows whose name matches any of three specific places.
labelled %>%
  filter(
    grepl("Heber Springs|Upper Darby|Washington Court House", name)
  ) %>%
  arrange(name)
```
Add a new chunk by clicking the *Insert Chunk* button on the toolbar or by pressing *Cmd+Option+I*.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the *Preview* button or press *Cmd+Shift+K* to preview the HTML file).
Show in New WindowClear OutputExpand/Collapse Output
Error in install.packages : Updating loaded packages