Skip to content

Commit 693e62e

Browse files
committed
2 parents eaee4ca + b78696f commit 693e62e

21 files changed

+345
-207
lines changed

.github/workflows/main.yml

+2-2
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ jobs:
4545
reticulate::install_python(version=readLines(".python-version"))
4646
reticulate::py_install(c("virtualenv"))
4747
reticulate::virtualenv_root()
48-
reticulate::virtualenv_create("venv", requirements="requirements.txt")
48+
reticulate::virtualenv_create("venv", requirements="pyenv/requirements.txt")
4949
reticulate::use_virtualenv("venv")
5050
reticulate::py_config()
5151
# Set reticulate_python env var
@@ -58,7 +58,7 @@ jobs:
5858
paste(
5959
readLines(".python-version"),
6060
reticulate::virtualenv_python("venv"),
61-
digest::digest(readLines("requirements.txt")),
61+
digest::digest(readLines("pyenv/requirements.txt")),
6262
sep = "-"
6363
)),
6464
Sys.getenv("GITHUB_ENV"))

_freeze/part-gen-prog/07-prog-data/execute-results/html.json

+2-2
Large diffs are not rendered by default.
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading

part-gen-prog/07-prog-data.qmd

+28-25
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,15 @@ As you've probably guessed by now, this section will primarily be focused on ex
2121

2222
- Create new variables and columns or reformat existing columns in provided data structures
2323

24+
```{python}
25+
#| include: false
26+
#| echo: false
27+
#| message: false
28+
#| warning: false
29+
import warnings
30+
warnings.filterwarnings("ignore", category=FutureWarning)
31+
```
32+
2433
## Artwork Dimensions
2534

2635
The Tate Art Museum assembled a collection of 70,000 artworks (last updated in 2014). They cataloged information including accession number, artwork dimensions, units, title, date, medium, inscription, and even URLs for images of the art.
@@ -146,7 +155,7 @@ We might be interested in the aspect ratio of the artwork - let's take a look at
146155

147156
::: panel-tabset
148157

149-
####R {-}
158+
#### R {-}
150159

151160
```{r hist-dims-art, fig.height = 4, fig.width = 12}
152161
par(mfrow=c(1, 3)) # 3 plots on one row
@@ -267,7 +276,7 @@ The downside to this is that we have to write out `artwork$aspect_hw` or `artwor
267276

268277
One mistake I see people make frequently is to calculate `height/width`, but then not assign that value to a variable.
269278

270-
If you're not using `<-` in R^[(or `=` or `->` if you're a total heathen)] or `=` in Python, then you're not saving that information to be referenced later - you're just calculating values temporarily and possibly printing them as output.
279+
**If you're not using `<-` in R^[(or `=`, or `->` if you're a total heathen)] or `=` in Python, then you're not saving that information** to be referenced later - you're just calculating values temporarily and possibly printing them as output.
271280

272281
:::
273282

@@ -393,7 +402,7 @@ You may need to run `pip install plotnine` in the terminal if you have not used
393402
from plotnine import *
394403
395404
(
396-
ggplot(aes(x = 'LicenseIssuedDate'), data = dogs) +
405+
ggplot(mapping = aes(x = 'LicenseIssuedDate'), data = dogs) +
397406
geom_histogram() # Create a histogram
398407
)
399408
@@ -443,7 +452,7 @@ dogs["License_length_yr"] = dogs.License_length.dt.days/365.25
443452

444453
```{python dog-license-length2-py}
445454
(
446-
ggplot(aes(x = "License_length_yr"), data = dogs) +
455+
ggplot(mapping = aes(x = "License_length_yr"), data = dogs) +
447456
geom_histogram(bins = 30)+
448457
scale_x_continuous(limits = (0,10))
449458
)
@@ -460,7 +469,7 @@ To look at this, we'll need a bit more data. I found a list of NYC zip codes by
460469

461470
```{r get-nyc-zip-borough, echo = F, include = F, eval = F}
462471
library(xml2)
463-
library(tidyverse)
472+
library(readr)
464473
page <- read_html("https://www.nycbynatives.com/nyc_info/new_york_city_zip_codes.php")
465474
nyc_zip_borough <- data.frame(ZipCode = c(xml_find_all(page, ".//tr/td[1]"),
466475
xml_find_all(page, ".//tr/td[4]")) %>%
@@ -471,7 +480,7 @@ nyc_zip_borough <- data.frame(ZipCode = c(xml_find_all(page, ".//tr/td[1]"),
471480
purrr::map_chr(xml_text) %>%
472481
stringr::str_trim()) %>%
473482
unique()
474-
write_csv(nyc_zip_borough, "../data/nyc_zip_borough.csv")
483+
write_csv(nyc_zip_borough, file = file.path("../", "data", "nyc_zip_borough.csv"))
475484
```
476485

477486
::: panel-tabset
@@ -491,8 +500,7 @@ head(dogs)
491500
#### Python {-}
492501

493502
```{python merge-dog-borough-info-py}
494-
borough_zip = pd.read_csv("https://raw.githubusercontent.com/srvanderplas/unl-stat850/main/data/nyc_zip_borough.csv")
495-
503+
borough_zip = pd.read_csv("https://raw.githubusercontent.com/srvanderplas/stat-computing-r-python/main/data/nyc_zip_borough.csv")
496504
dogs = dogs.drop('Borough', axis = 1) # drop borough column
497505
dogs = pd.merge(dogs, borough_zip, on = 'ZipCode')
498506
dogs.head()
@@ -502,7 +510,6 @@ dogs.head()
502510

503511
Now that we have borough, let's write a function that will take a dataset and spit out a list of the top 5 dog breeds registered in that area.
504512

505-
### Custom Summary Function
506513

507514
::: panel-tabset
508515

@@ -529,8 +536,6 @@ def top_5_breeds(data):
529536
:::
530537

531538

532-
### For Loop Summary
533-
534539
Now, using that function, let's write a for loop that loops through the 5 boroughs and spits out the top 5 breeds in each borough:
535540

536541
::: panel-tabset
@@ -581,7 +586,6 @@ for i in boroughs:
581586

582587
:::
583588

584-
### Summary Data Frame
585589

586590
If we wanted to save these results as a summary data frame, we could totally do that!
587591

@@ -712,10 +716,11 @@ library(ggplot2)
712716
713717
ggplot(
714718
data = tarantino,
715-
aes(x = minutes_in, color = movie)
719+
aes(x = minutes_in, color = type)
716720
) +
717721
geom_density() +
718-
facet_wrap(~type)
722+
scale_color_manual(values = c("black", "grey")) +
723+
facet_wrap(~movie)
719724
```
720725

721726
#### Python {-}
@@ -725,12 +730,11 @@ You may need to run `pip install plotnine` in the terminal if you have not used
725730
```{python tarantino-hist-py}
726731
from plotnine import *
727732
728-
(
729-
ggplot(tarantino, aes(x = 'minutes_in', color = 'movie')) +
730-
geom_density() +
731-
facet_wrap("type")
732-
)
733-
733+
plot = ggplot(data = tarantino, mapping = aes(x = 'minutes_in', color = "type"))
734+
plot = plot + geom_density()
735+
plot = plot + scale_color_manual(values = ["black", "grey"])
736+
plot = plot + facet_wrap("movie")
737+
plot.show()
734738
```
735739

736740
:::
@@ -787,12 +791,11 @@ tarantino_words = tarantino.query("type == 'word'")
787791
788792
# Step 2 - 6 most common words
789793
794+
plot = ggplot(tarantino, aes(x = 'minutes_in', color = 'movie'))
795+
plot = plot + geom_density()
796+
plot = plot + facet_wrap("type")
790797
791-
(
792-
ggplot(tarantino, aes(x = 'minutes_in', color = 'movie')) +
793-
geom_density() +
794-
facet_wrap("type")
795-
)
798+
plot.show()
796799
797800
```
798801

py-deps

-60
This file was deleted.

pyenv/py-deps

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/bin/bash
2+
3+
# From https://stackoverflow.com/a/13038235/2859168
4+
# set_union FILE_A FILE_B
# Print, in sorted order, every distinct line found in FILE_A or FILE_B.
# Adapted from https://stackoverflow.com/a/13038235/2859168
# Operands are quoted so that paths containing spaces or glob characters
# are passed to sort as a single file name each.
set_union () {
  sort -- "$1" "$2" | uniq
}
7+
8+
# set_intersection FILE_A FILE_B
# Print, in sorted order, the lines that occur in both FILE_A and FILE_B.
# Each file is de-duplicated first: `uniq -d` reports any line seen more than
# once, so a line repeated inside only one file would otherwise be reported
# as if it were common to both.
set_intersection () {
  { sort -u -- "$1"; sort -u -- "$2"; } | sort | uniq -d
}
11+
12+
# set_difference FILE_A FILE_B
# Print, in sorted order, the lines of FILE_A that do not occur in FILE_B.
# How it works: FILE_B is fed to sort twice, so each of its lines appears at
# least twice and is discarded by `uniq -u`; what survives is whatever occurs
# exactly once, i.e. lines found only in FILE_A.
# FILE_A is de-duplicated first so that a line repeated inside FILE_A is kept
# rather than being mistaken for one that also occurs in FILE_B.
set_difference () {
  # "-" makes the second sort read the de-duplicated FILE_A from stdin.
  sort -u -- "$1" | sort -- - "$2" "$2" | uniq -u
}
15+
16+
# set_symmetric_difference FILE_A FILE_B
# Print, in sorted order, the lines that occur in exactly one of the two files.
# Each file is de-duplicated first so that a line repeated inside a single
# file still counts as "present in one file only" instead of being dropped
# by `uniq -u`.
set_symmetric_difference() {
  { sort -u -- "$1"; sort -u -- "$2"; } | sort | uniq -u
}
19+
20+
# FindRequirements
# Rebuild pyenv/requirements.txt from the Python imports used in the book.
#
# Steps:
#   1. Scan every *.qmd file below the current directory for
#      "from X import ...", "import X as ..." and "import X" lines and collect
#      the top-level module name X in pyenv/py-imports.
#   2. Append the current pyenv/requirements.txt and pyenv/py-deps-reqs.
#   3. De-duplicate, then translate import names that differ from the name of
#      the package that provides them.
#   4. Drop everything listed in pyenv/py-system-pkgs and write the result
#      back to pyenv/requirements.txt.
#
# All paths are relative, so this must be run from the directory that contains
# pyenv/. The three-argument form of awk's match() is a gawk extension.
FindRequirements() {
  # The module name is captured with [A-Za-z0-9_]; the range A-z would also
  # accept the punctuation that sits between "Z" and "a" in ASCII.
  grep -r "from .* import .*" --include "*.qmd" . | awk '{ match( $0, /:from ([A-Za-z0-9_]*).* import .*$/, arr) ; if(arr[1] != "") print arr[1] } ' | sort | uniq > pyenv/py-imports
  grep -r "import .* as .*" --include "*.qmd" . | awk '{ match( $0, /:import ([A-Za-z0-9_]*).* as .*$/, arr) ; if(arr[1] != "") print arr[1] } ' | sort | uniq >> pyenv/py-imports
  grep -r "import .*" --include "*.qmd" . | awk '{ match( $0, /:import ([A-Za-z0-9_]*).*$/, arr) ; if(arr[1] != "") print arr[1] } ' | sort | uniq >> pyenv/py-imports

  # Keep what is already required, plus the tools this script itself needs
  cat pyenv/requirements.txt >> pyenv/py-imports
  cat pyenv/py-deps-reqs >> pyenv/py-imports

  # deduplicate
  sort pyenv/py-imports | uniq > pyenv/py-imports-clean

  # replace PIL with pillow... ugh python sucks
  sed -i -e 's/^PIL$/pillow/g' pyenv/py-imports-clean
  # replace sklearn with scikit-learn
  sed -i -e 's/^sklearn$/scikit-learn/g' pyenv/py-imports-clean
  # replace cv2 with opencv (anchored at both ends like the rules above, so
  # only the exact name "cv2" is rewritten)
  # NOTE(review): the distribution that provides cv2 is usually published as
  # "opencv-python" - confirm that "opencv" is what pip should be given.
  sed -i -e 's/^cv2$/opencv/g' pyenv/py-imports-clean

  # Python system packages assembled from here: https://docs.python.org/3/library/index.html
  # Need to not install these, since python/pip will error out :(
  set_difference pyenv/py-imports-clean pyenv/py-system-pkgs > pyenv/py-imports-install

  # Remove temp files created when assembling packages
  rm pyenv/py-imports pyenv/py-imports-clean
  mv pyenv/py-imports-install pyenv/requirements.txt
}
44+
45+
# SetRequirements
# Regenerate pyenv/requirements.txt: start from the packages reported by
# `pip-chill --no-version`, then let FindRequirements merge in the imports
# found in the book's qmd files.
# Returns non-zero, and leaves the existing requirements file untouched, if
# pip-chill cannot be run.
SetRequirements() {
  # Update dependencies required by the book
  # pipdeptree -r -f --warn silence | grep -E '^[a-zA-Z0-9\-]+' > requirements.txt

  # Write to a scratch file first: redirecting straight into requirements.txt
  # would truncate it even when pip-chill is missing or fails.
  local snapshot=pyenv/requirements.txt.tmp
  if ! pip-chill --no-version > "$snapshot"; then
    rm -f "$snapshot"
    echo "Error: pip-chill failed; pyenv/requirements.txt was not changed" >&2
    return 1
  fi
  mv "$snapshot" pyenv/requirements.txt

  if ! FindRequirements; then
    echo "Error: could not assemble pyenv/requirements.txt" >&2
    return 1
  fi

  echo "Requirements file updated successfully!"
}
53+
54+
55+
# Install
# Install every package listed in pyenv/requirements.txt with pip3.
# Prints the confirmation only when pip succeeds; otherwise reports the
# failure on stderr and returns non-zero.
Install() {
  # Install requirements
  if ! pip3 install -r pyenv/requirements.txt; then
    echo "Error: pip could not install the packages in pyenv/requirements.txt" >&2
    return 1
  fi
  echo "Packages in requirements.txt installed with pip"
}
60+
61+
62+
# Update
# Upgrade every installed package that pip reports as outdated, then
# regenerate the requirements file via SetRequirements.
Update() {
  # Update package versions from what's installed
  local outdated
  outdated=$(pip3 --disable-pip-version-check list --outdated --format=json | \
    python -c "import json, sys; print('\n'.join([x['name'] for x in json.load(sys.stdin)]))")

  # Skip the upgrade step when nothing is outdated: some xargs implementations
  # run the command once even on empty input, and "pip3 install -U" with no
  # package name is an error.
  if [ -n "$outdated" ]; then
    printf '%s\n' "$outdated" | xargs -n1 pip3 install -U
  fi
  echo "Package version update complete"

  SetRequirements
}
71+
72+
73+
# Help
# Print the usage summary for py-deps on stdout, one line per printf argument.
Help() {
  printf '%s\n' \
    "Handle book's python dependencies." \
    "" \
    "Syntax: py-deps [-h|i|s|u]" \
    "Options:" \
    "h Print the help text" \
    "i pip install packages from requirements.txt file" \
    "s Set requirements - examine all qmd files for python deps and update requirements.txt accordingly." \
    "u Update all python package versions required by pip. Will also set requirements to updated versions." \
    ""
}
84+
85+
# Dispatch on the first command-line option and exit.
# The leading ":" in the option string puts getopts in silent mode, so an
# unknown option arrives here as "?" with the offending letter in OPTARG.
while getopts ":hisu" option; do
  case $option in
    h) # display Help
      Help
      exit;;
    i) # install pkgs
      Install
      exit;;
    s) # set requirements file
      SetRequirements
      exit;;
    u) # update python pkgs
      Update
      exit;;
    \?) # invalid option: report on stderr and exit non-zero
      echo "Error: Invalid option -$OPTARG" >&2
      Help >&2
      exit 1;;
  esac
done

pyenv/py-deps-reqs

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
pip-chill
2+
wheel
3+
pip
4+
pipdeptree

0 commit comments

Comments
 (0)