diff --git a/lecture/lec11.md b/lecture/lec11.md index b4c8fb8..a74b22f 100644 --- a/lecture/lec11.md +++ b/lecture/lec11.md @@ -11,6 +11,6 @@ Presented by Narges Norouzi Content by many dedicated Data 100 instructors at UC Berkeley. See our [Acknowledgments](../../acks) page. - [slides](https://docs.google.com/presentation/d/1-gP234MVUtxIWWKY-8mwhrKUh0OBLEcPXemqOmwkvp0/edit?usp=sharing){:target="_blank"} - +- [code](https://data100.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2FDS-100%2Ffa24-student&urlpath=lab%2Ftree%2Ffa24-student%2Flecture%2Flec11%2Flec11.ipynb&branch=main){:target="_blank"} - [code HTML](../../resources/assets/lectures/lec11/lec11.html){:target="_blank"} - [recording](https://youtu.be/ol0n7L9CczM){:target="_blank"} \ No newline at end of file diff --git a/resources/assets/lectures/lec18/lec18.html b/resources/assets/lectures/lec18/lec18.html index b99c037..a7bb4ca 100644 --- a/resources/assets/lectures/lec18/lec18.html +++ b/resources/assets/lectures/lec18/lec18.html @@ -7526,7 +7526,7 @@
+ | x | +P(X = x) | +
---|---|---|
0 | +1 | +0.5 | +
1 | +0 | +0.5 | +
coin_df.sample(10, weights="P(X = x)", replace=True)["x"]
@@ -7619,12 +7666,34 @@ Choice A:¶
1 0 +0 1 +0 1 +0 1 +1 0 +1 0 +1 0 +1 0 +1 0 +1 0 +Name: x, dtype: int64+
array([[False, False], + [ True, True], + [ True, False], + ..., + [ True, True], + [ True, True], + [False, True]])+
sim_flips = pd.DataFrame(
@@ -7651,6 +7738,87 @@ Choice A:¶
+ | Choice A | +
---|---|
0 | +10 | +
1 | +10 | +
2 | +10 | +
3 | +20 | +
4 | +10 | +
... | +... | +
9995 | +0 | +
9996 | +10 | +
9997 | +10 | +
9998 | +20 | +
9999 | +10 | +
10000 rows × 1 columns
+sim_flips["Choice B"] = np.sum((np.random.rand(N,20) < p), axis=1)
@@ -7677,6 +7845,99 @@ Choice B:¶
+ | Choice A | +Choice B | +
---|---|---|
0 | +10 | +11 | +
1 | +10 | +9 | +
2 | +10 | +7 | +
3 | +20 | +13 | +
4 | +10 | +10 | +
... | +... | +... | +
9995 | +0 | +17 | +
9996 | +10 | +11 | +
9997 | +10 | +9 | +
9998 | +20 | +11 | +
9999 | +10 | +14 | +
10000 rows × 2 columns
+sim_flips["Choice C"] = 20 * (np.random.rand(N,1) < p)
@@ -7703,6 +7964,111 @@ Choice C:¶
+ | Choice A | +Choice B | +Choice C | +
---|---|---|---|
0 | +10 | +11 | +20 | +
1 | +10 | +9 | +0 | +
2 | +10 | +7 | +20 | +
3 | +20 | +13 | +20 | +
4 | +10 | +10 | +20 | +
... | +... | +... | +... | +
9995 | +0 | +17 | +20 | +
9996 | +10 | +11 | +20 | +
9997 | +10 | +9 | +0 | +
9998 | +20 | +11 | +0 | +
9999 | +10 | +14 | +20 | +
10000 rows × 3 columns
+px.histogram(sim_flips.melt(), x="value", facet_row="variable",
@@ -7732,12 +8098,73 @@ Choice C:¶
pd.DataFrame([
@@ -7750,6 +8177,62 @@ Choice C:¶
+ | Choice A | +Choice B | +Choice C | +
---|---|---|---|
Simulated Mean | +10.017000 | +10.042000 | +10.046000 | +
Simulated Var | +50.514762 | +4.973533 | +100.007885 | +
Siumulated SD | +7.107374 | +2.230142 | +10.000394 | +
Full Data Size: 398 +Sample Size: 100 ++
array([-0.00730597])+
Bootstrapping: 0%| | 0/10000 [00:00<?, ?it/s]+
array([-0.00814752, -0.00653232])+
0%| | 0/10000 [00:00<?, ?it/s]+
Actual CI [-0.00852071 -0.00691023] ++
csv_file = 'data/Full24hrdataset.csv.gz'
@@ -7921,6 +8189,123 @@ PurpleAir¶
region +North 5592 +West 3750 +Central Southwest 1502 +Southeast 1032 +Alaska 365 +Name: count, dtype: int64+
+ | date | +id | +region | +pm25aqs | +pm25pa | +temp | +rh | +dew | +
---|---|---|---|---|---|---|---|---|
5416 | +2019-10-31 | +GA1 | +Southeast | +3.100000 | +7.638554 | +19.214186 | +70.443672 | +13.674061 | +
5401 | +2019-10-09 | +GA1 | +Southeast | +4.200000 | +10.059924 | +24.621388 | +57.696801 | +15.708347 | +
5407 | +2019-10-17 | +GA1 | +Southeast | +4.200000 | +6.389826 | +16.641975 | +49.377778 | +5.921212 | +
5411 | +2019-10-23 | +GA1 | +Southeast | +4.300000 | +4.544160 | +16.963735 | +50.861111 | +6.650425 | +
5325 | +2019-10-23 | +GA1 | +Southeast | +4.304167 | +4.544160 | +16.963735 | +50.861111 | +6.650425 | +
Number of Rows: 176 ++
model = lm.LinearRegression().fit(GA[['pm25aqs']], GA['pm25pa'])
@@ -7977,12 +8362,12 @@ Inverse Regression
+
fig = px.scatter(GA, x='pm25aqs', y='pm25pa', width=800)
@@ -7994,6 +8379,42 @@ Inverse Regression
+
+
+
+
+
+
+
+
+
+
+
print(f"True Air Quality Estimate = {-theta_0/theta_1:.2} + {1/theta_1:.2}PA")
@@ -8019,12 +8440,25 @@ Inverse Regression
+
+
+
+
+
+
+
+True Air Quality Estimate = 1.6 + 0.46PA
+
+
+
+
+
+
model2 = lm.LinearRegression().fit(GA[['pm25pa']], GA['pm25aqs'])
@@ -8040,6 +8474,42 @@ Inverse Regression
+
+
+
+
+
+
+
+
+
+
+
model_h = lm.LinearRegression().fit(GA[['pm25aqs', 'rh']], GA['pm25pa'])
@@ -8074,12 +8544,25 @@ The Barkjohn et al. mo
True Air Quality Estimate = 7.0 + 0.44PA + -0.092RH ++
fig = px.scatter(GA, x='pm25aqs', y='pm25pa', width=800)
@@ -8092,6 +8575,42 @@ The Barkjohn et al. mo
fig = px.scatter_3d(GA, x='pm25aqs', y='rh', z='pm25pa', width=800, height=600)
@@ -8135,6 +8654,42 @@ The Barkjohn et al. mo
theta_1
@@ -8184,6 +8739,18 @@ Bootstrapping
2.2540167939150546+
theta_2
@@ -8209,6 +8776,18 @@ Bootstrapping
0.20630108775555359+
def theta2_estimate(sample):
@@ -8263,12 +8842,12 @@ Bootstrapping
bs_theta2 = bootstrap(GA, theta2_estimate, 10000)
@@ -8277,12 +8856,24 @@ Bootstrapping
Bootstrapping: 0%| | 0/10000 [00:00<?, ?it/s]+
import plotly.express as px
@@ -8298,6 +8889,42 @@ Bootstrapping
len([elem for elem in bs_theta2 if elem < 0.0])
@@ -8326,6 +8953,18 @@ Bootstrapping
0+
eggs = pd.read_csv('data/snowy_plover.csv.gz')
@@ -8395,12 +9034,86 @@ The Data¶
+ | egg_weight | +egg_length | +egg_breadth | +bird_weight | +
---|---|---|---|---|
0 | +7.4 | +28.80 | +21.84 | +5.2 | +
1 | +7.7 | +29.04 | +22.45 | +5.4 | +
2 | +7.9 | +29.36 | +22.48 | +5.6 | +
3 | +7.5 | +30.10 | +21.71 | +5.3 | +
4 | +8.3 | +30.17 | +22.75 | +5.9 | +
y = eggs["bird_weight"]
@@ -8453,6 +9178,65 @@ The Data¶
+ | theta_hat | +
---|---|
intercept | +-4.605670 | +
egg_weight | +0.431229 | +
egg_length | +0.066570 | +
egg_breadth | +0.215914 | +
RMSE 0.045470853802757616 ++
Bootstrapping: 0%| | 0/10000 [00:00<?, ?it/s]+