
Commit

pulling from origin
Niklewa committed Jul 25, 2024
2 parents 61d320c + 4099f6a commit 5ead286
Showing 45 changed files with 7,984,028 additions and 328 deletions.
8 changes: 8 additions & 0 deletions cities/utils/cleaning_pipeline.py
@@ -11,10 +11,12 @@
from cities.utils.cleaning_scripts.clean_gdp_ma import clean_gdp_ma
from cities.utils.cleaning_scripts.clean_hazard import clean_hazard
from cities.utils.cleaning_scripts.clean_homeownership import clean_homeownership
from cities.utils.cleaning_scripts.clean_income_CT import clean_income_CT
from cities.utils.cleaning_scripts.clean_income_distribution import (
    clean_income_distribution,
)
from cities.utils.cleaning_scripts.clean_industry import clean_industry
from cities.utils.cleaning_scripts.clean_industry_ct import clean_industry_CT
from cities.utils.cleaning_scripts.clean_industry_ma import clean_industry_ma
from cities.utils.cleaning_scripts.clean_industry_ts import clean_industry_ts
from cities.utils.cleaning_scripts.clean_population import clean_population
@@ -32,6 +34,7 @@
)
from cities.utils.cleaning_scripts.clean_transport import clean_transport
from cities.utils.cleaning_scripts.clean_unemployment import clean_unemployment
from cities.utils.cleaning_scripts.clean_unemployment_ct import clean_unemployment_CT
from cities.utils.cleaning_scripts.clean_urbanicity_ct import clean_urbanicity_CT
from cities.utils.cleaning_scripts.clean_urbanicity_ma import clean_urbanicity_ma
from cities.utils.cleaning_scripts.clean_urbanization import clean_urbanization
@@ -41,6 +44,11 @@


# clean_health() lost another 15-ish fips
clean_income_CT()

clean_industry_CT()

clean_unemployment_CT()

clean_ethnicity_CT()

22 changes: 22 additions & 0 deletions cities/utils/cleaning_scripts/clean_income_CT.py
@@ -0,0 +1,22 @@
from cities.utils.clean_variable import VariableCleanerCT
from cities.utils.data_grabber import find_repo_root

root = find_repo_root()


def clean_income_CT():
    cleaner = VariableCleanerCT(
        variable_name="income_pre2020_CT",
        path_to_raw_csv=f"{root}/data/raw/income_pre2020_ct.csv",
        year_or_category_column_label="Category",
        time_interval="pre2020",
    )
    cleaner.clean_variable()

    cleaner2 = VariableCleanerCT(
        variable_name="income_post2020_CT",
        path_to_raw_csv=f"{root}/data/raw/income_post2020_ct.csv",
        year_or_category_column_label="Category",
        time_interval="post2020",
    )
    cleaner2.clean_variable()
22 changes: 22 additions & 0 deletions cities/utils/cleaning_scripts/clean_industry_ct.py
@@ -0,0 +1,22 @@
from cities.utils.clean_variable import VariableCleanerCT
from cities.utils.data_grabber import find_repo_root

root = find_repo_root()


def clean_industry_CT():
    cleaner = VariableCleanerCT(
        variable_name="industry_pre2020_CT",
        path_to_raw_csv=f"{root}/data/raw/industry_pre2020_ct.csv",
        year_or_category_column_label="Category",
        time_interval="pre2020",
    )
    cleaner.clean_variable()

    cleaner2 = VariableCleanerCT(
        variable_name="industry_post2020_CT",
        path_to_raw_csv=f"{root}/data/raw/industry_post2020_ct.csv",
        year_or_category_column_label="Category",
        time_interval="post2020",
    )
    cleaner2.clean_variable()
22 changes: 22 additions & 0 deletions cities/utils/cleaning_scripts/clean_unemployment_ct.py
@@ -0,0 +1,22 @@
from cities.utils.clean_variable import VariableCleanerCT
from cities.utils.data_grabber import find_repo_root

root = find_repo_root()


def clean_unemployment_CT():
    cleaner = VariableCleanerCT(
        variable_name="unemployment_pre2020_CT",
        path_to_raw_csv=f"{root}/data/raw/unemployment_pre2020_ct.csv",
        year_or_category_column_label="Year",
        time_interval="pre2020",
    )
    cleaner.clean_variable()

    cleaner2 = VariableCleanerCT(
        variable_name="unemployment_post2020_CT",
        path_to_raw_csv=f"{root}/data/raw/unemployment_post2020_ct.csv",
        year_or_category_column_label="Year",
        time_interval="post2020",
    )
    cleaner2.clean_variable()
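
Note: the three new CT cleaning scripts above share an identical shape, namely instantiating VariableCleanerCT once for the pre-2020 CSV and once for the post-2020 CSV and calling clean_variable() on each. As a minimal sketch only (not part of this commit; the helper name and parameters are illustrative), the shared pattern could be written as one parameterized function:

from cities.utils.clean_variable import VariableCleanerCT
from cities.utils.data_grabber import find_repo_root

root = find_repo_root()


def clean_ct_variable(stem, category_label):
    # Illustrative helper, not in the repository: clean one CT variable
    # for both time intervals, mirroring the scripts above.
    for interval in ("pre2020", "post2020"):
        cleaner = VariableCleanerCT(
            variable_name=f"{stem}_{interval}_CT",
            path_to_raw_csv=f"{root}/data/raw/{stem}_{interval}_ct.csv",
            year_or_category_column_label=category_label,
            time_interval=interval,
        )
        cleaner.clean_variable()

For example, clean_ct_variable("income", "Category") would perform the same two cleaning passes as clean_income_CT().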
165 changes: 165 additions & 0 deletions cities/utils/scraping/scrape_income_ct.py
@@ -0,0 +1,165 @@
import numpy as np
import pandas as pd
import requests
from us import states

from cities.utils.data_grabber import find_repo_root

root = find_repo_root()

variables = "NAME,S1901_C01_013E,S1901_C01_012E"
county_fips = "*" # all counties
tract = "*" # all tracts
api_key = "077d857d6c12d5b9b3aeafa07d2c1916ba12a86c" # Your private API key
years = [2019, 2022]

dfs = []

for year in years:
    for x in range(len(states.STATES)):  # Iterate over all states
        fips = states.STATES[x].fips

        url = (
            f"https://api.census.gov/data/{year}/acs/acs5/subject?"
            f"get={variables}&"
            f"for=tract:{tract}&"
            f"in=state:{fips}&"
            f"in=county:{county_fips}&"
            f"key={api_key}"
        )

        response = requests.get(url)

        assert (
            response.status_code == 200
        ), "The data retrieval went wrong"  # 200 means success

        print(f"{fips} fips done for year {year}")

        data = response.json()

        df = pd.DataFrame(data[1:], columns=data[0])
        df["Year"] = year  # Add the year column

        dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

income = combined_df.copy()

columns_income = {
    "S1901_C01_012E": "median_income",
    "S1901_C01_013E": "mean_income",
}

income.rename(columns=columns_income, inplace=True)

state_abbreviations = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

income["GeoFIPS"] = income.apply(
lambda row: f"{row['state']}{row['county']}{row['tract']}", axis=1
).astype(np.int64)

income.drop(["state", "county", "tract"], axis=1, inplace=True)


def parse_geo_name(name):
    if ";" in name:
        parts = name.split(";")
    else:
        parts = name.split(",")

    if len(parts) >= 3:
        county = parts[1].strip().replace(" County", "")
        state_full = parts[2].strip()
        state_abbr = state_abbreviations.get(state_full, state_full)
        return f"{county}, {state_abbr} (CT)"
    return "Unknown"


income["GeoName"] = income["NAME"].apply(parse_geo_name).astype(str)

assert (
    income[income["GeoName"] == "Unknown"].shape[0] == 0
), "There are Unknown GeoNames"

income = income.drop(["NAME"], axis=1)

income.sort_values(by=["Year", "GeoFIPS", "GeoName"], inplace=True)
income = income[
    ["GeoFIPS", "GeoName", "Year", "mean_income", "median_income"]
].reset_index(drop=True)

income_pre2020 = (
    income[income["Year"] < 2020].reset_index(drop=True).drop(["Year"], axis=1)
)
income_post2020 = (
    income[income["Year"] >= 2020].reset_index(drop=True).drop(["Year"], axis=1)
)

income_pre2020 = income_pre2020.dropna(how="any")
income_post2020 = income_post2020.dropna(how="any")

columns_to_convert = income_pre2020.columns[2:]
income_pre2020[columns_to_convert] = income_pre2020[columns_to_convert].astype(float)

columns_to_convert = income_post2020.columns[2:]
income_post2020[columns_to_convert] = income_post2020[columns_to_convert].astype(float)

print(f"Pre-2020 data shape: {income_pre2020.shape}")

print(f"Post-2020 data shape: {income_post2020.shape}")

income_pre2020.to_csv(f"{root}/data/raw/income_pre2020_ct.csv", index=False)
income_post2020.to_csv(f"{root}/data/raw/income_post2020_ct.csv", index=False)
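
For illustration, the scraping loop above issues Census ACS 5-year subject-table requests and parse_geo_name maps the returned NAME field to the "(CT)"-suffixed GeoName convention used in these files. A hypothetical example (tract number and county invented for illustration; API key elided):

# Request URL built for year=2019 and Connecticut (state FIPS 09):
#   https://api.census.gov/data/2019/acs/acs5/subject?get=NAME,S1901_C01_013E,S1901_C01_012E&for=tract:*&in=state:09&in=county:*&key=<api_key>
#
# GeoName mapping performed by parse_geo_name:
#   "Census Tract 101, Hartford County, Connecticut"  ->  "Hartford, CT (CT)"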