Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ejscreen semi-automatic #1184

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions scripts/us_epa/ejscreen/config.json
Rohit231998 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
{
"YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
"NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
"CSV_COLUMNS_BY_YEAR": {
"2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
"2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
"2024": ["ID", "DSLPM", "OZONE", "PM25"]
},
"ZIP_FILENAMES": {
"2015": "EJSCREEN_20150505.csv",
"2016": "EJSCREEN_V3_USPR_090216_CSV",
"2017": null,
"2018": "EJSCREEN_2018_USPR_csv",
"2019": "EJSCREEN_2019_USPR.csv",
"2020": "EJSCREEN_2020_USPR.csv",
"2021": "EJSCREEN_2021_USPR.csv",
"2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
"2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
"2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
},
"FILENAMES": {
"2015": "EJSCREEN_20150505",
"2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
"2017": "EJSCREEN_2017_USPR_Public",
"2018": "EJSCREEN_Full_USPR_2018",
"2019": "EJSCREEN_2019_USPR",
"2020": "EJSCREEN_2020_USPR",
"2021": "EJSCREEN_2021_USPR",
"2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
"2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
"2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
},
"TEMPLATE_MCF": [
{
"Node": "E:ejscreen_airpollutants->E0",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->DSLPM",
"unit": "dcs:MicrogramsPerCubicMeter"
},
{
"Node": "E:ejscreen_airpollutants->E1",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:AirPollutant_Cancer_Risk",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->CANCER",
"unit": "dcs:PerMillionPerson"
},
{
"Node": "E:ejscreen_airpollutants->E2",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->RESP"
},
{
"Node": "E:ejscreen_airpollutants->E3",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->OZONE",
"unit": "dcs:PartsPerBillion"
},
{
"Node": "E:ejscreen_airpollutants->E4",
"typeOf": "dcs:StatVarObservation",
"variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
"observationDate": "C:ejscreen_airpollutants->year",
"observationAbout": "C:ejscreen_airpollutants->FIPS",
"observationPeriod": "dcs:P1Y",
"value": "C:ejscreen_airpollutants->PM25",
"unit": "dcs:MicrogramsPerCubicMeter"
}
]
,
"BASE_URL": "https://gaftp.epa.gov/EJSCREEN",
"URL_SUFFIX": {
"2023": "2.22_September_UseMe",
"2024": "2.32_August_UseMe"
}
}
228 changes: 118 additions & 110 deletions scripts/us_epa/ejscreen/ejscreen.py
Rohit231998 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,103 +1,78 @@
'''
Generates cleaned CSV for the EPA EJSCREEN data and TMCF.
Usage: python3 ejscreen.py
'''
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import io
import os
import zipfile
import requests
import pandas as pd
import json
from absl import logging, flags, app
import sys

_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
# Make the util/ directory three levels up importable so file_util resolves.
sys.path.append(os.path.join(_MODULE_DIR, '../../../util/'))
import file_util

logging.set_verbosity(logging.INFO)
logger = logging
_FLAGS = flags.FLAGS
# NOTE(review): this flag is defined, but the code below always reads the
# config.json that sits next to this script -- confirm whether the flag value
# is meant to be consumed (e.g. through file_util) instead.
flags.DEFINE_string('config_path',
                    'gs://unresolved_mcf/epa/ejscreen/config.json',
                    'Path to config file')

_CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json')

# Load configuration from config.json (JSON text, so read it as UTF-8
# regardless of the platform's default encoding).
with open(_CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Expose each config section as a module-level constant used by the
# functions below.
YEARS = config["YEARS"]
NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
ZIP_FILENAMES = config["ZIP_FILENAMES"]
FILENAMES = config["FILENAMES"]
TEMPLATE_MCF = config["TEMPLATE_MCF"]
BASE_URL = config["BASE_URL"]
URL_SUFFIX = config["URL_SUFFIX"]


# Function to build the correct URL for each year
def build_url(year, zip_filename=None):
    """Return the download URL for one year of EJSCREEN data.

    Args:
        year: Year key used in the URL path and to look up FILENAMES and
            URL_SUFFIX.
        zip_filename: Base name of the zip archive, without the ``.zip``
            extension. When falsy, the URL of the plain CSV is returned.

    Returns:
        The URL string built from BASE_URL.
    """
    if not zip_filename:
        # No archive for this year: point straight at the CSV file.
        return f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'

    # Years listed in URL_SUFFIX keep their archive one directory deeper.
    subdir = f'{URL_SUFFIX[year]}/' if year in URL_SUFFIX else ''
    return f'{BASE_URL}/{year}/{subdir}{zip_filename}.zip'


YEARS = ['2015', '2016', '2017', '2018', '2019', '2020']

NORM_CSV_COLUMNS = ['ID', 'DSLPM', 'CANCER', 'RESP', 'OZONE', 'PM25']

# 2015 has different csv column names
CSV_COLUMNS_BY_YEAR = {
'2015': ['FIPS', 'dpm', 'cancer', 'resp', 'o3', 'pm'],
'2016': NORM_CSV_COLUMNS,
'2017': NORM_CSV_COLUMNS,
'2018': NORM_CSV_COLUMNS,
'2019': NORM_CSV_COLUMNS,
'2020': NORM_CSV_COLUMNS
}

ZIP_FILENAMES = {
'2015': 'EJSCREEN_20150505.csv',
'2016': 'EJSCREEN_V3_USPR_090216_CSV',
'2017': None,
'2018': 'EJSCREEN_2018_USPR_csv',
'2019': 'EJSCREEN_2019_USPR.csv',
'2020': 'EJSCREEN_2020_USPR.csv'
}

FILENAMES = {
'2015': 'EJSCREEN_20150505',
'2016': 'EJSCREEN_Full_V3_USPR_TSDFupdate',
'2017': 'EJSCREEN_2017_USPR_Public',
'2018': 'EJSCREEN_Full_USPR_2018',
'2019': 'EJSCREEN_2019_USPR',
'2020': 'EJSCREEN_2020_USPR'
}

TEMPLATE_MCF = '''
Node: E:ejscreen_airpollutants->E0
typeOf: dcs:StatVarObservation
variableMeasured: dcs:Mean_Concentration_AirPollutant_DieselPM
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->DSLPM
unit: dcs:MicrogramsPerCubicMeter
Node: E:ejscreen_airpollutants->E1
typeOf: dcs:StatVarObservation
variableMeasured: dcs:AirPollutant_Cancer_Risk
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->CANCER
Node: E:ejscreen_airpollutants->E2
typeOf: dcs:StatVarObservation
variableMeasured: dcs:AirPollutant_Respiratory_Hazard
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->RESP
Node: E:ejscreen_airpollutants->E3
typeOf: dcs:StatVarObservation
variableMeasured: dcs:Mean_Concentration_AirPollutant_Ozone
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->OZONE
unit: dcs:PartsPerBillion
Node: E:ejscreen_airpollutants->E4
typeOf: dcs:StatVarObservation
variableMeasured: dcs:Mean_Concentration_AirPollutant_PM2.5
observationDate: C:ejscreen_airpollutants->year
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->PM25
unit: dcs:MicrogramsPerCubicMeter
'''


# data: dictionary of dataframes in the format {year: dataframe}
# outfilename: name of the csv that data will be written to
# write_csv concatenates the dataframe from each year together
# Data processing function
def write_csv(data, outfilename):
full_df = pd.DataFrame()
for curr_year, one_year_df in data.items():
one_year_df['year'] = curr_year # add year column
full_df = pd.concat(
[full_df, one_year_df],
ignore_index=True) # concatenate year onto larger dataframe
one_year_df['year'] = curr_year
full_df = pd.concat([full_df, one_year_df], ignore_index=True)

# sort by FIPS and make into dcid
# Sort by FIPS and make into dcid
full_df = full_df.rename(columns={'ID': 'FIPS'})
full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
full_df['FIPS'] = 'dcid:geoId/' + (
Expand All @@ -108,32 +83,65 @@ def write_csv(data, outfilename):


def write_tmcf(outfilename, template=None):
    """Write the TMCF template to ``outfilename``.

    Args:
        outfilename: Path of the TMCF file to create or overwrite.
        template: Template to write. Either a ready-made string, a single
            node given as a mapping of property name to value, or a list of
            such nodes. Defaults to the module-level TEMPLATE_MCF.

    Each node mapping is rendered as one ``property: value`` line per entry
    and nodes are separated by a blank line. Writing ``str(node)`` instead
    would emit a Python dict repr, which is not TMCF.
    """
    if template is None:
        template = TEMPLATE_MCF

    if isinstance(template, str):
        # Already rendered text: write it through unchanged.
        template_content = template
    else:
        nodes = [template] if isinstance(template, dict) else template
        rendered_nodes = []
        for node in nodes:
            if isinstance(node, dict):
                rendered_nodes.append('\n'.join(
                    f'{prop}: {value}' for prop, value in node.items()))
            else:
                rendered_nodes.append(str(node))
        template_content = '\n\n'.join(rendered_nodes) + '\n'

    with open(outfilename, 'w') as f_out:
        f_out.write(template_content)


def main(_):
    """Download each configured year of EJSCREEN data and write CSV and TMCF.

    For every year in YEARS the source file is requested (a zip archive when
    ZIP_FILENAMES has an entry for the year, otherwise a plain CSV), the
    configured columns are read into a dataframe and renamed to the
    normalized column names. The per-year dataframes are then handed to
    write_csv and the template is written with write_tmcf.

    Args:
        _: Unused; positional arguments passed in by ``app.run``.
    """
    dfs = {}
    for year in YEARS:
        logger.info(f"Processing year: {year}")
        columns = CSV_COLUMNS_BY_YEAR[year]
        # Some years are not zipped; their config entry is null.
        zip_filename = ZIP_FILENAMES.get(year, None)

        url = build_url(year, zip_filename)

        logger.info(f"Requesting file: {url}")
        # NOTE(review): verify=False turns off TLS certificate validation
        # for this download -- confirm it is still required for this host.
        response = requests.get(url, verify=False)

        if response.status_code != 200:
            logger.error(
                f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
            )
            # Nothing was loaded for this year, so skip it; falling through
            # to the rename below would raise KeyError on dfs[year].
            continue

        if zip_filename:
            with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
                    dfs[year] = pd.read_csv(newfile,
                                            engine='python',
                                            encoding='latin1',
                                            usecols=columns)
        else:
            dfs[year] = pd.read_csv(io.StringIO(response.text),
                                    sep=',',
                                    usecols=columns)
        logger.info(f"File downloaded and processed for {year} successfully")

        # Rename columns to match other years. 2024 is paired with the
        # shorter NORM_CSV_COLUMNS1 list.
        if year == '2024':
            norm_columns = NORM_CSV_COLUMNS1
        else:
            norm_columns = NORM_CSV_COLUMNS
        cols_renamed = dict(zip(columns, norm_columns))
        dfs[year] = dfs[year].rename(columns=cols_renamed)
        logger.info(f"Columns renamed for {year} successfully")

    logger.info("Writing data to CSV")
    write_csv(dfs, 'ejscreen_airpollutants.csv')
    logger.info("Writing template to TMCF")
    write_tmcf('ejscreen.tmcf')
    logger.info("Process completed successfully")


if __name__ == '__main__':
app.run(main)
Loading
Loading