From cf20c177ff8263ffee53380535b12272f2605841 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 15 Jan 2025 11:09:31 +0000 Subject: [PATCH 01/21] Ejscreen semi-automatic --- scripts/us_epa/ejscreen/config.json | 93 ++++++++ scripts/us_epa/ejscreen/ejscreen.py | 266 ++++++++++++++--------- scripts/us_epa/ejscreen/ejscreen_test.py | 2 +- 3 files changed, 261 insertions(+), 100 deletions(-) create mode 100644 scripts/us_epa/ejscreen/config.json diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json new file mode 100644 index 0000000000..b84673a204 --- /dev/null +++ b/scripts/us_epa/ejscreen/config.json @@ -0,0 +1,93 @@ +{ + "YEARS": ["2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"], + "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"], + "CSV_COLUMNS_BY_YEAR": { + "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"], + "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2024": ["ID", "DSLPM", "OZONE", "PM25"] + }, + "ZIP_FILENAMES": { + "2015": "EJSCREEN_20150505.csv", + "2016": "EJSCREEN_V3_USPR_090216_CSV", + "2017": null, + "2018": "EJSCREEN_2018_USPR_csv", + "2019": "EJSCREEN_2019_USPR.csv", + "2020": "EJSCREEN_2020_USPR.csv", + "2021": "EJSCREEN_2021_USPR.csv", + "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv", + "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv", + "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv" + }, + "FILENAMES": { + "2015": "EJSCREEN_20150505", + "2016": 
"EJSCREEN_Full_V3_USPR_TSDFupdate", + "2017": "EJSCREEN_2017_USPR_Public", + "2018": "EJSCREEN_Full_USPR_2018", + "2019": "EJSCREEN_2019_USPR", + "2020": "EJSCREEN_2020_USPR", + "2021": "EJSCREEN_2021_USPR", + "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI", + "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI", + "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI" + }, + "TEMPLATE_MCF": [ + { + "Node": "E:ejscreen_airpollutants->E0", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->DSLPM", + "unit": "dcs:MicrogramsPerCubicMeter" + }, + { + "Node": "E:ejscreen_airpollutants->E1", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:AirPollutant_Cancer_Risk", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->CANCER", + "unit": "dcs:PerMillionPerson" + }, + { + "Node": "E:ejscreen_airpollutants->E2", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->RESP" + }, + { + "Node": "E:ejscreen_airpollutants->E3", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->OZONE", + "unit": "dcs:PartsPerBillion" + }, + { + "Node": "E:ejscreen_airpollutants->E4", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": 
"dcs:Mean_Concentration_AirPollutant_PM2.5", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->PM25", + "unit": "dcs:MicrogramsPerCubicMeter" + } + ] + } + \ No newline at end of file diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index ea9f15ed3b..b52462e42c 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -1,101 +1,35 @@ -''' -Generates cleaned CSV for the EPA EJSCREEN data and TMCF. -Usage: python3 ejscreen.py -''' - import io import zipfile import requests import pandas as pd +import json +from absl import logging + +logging.set_verbosity(logging.INFO) +logger = logging -YEARS = ['2015', '2016', '2017', '2018', '2019', '2020'] - -NORM_CSV_COLUMNS = ['ID', 'DSLPM', 'CANCER', 'RESP', 'OZONE', 'PM25'] - -# 2015 has different csv column names -CSV_COLUMNS_BY_YEAR = { - '2015': ['FIPS', 'dpm', 'cancer', 'resp', 'o3', 'pm'], - '2016': NORM_CSV_COLUMNS, - '2017': NORM_CSV_COLUMNS, - '2018': NORM_CSV_COLUMNS, - '2019': NORM_CSV_COLUMNS, - '2020': NORM_CSV_COLUMNS -} - -ZIP_FILENAMES = { - '2015': 'EJSCREEN_20150505.csv', - '2016': 'EJSCREEN_V3_USPR_090216_CSV', - '2017': None, - '2018': 'EJSCREEN_2018_USPR_csv', - '2019': 'EJSCREEN_2019_USPR.csv', - '2020': 'EJSCREEN_2020_USPR.csv' -} - -FILENAMES = { - '2015': 'EJSCREEN_20150505', - '2016': 'EJSCREEN_Full_V3_USPR_TSDFupdate', - '2017': 'EJSCREEN_2017_USPR_Public', - '2018': 'EJSCREEN_Full_USPR_2018', - '2019': 'EJSCREEN_2019_USPR', - '2020': 'EJSCREEN_2020_USPR' -} - -TEMPLATE_MCF = ''' -Node: E:ejscreen_airpollutants->E0 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Mean_Concentration_AirPollutant_DieselPM -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->DSLPM -unit: 
dcs:MicrogramsPerCubicMeter - -Node: E:ejscreen_airpollutants->E1 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:AirPollutant_Cancer_Risk -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->CANCER - -Node: E:ejscreen_airpollutants->E2 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:AirPollutant_Respiratory_Hazard -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->RESP - -Node: E:ejscreen_airpollutants->E3 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Mean_Concentration_AirPollutant_Ozone -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->OZONE -unit: dcs:PartsPerBillion - -Node: E:ejscreen_airpollutants->E4 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Mean_Concentration_AirPollutant_PM2.5 -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->PM25 -unit: dcs:MicrogramsPerCubicMeter -''' +# Load configuration from config.json +with open('config.json', 'r') as f: + config = json.load(f) +YEARS = config["YEARS"] +NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"] +NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"] +CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"] +ZIP_FILENAMES = config["ZIP_FILENAMES"] +FILENAMES = config["FILENAMES"] +TEMPLATE_MCF = config["TEMPLATE_MCF"] # data: dictionary of dataframes in the format {year: dataframe} # outfilename: name of the csv that data will be written to # write_csv concatenates the dataframe from each year together + + def write_csv(data, outfilename): full_df = pd.DataFrame() for curr_year, one_year_df in data.items(): - one_year_df['year'] = curr_year # add year 
column - full_df = pd.concat( - [full_df, one_year_df], - ignore_index=True) # concatenate year onto larger dataframe + one_year_df['year'] = curr_year + full_df = pd.concat([full_df, one_year_df], ignore_index=True) # sort by FIPS and make into dcid full_df = full_df.rename(columns={'ID': 'FIPS'}) @@ -115,25 +49,159 @@ def write_tmcf(outfilename): if __name__ == '__main__': dfs = {} for year in YEARS: - print(year) + logger.info(year) columns = CSV_COLUMNS_BY_YEAR[year] - # request file zip_filename = ZIP_FILENAMES[year] + if zip_filename is not None: - response = requests.get( - f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip') - with zipfile.ZipFile(io.BytesIO(response.content())) as zfile: - with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - dfs[year] = pd.read_csv(newfile, usecols=columns) - # some years are not zipped + if year == '2024': + url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip' + elif year == '2023': + url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip' + else: + url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip' + + logger.info(f"Requesting file: {url}") + response = requests.get(url, verify=False) + + if response.status_code == 200: + with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: + with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: + dfs[year] = pd.read_csv(newfile, + engine='python', + encoding='latin1', + usecols=columns) + else: + logger.error( + f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" + ) + + else: + url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv' + logger.info(f"Requesting CSV file: {url}") + response = requests.get(url, verify=False) + + if response.status_code == 200: + dfs[year] = pd.read_csv(io.StringIO(response.text), + sep=',', + usecols=columns) + else: + logger.error( + f"Failed to download CSV for {year}. 
HTTP Status Code: {response.status_code}" + ) + + # Rename weird column names to match other years + if year == '2024': + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) else: - response = requests.get( - f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv') - dfs[year] = pd.read_csv(response, usecols=columns) - # rename weird column names to match other years - if columns != NORM_CSV_COLUMNS: cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) - dfs[year] = dfs[year].rename(columns=cols_renamed) + dfs[year] = dfs[year].rename(columns=cols_renamed) + + write_csv(dfs, 'ejscreen_airpollutants.csv') + write_tmcf('ejscreen.tmcf') +logger.info("Loading configuration from config.json") +with open('config.json', 'r') as f: + config = json.load(f) +logger.info("Configuration loaded successfully") + +YEARS = config["YEARS"] +logger.info(f"Processing years: {YEARS}") + +NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"] +NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"] +CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"] +ZIP_FILENAMES = config["ZIP_FILENAMES"] +FILENAMES = config["FILENAMES"] +TEMPLATE_MCF = config["TEMPLATE_MCF"] + +logger.info("Dataframes initialized") + + +def write_csv(data, outfilename): + logger.info(f"Writing data to {outfilename}") + full_df = pd.DataFrame() + for curr_year, one_year_df in data.items(): + one_year_df['year'] = curr_year + full_df = pd.concat([full_df, one_year_df], ignore_index=True) + + # sort by FIPS and make into dcid + full_df = full_df.rename(columns={'ID': 'FIPS'}) + full_df = full_df.sort_values(by=['FIPS'], ignore_index=True) + full_df['FIPS'] = 'dcid:geoId/' + ( + full_df['FIPS'].astype(str).str.zfill(12)) + full_df = full_df.fillna('') + full_df = full_df.replace('None', '') + full_df.to_csv(outfilename, index=False) + logger.info(f"Data written to {outfilename} successfully") + + +def write_tmcf(outfilename): + logger.info(f"Writing template to {outfilename}") + with open(outfilename, 'w') as f_out: + 
f_out.write(TEMPLATE_MCF) + logger.info(f"Template written to {outfilename} successfully") + + +if __name__ == '__main__': + dfs = {} + for year in YEARS: + logger.info(f"Processing year: {year}") + columns = CSV_COLUMNS_BY_YEAR[year] + zip_filename = ZIP_FILENAMES[year] + + if zip_filename is not None: + if year == '2024': + url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip' + elif year == '2023': + url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip' + else: + url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip' + + logger.info(f"Requesting file: {url}") + response = requests.get(url, verify=False) + + if response.status_code == 200: + with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: + with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: + dfs[year] = pd.read_csv(newfile, + engine='python', + encoding='latin1', + usecols=columns) + logger.info( + f"File downloaded and processed for {year} successfully") + else: + logger.error( + f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" + ) + + else: + url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv' + logger.info(f"Requesting CSV file: {url}") + response = requests.get(url, verify=False) + + if response.status_code == 200: + dfs[year] = pd.read_csv(io.StringIO(response.text), + sep=',', + usecols=columns) + logger.info( + f"CSV downloaded and processed for {year} successfully") + else: + logger.error( + f"Failed to download CSV for {year}. 
HTTP Status Code: {response.status_code}" + ) + + # Rename weird column names to match other years + if year == '2024': + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) + else: + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) + + dfs[year] = dfs[year].rename(columns=cols_renamed) + logger.info(f"Columns renamed for {year} successfully") + + logger.info("Writing data to csv") write_csv(dfs, 'ejscreen_airpollutants.csv') + logger.info("Writing template to tmcf") write_tmcf('ejscreen.tmcf') + logger.info("Process completed successfully") diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index 7965671493..b6bb505f5d 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -7,7 +7,7 @@ import os import tempfile import pandas as pd -from .ejscreen import write_csv +from ejscreen import write_csv module_dir_ = os.path.dirname(__file__) From 1b0c74d2c3e8be811a6bcef82cc40e1c3560c5cd Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 15 Jan 2025 11:21:23 +0000 Subject: [PATCH 02/21] Ejscreen semiautomatic --- scripts/us_epa/ejscreen/ejscreen_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index b6bb505f5d..7965671493 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -7,7 +7,7 @@ import os import tempfile import pandas as pd -from ejscreen import write_csv +from .ejscreen import write_csv module_dir_ = os.path.dirname(__file__) From 9f2cf3c5618b8aa1fa0c584f2f0fab092837f1b1 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 15 Jan 2025 11:33:00 +0000 Subject: [PATCH 03/21] Ejscreen semiautomatic --- scripts/us_epa/ejscreen/ejscreen_test.py | 26 ++++++++++++++---------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py 
b/scripts/us_epa/ejscreen/ejscreen_test.py index 7965671493..c7e154863f 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -2,33 +2,37 @@ Unit tests for ejscreen.py Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py" ''' - import unittest import os import tempfile import pandas as pd -from .ejscreen import write_csv +from ejscreen import write_csv module_dir_ = os.path.dirname(__file__) - class TestEjscreen(unittest.TestCase): def test_write_csv(self): with tempfile.TemporaryDirectory() as tmp_dir: + # Ensure test data file exists in the expected directory + test_data_file = os.path.join(module_dir_, 'test_data/test_data.csv') + expected_data_file = os.path.join(module_dir_, 'test_data/test_data_expected.csv') + + if not os.path.exists(test_data_file) or not os.path.exists(expected_data_file): + raise FileNotFoundError(f"Test data files are missing: {test_data_file}, {expected_data_file}") + dfs = {} - dfs['2020'] = pd.read_csv(os.path.join(module_dir_, - 'test_data/test_data.csv'), - float_precision='high') + dfs['2020'] = pd.read_csv(test_data_file, float_precision='high') test_csv = os.path.join(tmp_dir, 'test_csv.csv') write_csv(dfs, test_csv) - expected_csv = os.path.join(module_dir_, - 'test_data/test_data_expected.csv') + with open(test_csv, 'r') as test: - test_str: str = test.read() - with open(expected_csv, 'r') as expected: - expected_str: str = expected.read() + test_str = test.read() + with open(expected_data_file, 'r') as expected: + expected_str = expected.read() self.assertEqual(test_str, expected_str) + + # Remove temporary test file after assertion os.remove(test_csv) From c0073bce9d7a674f6c2aa29911cbcd3ca28eee96 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 15 Jan 2025 11:40:37 +0000 Subject: [PATCH 04/21] Ejscreen semiautomatic --- scripts/us_epa/ejscreen/ejscreen_test.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git 
a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index c7e154863f..6ef215db44 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -6,26 +6,32 @@ import os import tempfile import pandas as pd -from ejscreen import write_csv +from ejscreen import write_csv module_dir_ = os.path.dirname(__file__) + class TestEjscreen(unittest.TestCase): def test_write_csv(self): with tempfile.TemporaryDirectory() as tmp_dir: # Ensure test data file exists in the expected directory - test_data_file = os.path.join(module_dir_, 'test_data/test_data.csv') - expected_data_file = os.path.join(module_dir_, 'test_data/test_data_expected.csv') - - if not os.path.exists(test_data_file) or not os.path.exists(expected_data_file): - raise FileNotFoundError(f"Test data files are missing: {test_data_file}, {expected_data_file}") + test_data_file = os.path.join(module_dir_, + 'test_data/test_data.csv') + expected_data_file = os.path.join( + module_dir_, 'test_data/test_data_expected.csv') + + if not os.path.exists(test_data_file) or not os.path.exists( + expected_data_file): + raise FileNotFoundError( + f"Test data files are missing: {test_data_file}, {expected_data_file}" + ) dfs = {} dfs['2020'] = pd.read_csv(test_data_file, float_precision='high') test_csv = os.path.join(tmp_dir, 'test_csv.csv') write_csv(dfs, test_csv) - + with open(test_csv, 'r') as test: test_str = test.read() with open(expected_data_file, 'r') as expected: From 32b47d4427ac737c6f165d3acfb945e271db2f5a Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 15 Jan 2025 12:09:27 +0000 Subject: [PATCH 05/21] Ejscreen semiautomatic --- scripts/us_epa/ejscreen/ejscreen_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index 6ef215db44..3e1e80af70 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ 
b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -6,7 +6,7 @@ import os import tempfile import pandas as pd -from ejscreen import write_csv +from .ejscreen import write_csv module_dir_ = os.path.dirname(__file__) From 644c0fe2eaec14a43f3cbb648b968883cb401d06 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 15 Jan 2025 12:31:28 +0000 Subject: [PATCH 06/21] Ejscreen semiautomatic --- scripts/us_epa/ejscreen/ejscreen.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index b52462e42c..cf1bbd7bfd 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -24,6 +24,12 @@ # outfilename: name of the csv that data will be written to # write_csv concatenates the dataframe from each year together +# def read_config(): +# # Load configuration from config.json +# with open('config.json', 'r') as f: +# config = json.load(f) +# return config + def write_csv(data, outfilename): full_df = pd.DataFrame() @@ -100,10 +106,6 @@ def write_tmcf(outfilename): write_csv(dfs, 'ejscreen_airpollutants.csv') write_tmcf('ejscreen.tmcf') -logger.info("Loading configuration from config.json") -with open('config.json', 'r') as f: - config = json.load(f) -logger.info("Configuration loaded successfully") YEARS = config["YEARS"] logger.info(f"Processing years: {YEARS}") From c968571de172ce3034eeb01b426190a17090eb97 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 15 Jan 2025 13:04:47 +0000 Subject: [PATCH 07/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index cf1bbd7bfd..264c5eb2a0 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -1,4 +1,5 @@ import io +import os import zipfile import requests import pandas as pd @@ -8,8 +9,11 @@ 
logging.set_verbosity(logging.INFO) logger = logging +_MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) +_CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json') + # Load configuration from config.json -with open('config.json', 'r') as f: +with open(_CONFIG_PATH, 'r') as f: config = json.load(f) YEARS = config["YEARS"] @@ -24,12 +28,6 @@ # outfilename: name of the csv that data will be written to # write_csv concatenates the dataframe from each year together -# def read_config(): -# # Load configuration from config.json -# with open('config.json', 'r') as f: -# config = json.load(f) -# return config - def write_csv(data, outfilename): full_df = pd.DataFrame() From ebbd9a46f1860d53006ba4f021db4e1a09306073 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Thu, 16 Jan 2025 10:03:47 +0000 Subject: [PATCH 08/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/config.json | 2 + scripts/us_epa/ejscreen/ejscreen.py | 154 +++++------------------ scripts/us_epa/ejscreen/ejscreen_test.py | 16 ++- scripts/us_epa/ejscreen/manifest.json | 23 ++++ 4 files changed, 69 insertions(+), 126 deletions(-) create mode 100644 scripts/us_epa/ejscreen/manifest.json diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json index b84673a204..9375ee2504 100644 --- a/scripts/us_epa/ejscreen/config.json +++ b/scripts/us_epa/ejscreen/config.json @@ -38,6 +38,8 @@ "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI", "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI" }, + "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip", + "URL_TEMPLATE_NON_ZIPPED": "https://gaftp.epa.gov/EJSCREEN/{year}/{filename}.csv", "TEMPLATE_MCF": [ { "Node": "E:ejscreen_airpollutants->E0", diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 264c5eb2a0..83c1e3bfde 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -1,3 +1,18 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + import io import os import zipfile @@ -23,12 +38,13 @@ ZIP_FILENAMES = config["ZIP_FILENAMES"] FILENAMES = config["FILENAMES"] TEMPLATE_MCF = config["TEMPLATE_MCF"] +URL_TEMPLATE = config["URL_TEMPLATE"] +URL_TEMPLATE_NON_ZIPPED = config["URL_TEMPLATE_NON_ZIPPED"] # data: dictionary of dataframes in the format {year: dataframe} # outfilename: name of the csv that data will be written to # write_csv concatenates the dataframe from each year together - def write_csv(data, outfilename): full_df = pd.DataFrame() for curr_year, one_year_df in data.items(): @@ -44,152 +60,40 @@ def write_csv(data, outfilename): full_df = full_df.replace('None', '') full_df.to_csv(outfilename, index=False) - def write_tmcf(outfilename): with open(outfilename, 'w') as f_out: f_out.write(TEMPLATE_MCF) - -if __name__ == '__main__': - dfs = {} - for year in YEARS: - logger.info(year) - columns = CSV_COLUMNS_BY_YEAR[year] - zip_filename = ZIP_FILENAMES[year] - - if zip_filename is not None: - if year == '2024': - url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip' - elif year == '2023': - url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip' - else: - url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip' - - logger.info(f"Requesting file: {url}") - response = requests.get(url, verify=False) - - if response.status_code == 200: - with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: - with 
zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - dfs[year] = pd.read_csv(newfile, - engine='python', - encoding='latin1', - usecols=columns) - else: - logger.error( - f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" - ) - - else: - url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv' - logger.info(f"Requesting CSV file: {url}") - response = requests.get(url, verify=False) - - if response.status_code == 200: - dfs[year] = pd.read_csv(io.StringIO(response.text), - sep=',', - usecols=columns) - else: - logger.error( - f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}" - ) - - # Rename weird column names to match other years - if year == '2024': - cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) - else: - cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) - - dfs[year] = dfs[year].rename(columns=cols_renamed) - - write_csv(dfs, 'ejscreen_airpollutants.csv') - write_tmcf('ejscreen.tmcf') - -YEARS = config["YEARS"] -logger.info(f"Processing years: {YEARS}") - -NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"] -NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"] -CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"] -ZIP_FILENAMES = config["ZIP_FILENAMES"] -FILENAMES = config["FILENAMES"] -TEMPLATE_MCF = config["TEMPLATE_MCF"] - -logger.info("Dataframes initialized") - - -def write_csv(data, outfilename): - logger.info(f"Writing data to {outfilename}") - full_df = pd.DataFrame() - for curr_year, one_year_df in data.items(): - one_year_df['year'] = curr_year - full_df = pd.concat([full_df, one_year_df], ignore_index=True) - - # sort by FIPS and make into dcid - full_df = full_df.rename(columns={'ID': 'FIPS'}) - full_df = full_df.sort_values(by=['FIPS'], ignore_index=True) - full_df['FIPS'] = 'dcid:geoId/' + ( - full_df['FIPS'].astype(str).str.zfill(12)) - full_df = full_df.fillna('') - full_df = full_df.replace('None', '') - full_df.to_csv(outfilename, index=False) - logger.info(f"Data written to 
{outfilename} successfully") - - -def write_tmcf(outfilename): - logger.info(f"Writing template to {outfilename}") - with open(outfilename, 'w') as f_out: - f_out.write(TEMPLATE_MCF) - logger.info(f"Template written to {outfilename} successfully") - - if __name__ == '__main__': dfs = {} for year in YEARS: logger.info(f"Processing year: {year}") columns = CSV_COLUMNS_BY_YEAR[year] - zip_filename = ZIP_FILENAMES[year] - - if zip_filename is not None: - if year == '2024': - url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip' - elif year == '2023': - url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip' - else: - url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip' + zip_filename = ZIP_FILENAMES.get(year, None) + # Check if the year has a zip file or not + if zip_filename: + url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename) logger.info(f"Requesting file: {url}") response = requests.get(url, verify=False) if response.status_code == 200: with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - dfs[year] = pd.read_csv(newfile, - engine='python', - encoding='latin1', - usecols=columns) - logger.info( - f"File downloaded and processed for {year} successfully") + dfs[year] = pd.read_csv(newfile, usecols=columns) + logger.info(f"File downloaded and processed for {year} successfully") else: - logger.error( - f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" - ) - + logger.error(f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}") else: - url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv' + url = URL_TEMPLATE_NON_ZIPPED.format(year=year, filename=FILENAMES[year]) logger.info(f"Requesting CSV file: {url}") response = requests.get(url, verify=False) if response.status_code == 200: - dfs[year] = pd.read_csv(io.StringIO(response.text), - sep=',', - usecols=columns) - logger.info( - f"CSV downloaded and processed for {year} successfully") + dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns) + logger.info(f"CSV downloaded and processed for {year} successfully") else: - logger.error( - f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}" - ) + logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}") # Rename weird column names to match other years if year == '2024': @@ -204,4 +108,4 @@ def write_tmcf(outfilename): write_csv(dfs, 'ejscreen_airpollutants.csv') logger.info("Writing template to tmcf") write_tmcf('ejscreen.tmcf') - logger.info("Process completed successfully") + logger.info("Process completed successfully") \ No newline at end of file diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index 3e1e80af70..4d9b386700 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -1,3 +1,17 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + ''' Unit tests for ejscreen.py Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py" @@ -6,7 +20,7 @@ import os import tempfile import pandas as pd -from .ejscreen import write_csv +from ejscreen import write_csv module_dir_ = os.path.dirname(__file__) diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json new file mode 100644 index 0000000000..6938eaa11f --- /dev/null +++ b/scripts/us_epa/ejscreen/manifest.json @@ -0,0 +1,23 @@ +{ + "import_specifications": [ + { + "import_name": "EPA_EJSCREEN", + "curator_emails": [ + "rbhande@google.com" + ], + "provenance_url": "https://gaftp.epa.gov/EJSCREEN/", + "provenance_description": "The Census Bureau's Ejscreen data", + "scripts": [ + "ejscreen.py" + ], + "import_inputs": [ + { + "template_mcf": "us_epa/ejscreen/ejscreen.tmcf", + "cleaned_csv": "us_epa/ejscreen/ejscreen_airpollutants.csv" + } + ], + "cron_schedule": "0 07 * * 1" + } + ] +} + From 1702ace085694fbd673ea6415d1b5a3ba856ed80 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Thu, 16 Jan 2025 10:14:42 +0000 Subject: [PATCH 09/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen.py | 27 ++++++++++----- scripts/us_epa/ejscreen/ejscreen.tmcf | 42 ------------------------ scripts/us_epa/ejscreen/ejscreen_test.py | 1 - 3 files changed, 19 insertions(+), 51 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 83c1e3bfde..bce739901c 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import io import os import zipfile @@ -45,6 +44,7 @@ # outfilename: name of the csv that data will be written to # write_csv concatenates the dataframe from each year together + def write_csv(data, outfilename): full_df = pd.DataFrame() for curr_year, one_year_df in data.items(): @@ -60,10 +60,12 @@ def write_csv(data, outfilename): full_df = full_df.replace('None', '') full_df.to_csv(outfilename, index=False) + def write_tmcf(outfilename): with open(outfilename, 'w') as f_out: f_out.write(TEMPLATE_MCF) + if __name__ == '__main__': dfs = {} for year in YEARS: @@ -81,19 +83,28 @@ def write_tmcf(outfilename): with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: dfs[year] = pd.read_csv(newfile, usecols=columns) - logger.info(f"File downloaded and processed for {year} successfully") + logger.info( + f"File downloaded and processed for {year} successfully") else: - logger.error(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}") + logger.error( + f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" + ) else: - url = URL_TEMPLATE_NON_ZIPPED.format(year=year, filename=FILENAMES[year]) + url = URL_TEMPLATE_NON_ZIPPED.format(year=year, + filename=FILENAMES[year]) logger.info(f"Requesting CSV file: {url}") response = requests.get(url, verify=False) if response.status_code == 200: - dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns) - logger.info(f"CSV downloaded and processed for {year} successfully") + dfs[year] = pd.read_csv(io.StringIO(response.text), + sep=',', + usecols=columns) + logger.info( + f"CSV downloaded and processed for {year} successfully") else: - logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}") + logger.error( + f"Failed to download CSV for {year}. 
HTTP Status Code: {response.status_code}" + ) # Rename weird column names to match other years if year == '2024': @@ -108,4 +119,4 @@ def write_tmcf(outfilename): write_csv(dfs, 'ejscreen_airpollutants.csv') logger.info("Writing template to tmcf") write_tmcf('ejscreen.tmcf') - logger.info("Process completed successfully") \ No newline at end of file + logger.info("Process completed successfully") diff --git a/scripts/us_epa/ejscreen/ejscreen.tmcf b/scripts/us_epa/ejscreen/ejscreen.tmcf index 21ef79d3f3..e69de29bb2 100644 --- a/scripts/us_epa/ejscreen/ejscreen.tmcf +++ b/scripts/us_epa/ejscreen/ejscreen.tmcf @@ -1,42 +0,0 @@ -Node: E:ejscreen_airpollutants->E0 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Mean_Concentration_AirPollutant_DieselPM -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->DSLPM -unit: dcs:MicrogramsPerCubicMeter - -Node: E:ejscreen_airpollutants->E1 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:AirPollutant_Cancer_Risk -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->CANCER - -Node: E:ejscreen_airpollutants->E2 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:AirPollutant_Respiratory_Hazard -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->RESP - -Node: E:ejscreen_airpollutants->E3 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Mean_Concentration_AirPollutant_Ozone -observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->OZONE -unit: dcs:PartsPerBillion - -Node: E:ejscreen_airpollutants->E4 -typeOf: dcs:StatVarObservation -variableMeasured: dcs:Mean_Concentration_AirPollutant_PM2.5 
-observationDate: C:ejscreen_airpollutants->year -observationAbout: C:ejscreen_airpollutants->FIPS -observationPeriod: dcs:P1Y -value: C:ejscreen_airpollutants->PM25 -unit: dcs:MicrogramsPerCubicMeter \ No newline at end of file diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index 4d9b386700..7a6ae60392 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - ''' Unit tests for ejscreen.py Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py" From 664968513baa3ab97901eea6ff09304b811fd1f8 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Thu, 16 Jan 2025 10:23:18 +0000 Subject: [PATCH 10/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index 7a6ae60392..12c98be7a9 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -19,7 +19,7 @@ import os import tempfile import pandas as pd -from ejscreen import write_csv +from .ejscreen import write_csv module_dir_ = os.path.dirname(__file__) From 46fe8af81eb1911a4dac37b805d6d70d2a07bf8f Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Thu, 16 Jan 2025 10:28:20 +0000 Subject: [PATCH 11/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index 12c98be7a9..8e97d841be 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. + ''' Unit tests for ejscreen.py Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py" From dcb84fd9555a0d245b0e175d434039a7f917ca2d Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Thu, 16 Jan 2025 10:37:03 +0000 Subject: [PATCH 12/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py index 8e97d841be..12c98be7a9 100644 --- a/scripts/us_epa/ejscreen/ejscreen_test.py +++ b/scripts/us_epa/ejscreen/ejscreen_test.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - ''' Unit tests for ejscreen.py Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py" From 9793b3324ab69c9102b579ef541bdb5a2db3f45f Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Mon, 20 Jan 2025 05:22:47 +0000 Subject: [PATCH 13/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/config.json | 5 ++- scripts/us_epa/ejscreen/ejscreen.py | 68 ++++++++++++++++------------- 2 files changed, 41 insertions(+), 32 deletions(-) diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json index 9375ee2504..828cc4a7ae 100644 --- a/scripts/us_epa/ejscreen/config.json +++ b/scripts/us_epa/ejscreen/config.json @@ -1,5 +1,5 @@ { - "YEARS": ["2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"], + "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"], "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"], "CSV_COLUMNS_BY_YEAR": { @@ -40,6 +40,9 @@ }, "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip", 
"URL_TEMPLATE_NON_ZIPPED": "https://gaftp.epa.gov/EJSCREEN/{year}/{filename}.csv", + "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip", + "URL_TEMPLATE_2023": "https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip", + "URL_TEMPLATE_2024": "https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip", "TEMPLATE_MCF": [ { "Node": "E:ejscreen_airpollutants->E0", diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index bce739901c..38f6d46704 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -1,16 +1,16 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# # Copyright 2023 Google LLC +# # +# # Licensed under the Apache License, Version 2.0 (the "License"); +# # you may not use this file except in compliance with the License. +# # You may obtain a copy of the License at +# # +# # https://www.apache.org/licenses/LICENSE-2.0 +# # +# # Unless required by applicable law or agreed to in writing, software +# # distributed under the License is distributed on an "AS IS" BASIS, +# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# # See the License for the specific language governing permissions and +# # limitations under the License. 
import io import os @@ -38,7 +38,8 @@ FILENAMES = config["FILENAMES"] TEMPLATE_MCF = config["TEMPLATE_MCF"] URL_TEMPLATE = config["URL_TEMPLATE"] -URL_TEMPLATE_NON_ZIPPED = config["URL_TEMPLATE_NON_ZIPPED"] +URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE) +URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE) # data: dictionary of dataframes in the format {year: dataframe} # outfilename: name of the csv that data will be written to @@ -75,36 +76,36 @@ def write_tmcf(outfilename): # Check if the year has a zip file or not if zip_filename: - url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename) + # Select the appropriate URL template based on the year + if year == '2023': + url = URL_TEMPLATE_2023.format(year=year, zip_filename=zip_filename) + elif year == '2024': + url = URL_TEMPLATE_2024.format(year=year, zip_filename=zip_filename) + else: + url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename) + logger.info(f"Requesting file: {url}") response = requests.get(url, verify=False) if response.status_code == 200: with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - dfs[year] = pd.read_csv(newfile, usecols=columns) - logger.info( - f"File downloaded and processed for {year} successfully") + # Specify encoding to handle special characters + dfs[year] = pd.read_csv(newfile, usecols=columns, encoding='latin1') # Added encoding='latin1' + logger.info(f"File downloaded and processed for {year} successfully") else: - logger.error( - f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" - ) + logger.error(f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}") else: - url = URL_TEMPLATE_NON_ZIPPED.format(year=year, - filename=FILENAMES[year]) + url = URL_TEMPLATE.format(year=year, filename=FILENAMES[year]) logger.info(f"Requesting CSV file: {url}") response = requests.get(url, verify=False) if response.status_code == 200: - dfs[year] = pd.read_csv(io.StringIO(response.text), - sep=',', - usecols=columns) - logger.info( - f"CSV downloaded and processed for {year} successfully") + # Specify encoding to handle special characters + dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns, encoding='latin1') # Added encoding='latin1' + logger.info(f"CSV downloaded and processed for {year} successfully") else: - logger.error( - f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}" - ) + logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}") # Rename weird column names to match other years if year == '2024': @@ -120,3 +121,8 @@ def write_tmcf(outfilename): logger.info("Writing template to tmcf") write_tmcf('ejscreen.tmcf') logger.info("Process completed successfully") + + + + + From 6e4669234e1fe149de231905420334f6fd8946d1 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Mon, 20 Jan 2025 05:29:28 +0000 Subject: [PATCH 14/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen.py | 39 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 38f6d46704..716e97f3ed 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -38,8 +38,8 @@ FILENAMES = config["FILENAMES"] TEMPLATE_MCF = config["TEMPLATE_MCF"] URL_TEMPLATE = config["URL_TEMPLATE"] -URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE) -URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE) +URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE) 
+URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE) # data: dictionary of dataframes in the format {year: dataframe} # outfilename: name of the csv that data will be written to @@ -78,9 +78,11 @@ def write_tmcf(outfilename): if zip_filename: # Select the appropriate URL template based on the year if year == '2023': - url = URL_TEMPLATE_2023.format(year=year, zip_filename=zip_filename) + url = URL_TEMPLATE_2023.format(year=year, + zip_filename=zip_filename) elif year == '2024': - url = URL_TEMPLATE_2024.format(year=year, zip_filename=zip_filename) + url = URL_TEMPLATE_2024.format(year=year, + zip_filename=zip_filename) else: url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename) @@ -91,10 +93,15 @@ def write_tmcf(outfilename): with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: # Specify encoding to handle special characters - dfs[year] = pd.read_csv(newfile, usecols=columns, encoding='latin1') # Added encoding='latin1' - logger.info(f"File downloaded and processed for {year} successfully") + dfs[year] = pd.read_csv( + newfile, usecols=columns, + encoding='latin1') # Added encoding='latin1' + logger.info( + f"File downloaded and processed for {year} successfully") else: - logger.error(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}") + logger.error( + f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}" + ) else: url = URL_TEMPLATE.format(year=year, filename=FILENAMES[year]) logger.info(f"Requesting CSV file: {url}") @@ -102,10 +109,17 @@ def write_tmcf(outfilename): if response.status_code == 200: # Specify encoding to handle special characters - dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns, encoding='latin1') # Added encoding='latin1' - logger.info(f"CSV downloaded and processed for {year} successfully") + dfs[year] = pd.read_csv( + io.StringIO(response.text), + sep=',', + usecols=columns, + encoding='latin1') # Added encoding='latin1' + logger.info( + f"CSV downloaded and processed for {year} successfully") else: - logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}") + logger.error( + f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}" + ) # Rename weird column names to match other years if year == '2024': @@ -121,8 +135,3 @@ def write_tmcf(outfilename): logger.info("Writing template to tmcf") write_tmcf('ejscreen.tmcf') logger.info("Process completed successfully") - - - - - From 218133a8f652b8fd4f37b94a2b6d34a9d7205c8a Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Tue, 21 Jan 2025 06:06:10 +0000 Subject: [PATCH 15/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/config.json | 186 +++++++++++++------------- scripts/us_epa/ejscreen/ejscreen.py | 148 ++++++++++---------- scripts/us_epa/ejscreen/ejscreen.tmcf | 5 + scripts/us_epa/ejscreen/manifest.json | 8 +- 4 files changed, 180 insertions(+), 167 deletions(-) diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json index 828cc4a7ae..0826dde403 100644 --- a/scripts/us_epa/ejscreen/config.json +++ b/scripts/us_epa/ejscreen/config.json @@ -1,98 +1,98 @@ { - "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"], - "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - 
"NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"], - "CSV_COLUMNS_BY_YEAR": { - "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"], - "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2024": ["ID", "DSLPM", "OZONE", "PM25"] + "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"], + "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"], + "CSV_COLUMNS_BY_YEAR": { + "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"], + "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2024": ["ID", "DSLPM", "OZONE", "PM25"] + }, + "ZIP_FILENAMES": { + "2015": "EJSCREEN_20150505.csv", + "2016": "EJSCREEN_V3_USPR_090216_CSV", + "2017": null, + "2018": "EJSCREEN_2018_USPR_csv", + "2019": "EJSCREEN_2019_USPR.csv", + "2020": "EJSCREEN_2020_USPR.csv", + "2021": "EJSCREEN_2021_USPR.csv", + "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv", + "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv", + "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv" + }, + "FILENAMES": { + "2015": "EJSCREEN_20150505", 
+ "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate", + "2017": "EJSCREEN_2017_USPR_Public", + "2018": "EJSCREEN_Full_USPR_2018", + "2019": "EJSCREEN_2019_USPR", + "2020": "EJSCREEN_2020_USPR", + "2021": "EJSCREEN_2021_USPR", + "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI", + "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI", + "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI" + }, + "TEMPLATE_MCF": [ + { + "Node": "E:ejscreen_airpollutants->E0", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->DSLPM", + "unit": "dcs:MicrogramsPerCubicMeter" }, - "ZIP_FILENAMES": { - "2015": "EJSCREEN_20150505.csv", - "2016": "EJSCREEN_V3_USPR_090216_CSV", - "2017": null, - "2018": "EJSCREEN_2018_USPR_csv", - "2019": "EJSCREEN_2019_USPR.csv", - "2020": "EJSCREEN_2020_USPR.csv", - "2021": "EJSCREEN_2021_USPR.csv", - "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv", - "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv", - "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv" + { + "Node": "E:ejscreen_airpollutants->E1", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:AirPollutant_Cancer_Risk", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->CANCER", + "unit": "dcs:PerMillionPerson" }, - "FILENAMES": { - "2015": "EJSCREEN_20150505", - "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate", - "2017": "EJSCREEN_2017_USPR_Public", - "2018": "EJSCREEN_Full_USPR_2018", - "2019": "EJSCREEN_2019_USPR", - "2020": "EJSCREEN_2020_USPR", - "2021": "EJSCREEN_2021_USPR", - "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI", - "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI", - "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI" + { + "Node": 
"E:ejscreen_airpollutants->E2", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->RESP" }, - "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip", - "URL_TEMPLATE_NON_ZIPPED": "https://gaftp.epa.gov/EJSCREEN/{year}/{filename}.csv", - "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip", - "URL_TEMPLATE_2023": "https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip", - "URL_TEMPLATE_2024": "https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip", - "TEMPLATE_MCF": [ - { - "Node": "E:ejscreen_airpollutants->E0", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->DSLPM", - "unit": "dcs:MicrogramsPerCubicMeter" - }, - { - "Node": "E:ejscreen_airpollutants->E1", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:AirPollutant_Cancer_Risk", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->CANCER", - "unit": "dcs:PerMillionPerson" - }, - { - "Node": "E:ejscreen_airpollutants->E2", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->RESP" - }, - { - "Node": "E:ejscreen_airpollutants->E3", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": 
"dcs:Mean_Concentration_AirPollutant_Ozone", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->OZONE", - "unit": "dcs:PartsPerBillion" - }, - { - "Node": "E:ejscreen_airpollutants->E4", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->PM25", - "unit": "dcs:MicrogramsPerCubicMeter" - } - ] + { + "Node": "E:ejscreen_airpollutants->E3", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->OZONE", + "unit": "dcs:PartsPerBillion" + }, + { + "Node": "E:ejscreen_airpollutants->E4", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->PM25", + "unit": "dcs:MicrogramsPerCubicMeter" + } + ] + , + "BASE_URL": "https://gaftp.epa.gov/EJSCREEN", + "URL_SUFFIX": { + "2023": "2.22_September_UseMe", + "2024": "2.32_August_UseMe" } - \ No newline at end of file +} diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 716e97f3ed..87673aa491 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -1,16 +1,16 @@ -# # Copyright 2023 Google LLC -# # -# # Licensed under the Apache License, Version 2.0 (the "License"); -# # you may not use this file except in compliance with the License. 
-# # You may obtain a copy of the License at -# # -# # https://www.apache.org/licenses/LICENSE-2.0 -# # -# # Unless required by applicable law or agreed to in writing, software -# # distributed under the License is distributed on an "AS IS" BASIS, -# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# # See the License for the specific language governing permissions and -# # limitations under the License. +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import io import os @@ -18,10 +18,20 @@ import requests import pandas as pd import json -from absl import logging +from absl import logging, flags, app +import sys + +_MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(_MODULE_DIR, '../../../util/')) +print(_MODULE_DIR) +import file_util logging.set_verbosity(logging.INFO) logger = logging +_FLAGS = flags.FLAGS +flags.DEFINE_string('config_path', + 'gs://unresolved_mcf/epa/ejscreen/config.json', + 'Path to config file') _MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) _CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json') @@ -37,22 +47,32 @@ ZIP_FILENAMES = config["ZIP_FILENAMES"] FILENAMES = config["FILENAMES"] TEMPLATE_MCF = config["TEMPLATE_MCF"] -URL_TEMPLATE = config["URL_TEMPLATE"] -URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE) -URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE) +BASE_URL = config["BASE_URL"] +URL_SUFFIX = config["URL_SUFFIX"] + -# data: dictionary of dataframes in the format {year: dataframe} -# outfilename: name of the csv that data will be written to -# write_csv concatenates the dataframe from each year together +# Function to build the correct URL for each year +def build_url(year, zip_filename=None): + if zip_filename: + # Construct the URL for the zip file + if year in URL_SUFFIX: + url = f'{BASE_URL}/{year}/{URL_SUFFIX[year]}/{zip_filename}.zip' + else: + url = f'{BASE_URL}/{year}/{zip_filename}.zip' + else: + # Construct the URL for the CSV file + url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv' + return url +# Data processing function def write_csv(data, outfilename): full_df = pd.DataFrame() for curr_year, one_year_df in data.items(): one_year_df['year'] = curr_year full_df = pd.concat([full_df, one_year_df], ignore_index=True) - # sort by FIPS and make into dcid + # Sort by FIPS and make into dcid full_df = full_df.rename(columns={'ID': 'FIPS'}) full_df = full_df.sort_values(by=['FIPS'], 
ignore_index=True) full_df['FIPS'] = 'dcid:geoId/' + ( @@ -63,65 +83,51 @@ def write_csv(data, outfilename): def write_tmcf(outfilename): + # Convert each item in TEMPLATE_MCF to a string, even if it's a dictionary + if isinstance(TEMPLATE_MCF, list): + # Convert each element to a string if it's not already + template_content = "\n".join(str(item) for item in TEMPLATE_MCF) + else: + template_content = str( + TEMPLATE_MCF + ) # In case it's not a list, just convert it to a string + with open(outfilename, 'w') as f_out: - f_out.write(TEMPLATE_MCF) + f_out.write(template_content) -if __name__ == '__main__': +def main(_): dfs = {} for year in YEARS: logger.info(f"Processing year: {year}") columns = CSV_COLUMNS_BY_YEAR[year] zip_filename = ZIP_FILENAMES.get(year, None) - # Check if the year has a zip file or not - if zip_filename: - # Select the appropriate URL template based on the year - if year == '2023': - url = URL_TEMPLATE_2023.format(year=year, - zip_filename=zip_filename) - elif year == '2024': - url = URL_TEMPLATE_2024.format(year=year, - zip_filename=zip_filename) - else: - url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename) + url = build_url(year, zip_filename) - logger.info(f"Requesting file: {url}") - response = requests.get(url, verify=False) + logger.info(f"Requesting file: {url}") + response = requests.get(url, verify=False) - if response.status_code == 200: + if response.status_code == 200: + if zip_filename: with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - # Specify encoding to handle special characters - dfs[year] = pd.read_csv( - newfile, usecols=columns, - encoding='latin1') # Added encoding='latin1' - logger.info( - f"File downloaded and processed for {year} successfully") + dfs[year] = pd.read_csv(newfile, + engine='python', + encoding='latin1', + usecols=columns) else: - logger.error( - f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}" - ) + dfs[year] = pd.read_csv(io.StringIO(response.text), + sep=',', + usecols=columns) + logger.info( + f"File downloaded and processed for {year} successfully") else: - url = URL_TEMPLATE.format(year=year, filename=FILENAMES[year]) - logger.info(f"Requesting CSV file: {url}") - response = requests.get(url, verify=False) - - if response.status_code == 200: - # Specify encoding to handle special characters - dfs[year] = pd.read_csv( - io.StringIO(response.text), - sep=',', - usecols=columns, - encoding='latin1') # Added encoding='latin1' - logger.info( - f"CSV downloaded and processed for {year} successfully") - else: - logger.error( - f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}" - ) + logger.error( + f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" + ) - # Rename weird column names to match other years + # Rename columns to match other years if year == '2024': cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) else: @@ -130,8 +136,12 @@ def write_tmcf(outfilename): dfs[year] = dfs[year].rename(columns=cols_renamed) logger.info(f"Columns renamed for {year} successfully") - logger.info("Writing data to csv") - write_csv(dfs, 'ejscreen_airpollutants.csv') - logger.info("Writing template to tmcf") - write_tmcf('ejscreen.tmcf') - logger.info("Process completed successfully") + logger.info("Writing data to CSV") + write_csv(dfs, 'ejscreen_airpollutants.csv') + logger.info("Writing template to TMCF") + write_tmcf('ejscreen.tmcf') + logger.info("Process completed successfully") + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/us_epa/ejscreen/ejscreen.tmcf b/scripts/us_epa/ejscreen/ejscreen.tmcf index e69de29bb2..785c1a0a8b 100644 --- a/scripts/us_epa/ejscreen/ejscreen.tmcf +++ b/scripts/us_epa/ejscreen/ejscreen.tmcf @@ -0,0 +1,5 @@ +{'Node': 'E:ejscreen_airpollutants->E0', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 
'dcs:Mean_Concentration_AirPollutant_DieselPM', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->DSLPM', 'unit': 'dcs:MicrogramsPerCubicMeter'} +{'Node': 'E:ejscreen_airpollutants->E1', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:AirPollutant_Cancer_Risk', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->CANCER', 'unit': 'dcs:PerMillionPerson'} +{'Node': 'E:ejscreen_airpollutants->E2', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:AirPollutant_Respiratory_Hazard', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->RESP'} +{'Node': 'E:ejscreen_airpollutants->E3', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:Mean_Concentration_AirPollutant_Ozone', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->OZONE', 'unit': 'dcs:PartsPerBillion'} +{'Node': 'E:ejscreen_airpollutants->E4', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:Mean_Concentration_AirPollutant_PM2.5', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->PM25', 'unit': 'dcs:MicrogramsPerCubicMeter'} \ No newline at end of file diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json index 6938eaa11f..dbb7caa53c 100644 --- a/scripts/us_epa/ejscreen/manifest.json +++ b/scripts/us_epa/ejscreen/manifest.json @@ -2,9 +2,7 @@ "import_specifications": [ { "import_name": "EPA_EJSCREEN", - "curator_emails": [ - 
"rbhande@google.com" - ], + "curator_emails": [], "provenance_url": "https://gaftp.epa.gov/EJSCREEN/", "provenance_description": "The Census Bureau's Ejscreen data", "scripts": [ @@ -12,8 +10,8 @@ ], "import_inputs": [ { - "template_mcf": "us_epa/ejscreen/ejscreen.tmcf", - "cleaned_csv": "us_epa/ejscreen/ejscreen_airpollutants.csv" + "template_mcf": "ejscreen.tmcf", + "cleaned_csv": "ejscreen_airpollutants.csv" } ], "cron_schedule": "0 07 * * 1" From 6a650927e1c817b2d7c168cbb0f730d66f4c019b Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 22 Jan 2025 09:40:40 +0000 Subject: [PATCH 16/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/config.json | 98 ---------------- scripts/us_epa/ejscreen/ejscreen.py | 163 ++++++++++++++------------ scripts/us_epa/ejscreen/manifest.json | 4 +- 3 files changed, 91 insertions(+), 174 deletions(-) delete mode 100644 scripts/us_epa/ejscreen/config.json diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json deleted file mode 100644 index 0826dde403..0000000000 --- a/scripts/us_epa/ejscreen/config.json +++ /dev/null @@ -1,98 +0,0 @@ -{ - "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"], - "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"], - "CSV_COLUMNS_BY_YEAR": { - "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"], - "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], - "2024": ["ID", "DSLPM", "OZONE", "PM25"] - }, - "ZIP_FILENAMES": { 
- "2015": "EJSCREEN_20150505.csv", - "2016": "EJSCREEN_V3_USPR_090216_CSV", - "2017": null, - "2018": "EJSCREEN_2018_USPR_csv", - "2019": "EJSCREEN_2019_USPR.csv", - "2020": "EJSCREEN_2020_USPR.csv", - "2021": "EJSCREEN_2021_USPR.csv", - "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv", - "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv", - "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv" - }, - "FILENAMES": { - "2015": "EJSCREEN_20150505", - "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate", - "2017": "EJSCREEN_2017_USPR_Public", - "2018": "EJSCREEN_Full_USPR_2018", - "2019": "EJSCREEN_2019_USPR", - "2020": "EJSCREEN_2020_USPR", - "2021": "EJSCREEN_2021_USPR", - "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI", - "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI", - "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI" - }, - "TEMPLATE_MCF": [ - { - "Node": "E:ejscreen_airpollutants->E0", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->DSLPM", - "unit": "dcs:MicrogramsPerCubicMeter" - }, - { - "Node": "E:ejscreen_airpollutants->E1", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:AirPollutant_Cancer_Risk", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->CANCER", - "unit": "dcs:PerMillionPerson" - }, - { - "Node": "E:ejscreen_airpollutants->E2", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->RESP" - }, - { - "Node": "E:ejscreen_airpollutants->E3", - "typeOf": 
"dcs:StatVarObservation", - "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->OZONE", - "unit": "dcs:PartsPerBillion" - }, - { - "Node": "E:ejscreen_airpollutants->E4", - "typeOf": "dcs:StatVarObservation", - "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5", - "observationDate": "C:ejscreen_airpollutants->year", - "observationAbout": "C:ejscreen_airpollutants->FIPS", - "observationPeriod": "dcs:P1Y", - "value": "C:ejscreen_airpollutants->PM25", - "unit": "dcs:MicrogramsPerCubicMeter" - } - ] - , - "BASE_URL": "https://gaftp.epa.gov/EJSCREEN", - "URL_SUFFIX": { - "2023": "2.22_September_UseMe", - "2024": "2.32_August_UseMe" - } -} diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 87673aa491..c38d7a0527 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -1,19 +1,19 @@ # Copyright 2023 Google LLC -# + # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# + # https://www.apache.org/licenses/LICENSE-2.0 -# + # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import io import os +import io import zipfile import requests import pandas as pd @@ -23,47 +23,40 @@ _MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(_MODULE_DIR, '../../../util/')) -print(_MODULE_DIR) import file_util logging.set_verbosity(logging.INFO) logger = logging _FLAGS = flags.FLAGS + flags.DEFINE_string('config_path', 'gs://unresolved_mcf/epa/ejscreen/config.json', 'Path to config file') -_MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) -_CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json') - -# Load configuration from config.json -with open(_CONFIG_PATH, 'r') as f: - config = json.load(f) - -YEARS = config["YEARS"] -NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"] -NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"] -CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"] -ZIP_FILENAMES = config["ZIP_FILENAMES"] -FILENAMES = config["FILENAMES"] -TEMPLATE_MCF = config["TEMPLATE_MCF"] -BASE_URL = config["BASE_URL"] -URL_SUFFIX = config["URL_SUFFIX"] - - # Function to build the correct URL for each year def build_url(year, zip_filename=None): if zip_filename: - # Construct the URL for the zip file if year in URL_SUFFIX: url = f'{BASE_URL}/{year}/{URL_SUFFIX[year]}/{zip_filename}.zip' else: url = f'{BASE_URL}/{year}/{zip_filename}.zip' else: - # Construct the URL for the CSV file url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv' return url +# Download the file and save it in the input folder +def download_file(url, year, zip_filename=None): + response = requests.get(url, verify=False) + if response.status_code == 200: + input_folder = os.path.join(_MODULE_DIR, 'input') + os.makedirs(input_folder, exist_ok=True) # Create the folder if it doesn't exist + + file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + with open(file_path, 'wb') as f: + f.write(response.content) + logger.info(f"File downloaded and saved as {file_path}") + else: + logger.fatal(f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}") # Data processing function def write_csv(data, outfilename): @@ -72,76 +65,98 @@ def write_csv(data, outfilename): one_year_df['year'] = curr_year full_df = pd.concat([full_df, one_year_df], ignore_index=True) - # Sort by FIPS and make into dcid full_df = full_df.rename(columns={'ID': 'FIPS'}) full_df = full_df.sort_values(by=['FIPS'], ignore_index=True) - full_df['FIPS'] = 'dcid:geoId/' + ( - full_df['FIPS'].astype(str).str.zfill(12)) + full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12)) full_df = full_df.fillna('') full_df = full_df.replace('None', '') full_df.to_csv(outfilename, index=False) - def write_tmcf(outfilename): - # Convert each item in TEMPLATE_MCF to a string, even if it's a dictionary if isinstance(TEMPLATE_MCF, list): - # Convert each element to a string if it's not already template_content = "\n".join(str(item) for item in TEMPLATE_MCF) else: - template_content = str( - TEMPLATE_MCF - ) # In case it's not a list, just convert it to a string + template_content = str(TEMPLATE_MCF) with open(outfilename, 'w') as f_out: f_out.write(template_content) - def main(_): - dfs = {} - for year in YEARS: - logger.info(f"Processing year: {year}") - columns = CSV_COLUMNS_BY_YEAR[year] - zip_filename = ZIP_FILENAMES.get(year, None) - - url = build_url(year, zip_filename) - - logger.info(f"Requesting file: {url}") - response = requests.get(url, verify=False) - - if response.status_code == 200: - if zip_filename: - with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: - with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - dfs[year] = pd.read_csv(newfile, - engine='python', - encoding='latin1', - usecols=columns) - else: - dfs[year] = pd.read_csv(io.StringIO(response.text), - sep=',', - usecols=columns) - logger.info( - f"File downloaded and processed for {year} successfully") - else: - logger.error( - f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}" - ) - - # Rename columns to match other years - if year == '2024': - cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) - else: - cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) - - dfs[year] = dfs[year].rename(columns=cols_renamed) - logger.info(f"Columns renamed for {year} successfully") - + global URL_SUFFIX, BASE_URL, TEMPLATE_MCF, FILENAMES + + try: + # Load configuration from config.json + with file_util.FileIO(_FLAGS.config_path, 'r') as f: + config = json.load(f) + + YEARS = config["YEARS"] + NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"] + NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"] + CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"] + ZIP_FILENAMES = config["ZIP_FILENAMES"] + FILENAMES = config["FILENAMES"] + TEMPLATE_MCF = config["TEMPLATE_MCF"] + BASE_URL = config["BASE_URL"] + URL_SUFFIX = config["URL_SUFFIX"] + RENAME_COLUMNS_YEARS = config["RENAME_COLUMNS_YEARS"] + + dfs = {} + + for year in YEARS: + try: + logger.info(f"Processing year: {year}") + columns = CSV_COLUMNS_BY_YEAR[year] + zip_filename = ZIP_FILENAMES.get(year, None) + + # If the file for the current year is not already downloaded, download it + input_folder = os.path.join(_MODULE_DIR, 'input') + file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + + # Download if the file is missing + if not os.path.exists(file_path): + logger.info(f"File for {year} not found. 
Downloading...") + url = build_url(year, zip_filename) + download_file(url, year, zip_filename) + + # Process the downloaded file + if zip_filename: + with zipfile.ZipFile(file_path, 'r') as zfile: + with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: + dfs[year] = pd.read_csv(newfile, + engine='python', + encoding='latin1', + usecols=columns) + else: + dfs[year] = pd.read_csv(file_path, + sep=',', + usecols=columns) + + logger.info(f"File processed for {year} successfully") + + if year in RENAME_COLUMNS_YEARS: + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) + else: + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) + + dfs[year] = dfs[year].rename(columns=cols_renamed) + logger.info(f"Columns renamed for {year} successfully") + + except Exception as e: + logger.fatal(f"Error processing data for year {year}: {e}") + continue + + # Write the combined data and template logger.info("Writing data to CSV") write_csv(dfs, 'ejscreen_airpollutants.csv') + logger.info("Writing template to TMCF") write_tmcf('ejscreen.tmcf') + logger.info("Process completed successfully") + except Exception as e: + logger.fatal(f"Unexpected error in the main process: {e}") + sys.exit(1) if __name__ == '__main__': app.run(main) diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json index dbb7caa53c..d68f2e67d5 100644 --- a/scripts/us_epa/ejscreen/manifest.json +++ b/scripts/us_epa/ejscreen/manifest.json @@ -2,7 +2,7 @@ "import_specifications": [ { "import_name": "EPA_EJSCREEN", - "curator_emails": [], + "curator_emails": ["rbhande@google.com"], "provenance_url": "https://gaftp.epa.gov/EJSCREEN/", "provenance_description": "The Census Bureau's Ejscreen data", "scripts": [ @@ -14,7 +14,7 @@ "cleaned_csv": "ejscreen_airpollutants.csv" } ], - "cron_schedule": "0 07 * * 1" + "cron_schedule": "0 7 * * 1" } ] } From 1e71abb60f648423c6a22f6b51d83d816f4342b7 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Wed, 22 Jan 2025 09:46:57 +0000 Subject: 
[PATCH 17/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen.py | 32 ++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index c38d7a0527..58f4f2bd37 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -33,6 +33,7 @@ 'gs://unresolved_mcf/epa/ejscreen/config.json', 'Path to config file') + # Function to build the correct URL for each year def build_url(year, zip_filename=None): if zip_filename: @@ -44,19 +45,25 @@ def build_url(year, zip_filename=None): url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv' return url + # Download the file and save it in the input folder def download_file(url, year, zip_filename=None): response = requests.get(url, verify=False) if response.status_code == 200: input_folder = os.path.join(_MODULE_DIR, 'input') - os.makedirs(input_folder, exist_ok=True) # Create the folder if it doesn't exist - - file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + os.makedirs(input_folder, + exist_ok=True) # Create the folder if it doesn't exist + + file_path = os.path.join( + input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') with open(file_path, 'wb') as f: f.write(response.content) logger.info(f"File downloaded and saved as {file_path}") else: - logger.fatal(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}") + logger.fatal( + f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}" + ) + # Data processing function def write_csv(data, outfilename): @@ -67,11 +74,13 @@ def write_csv(data, outfilename): full_df = full_df.rename(columns={'ID': 'FIPS'}) full_df = full_df.sort_values(by=['FIPS'], ignore_index=True) - full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12)) + full_df['FIPS'] = 'dcid:geoId/' + ( + full_df['FIPS'].astype(str).str.zfill(12)) full_df = full_df.fillna('') full_df = full_df.replace('None', '') full_df.to_csv(outfilename, index=False) + def write_tmcf(outfilename): if isinstance(TEMPLATE_MCF, list): template_content = "\n".join(str(item) for item in TEMPLATE_MCF) @@ -81,6 +90,7 @@ def write_tmcf(outfilename): with open(outfilename, 'w') as f_out: f_out.write(template_content) + def main(_): global URL_SUFFIX, BASE_URL, TEMPLATE_MCF, FILENAMES @@ -110,7 +120,9 @@ def main(_): # If the file for the current year is not already downloaded, download it input_folder = os.path.join(_MODULE_DIR, 'input') - file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + file_path = os.path.join( + input_folder, + f'{year}.zip' if zip_filename else f'{year}.csv') # Download if the file is missing if not os.path.exists(file_path): @@ -121,15 +133,14 @@ def main(_): # Process the downloaded file if zip_filename: with zipfile.ZipFile(file_path, 'r') as zfile: - with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: + with zfile.open(f'{FILENAMES[year]}.csv', + 'r') as newfile: dfs[year] = pd.read_csv(newfile, engine='python', encoding='latin1', usecols=columns) else: - dfs[year] = pd.read_csv(file_path, - sep=',', - usecols=columns) + dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns) logger.info(f"File processed for {year} successfully") @@ -158,5 +169,6 @@ def main(_): logger.fatal(f"Unexpected error in the main process: {e}") sys.exit(1) + if __name__ == '__main__': app.run(main) From a9083f10cc366fff3ad9c6af60996c1d427d103b Mon 
Sep 17 00:00:00 2001 From: Rohit Bhande Date: Fri, 24 Jan 2025 06:05:05 +0000 Subject: [PATCH 18/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/README.md | 7 +- scripts/us_epa/ejscreen/config.json | 99 ++++++++++++++++ scripts/us_epa/ejscreen/ejscreen.py | 160 +++++++++++++++----------- scripts/us_epa/ejscreen/manifest.json | 2 +- 4 files changed, 195 insertions(+), 73 deletions(-) create mode 100644 scripts/us_epa/ejscreen/config.json diff --git a/scripts/us_epa/ejscreen/README.md b/scripts/us_epa/ejscreen/README.md index e531f63ac3..10ddec5e20 100644 --- a/scripts/us_epa/ejscreen/README.md +++ b/scripts/us_epa/ejscreen/README.md @@ -20,9 +20,12 @@ which are a small subset of the available EJSCREEN variables. To generate `ejscreen_airpollutants.csv` and `ejscreen.tmcf` run the following: - `python3 ejscreen.py` +#Downloading and Processing Data +To perform "download and process", run the below command: python3 ejscreen.py Running this command generates input_fles and csv, mcf, tmcf files -As of July, 2021 this includes data through the end of 2020. 
+If you want to perform "only process", run the below command: python3 ejscreen.py --mode=process + +If you want to perform "only download", run the below command: python3 ejscreen.py --mode=download ### Unit Tests diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json new file mode 100644 index 0000000000..9bed0a5556 --- /dev/null +++ b/scripts/us_epa/ejscreen/config.json @@ -0,0 +1,99 @@ +{ + "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"], + "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"], + "CSV_COLUMNS_BY_YEAR": { + "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"], + "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"], + "2024": ["ID", "DSLPM", "OZONE", "PM25"] + }, + "ZIP_FILENAMES": { + "2015": "EJSCREEN_20150505.csv", + "2016": "EJSCREEN_V3_USPR_090216_CSV", + "2017": null, + "2018": "EJSCREEN_2018_USPR_csv", + "2019": "EJSCREEN_2019_USPR.csv", + "2020": "EJSCREEN_2020_USPR.csv", + "2021": "EJSCREEN_2021_USPR.csv", + "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv", + "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv", + "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv" + }, + "FILENAMES": { + "2015": "EJSCREEN_20150505", + "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate", + "2017": "EJSCREEN_2017_USPR_Public", + "2018": "EJSCREEN_Full_USPR_2018", + "2019": "EJSCREEN_2019_USPR", + "2020": "EJSCREEN_2020_USPR", + "2021": "EJSCREEN_2021_USPR", + "2022": 
"EJSCREEN_2022_Full_with_AS_CNMI_GU_VI", + "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI", + "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI" + }, + "TEMPLATE_MCF": [ + { + "Node": "E:ejscreen_airpollutants->E0", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->DSLPM", + "unit": "dcs:MicrogramsPerCubicMeter" + }, + { + "Node": "E:ejscreen_airpollutants->E1", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:AirPollutant_Cancer_Risk", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->CANCER", + "unit": "dcs:PerMillionPerson" + }, + { + "Node": "E:ejscreen_airpollutants->E2", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->RESP" + }, + { + "Node": "E:ejscreen_airpollutants->E3", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->OZONE", + "unit": "dcs:PartsPerBillion" + }, + { + "Node": "E:ejscreen_airpollutants->E4", + "typeOf": "dcs:StatVarObservation", + "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5", + "observationDate": "C:ejscreen_airpollutants->year", + "observationAbout": "C:ejscreen_airpollutants->FIPS", + "observationPeriod": "dcs:P1Y", + "value": "C:ejscreen_airpollutants->PM25", + "unit": 
"dcs:MicrogramsPerCubicMeter" + } + ] + , + "BASE_URL": "https://gaftp.epa.gov/EJSCREEN", + "URL_SUFFIX": { + "2023": "2.22_September_UseMe", + "2024": "2.32_August_UseMe" + }, + "RENAME_COLUMNS_YEARS": ["2024"] +} diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 58f4f2bd37..04d093c5cb 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -20,6 +20,7 @@ import json from absl import logging, flags, app import sys +import time # Import time for delay in retries _MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(_MODULE_DIR, '../../../util/')) @@ -32,7 +33,7 @@ flags.DEFINE_string('config_path', 'gs://unresolved_mcf/epa/ejscreen/config.json', 'Path to config file') - +flags.DEFINE_string('mode', '', 'Mode of operation: "download" to only download, "process" to only process, leave empty for both.') # Function to build the correct URL for each year def build_url(year, zip_filename=None): @@ -48,21 +49,32 @@ def build_url(year, zip_filename=None): # Download the file and save it in the input folder def download_file(url, year, zip_filename=None): - response = requests.get(url, verify=False) - if response.status_code == 200: - input_folder = os.path.join(_MODULE_DIR, 'input') - os.makedirs(input_folder, - exist_ok=True) # Create the folder if it doesn't exist - - file_path = os.path.join( - input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') - with open(file_path, 'wb') as f: - f.write(response.content) - logger.info(f"File downloaded and saved as {file_path}") - else: - logger.fatal( - f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}" - ) + max_retry = 5 + retry_number = 0 + while retry_number < max_retry: + try: + response = requests.get(url, verify=False) + if response.status_code == 200: + input_folder = os.path.join(_MODULE_DIR, 'input') + os.makedirs(input_folder, exist_ok=True) + + file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + with open(file_path, 'wb') as f: + f.write(response.content) + logger.info(f"File downloaded and saved as {file_path}") + return + else: + logger.fatal(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}") + retry_number += 1 + time.sleep(5) + except Exception as e: + logger.error(f"Error downloading file for {year}: {e}") + retry_number += 1 + time.sleep(5) + + # If we reached max retries and failed, log the fatal error + logger.fatal(f"Failed to download file for {year} after {max_retry} retries.") + # Data processing function @@ -74,8 +86,7 @@ def write_csv(data, outfilename): full_df = full_df.rename(columns={'ID': 'FIPS'}) full_df = full_df.sort_values(by=['FIPS'], ignore_index=True) - full_df['FIPS'] = 'dcid:geoId/' + ( - full_df['FIPS'].astype(str).str.zfill(12)) + full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12)) full_df = full_df.fillna('') full_df = full_df.replace('None', '') full_df.to_csv(outfilename, index=False) @@ -112,62 +123,71 @@ def main(_): dfs = {} - for year in YEARS: - try: - logger.info(f"Processing year: {year}") - columns = CSV_COLUMNS_BY_YEAR[year] - zip_filename = ZIP_FILENAMES.get(year, None) + # Download files if the mode is 'download' or if no mode is specified + if _FLAGS.mode == "" or _FLAGS.mode == "download": + for year in YEARS: + try: + logger.info(f"Processing year: {year}") + columns = CSV_COLUMNS_BY_YEAR[year] + zip_filename = ZIP_FILENAMES.get(year, None) - # If the file for the current year is not already downloaded, download it - input_folder = os.path.join(_MODULE_DIR, 'input') - 
file_path = os.path.join( - input_folder, - f'{year}.zip' if zip_filename else f'{year}.csv') - - # Download if the file is missing - if not os.path.exists(file_path): - logger.info(f"File for {year} not found. Downloading...") - url = build_url(year, zip_filename) - download_file(url, year, zip_filename) - - # Process the downloaded file - if zip_filename: - with zipfile.ZipFile(file_path, 'r') as zfile: - with zfile.open(f'{FILENAMES[year]}.csv', - 'r') as newfile: - dfs[year] = pd.read_csv(newfile, - engine='python', - encoding='latin1', - usecols=columns) - else: - dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns) - - logger.info(f"File processed for {year} successfully") - - if year in RENAME_COLUMNS_YEARS: - cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) - else: - cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) - - dfs[year] = dfs[year].rename(columns=cols_renamed) - logger.info(f"Columns renamed for {year} successfully") - - except Exception as e: - logger.fatal(f"Error processing data for year {year}: {e}") - continue - - # Write the combined data and template - logger.info("Writing data to CSV") - write_csv(dfs, 'ejscreen_airpollutants.csv') - - logger.info("Writing template to TMCF") - write_tmcf('ejscreen.tmcf') - - logger.info("Process completed successfully") + input_folder = os.path.join(_MODULE_DIR, 'input') + file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + + if not os.path.exists(file_path): + logger.info(f"File for {year} not found. 
Downloading...") + url = build_url(year, zip_filename) + download_file(url, year, zip_filename) + + except Exception as e: + logger.fatal(f"Error processing data for year {year}: {e}") + continue + + # Process files if the mode is 'process' or if no mode is specified + if _FLAGS.mode == "" or _FLAGS.mode == "process": + for year in YEARS: + try: + logger.info(f"Processing data for year {year}") + columns = CSV_COLUMNS_BY_YEAR[year] + zip_filename = ZIP_FILENAMES.get(year, None) + + input_folder = os.path.join(_MODULE_DIR, 'input') + file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + + # Process the downloaded file + if zip_filename: + with zipfile.ZipFile(file_path, 'r') as zfile: + with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: + dfs[year] = pd.read_csv(newfile, engine='python', encoding='latin1', usecols=columns) + else: + dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns) + + logger.info(f"File processed for {year} successfully") + + if year in RENAME_COLUMNS_YEARS: + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) + else: + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) + + dfs[year] = dfs[year].rename(columns=cols_renamed) + logger.info(f"Columns renamed for {year} successfully") + + except Exception as e: + logger.fatal(f"Error processing data for year {year}: {e}") + continue + + # Write the combined data and template + logger.info("Writing data to CSV") + write_csv(dfs, 'ejscreen_airpollutants.csv') + + logger.info("Writing template to TMCF") + write_tmcf('ejscreen.tmcf') + + logger.info("Process completed successfully") except Exception as e: logger.fatal(f"Unexpected error in the main process: {e}") - sys.exit(1) + if __name__ == '__main__': diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json index d68f2e67d5..d2bd898d21 100644 --- a/scripts/us_epa/ejscreen/manifest.json +++ b/scripts/us_epa/ejscreen/manifest.json @@ -2,7 +2,7 @@ 
"import_specifications": [ { "import_name": "EPA_EJSCREEN", - "curator_emails": ["rbhande@google.com"], + "curator_emails": [], "provenance_url": "https://gaftp.epa.gov/EJSCREEN/", "provenance_description": "The Census Bureau's Ejscreen data", "scripts": [ From 0865095109038189bb1514c174436349382b7271 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Fri, 24 Jan 2025 06:09:12 +0000 Subject: [PATCH 19/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen.py | 55 +++++++++++++++++++---------- 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 04d093c5cb..389f1acc0a 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -33,7 +33,11 @@ flags.DEFINE_string('config_path', 'gs://unresolved_mcf/epa/ejscreen/config.json', 'Path to config file') -flags.DEFINE_string('mode', '', 'Mode of operation: "download" to only download, "process" to only process, leave empty for both.') +flags.DEFINE_string( + 'mode', '', + 'Mode of operation: "download" to only download, "process" to only process, leave empty for both.' 
+) + # Function to build the correct URL for each year def build_url(year, zip_filename=None): @@ -49,32 +53,36 @@ def build_url(year, zip_filename=None): # Download the file and save it in the input folder def download_file(url, year, zip_filename=None): - max_retry = 5 + max_retry = 5 retry_number = 0 while retry_number < max_retry: try: response = requests.get(url, verify=False) if response.status_code == 200: input_folder = os.path.join(_MODULE_DIR, 'input') - os.makedirs(input_folder, exist_ok=True) + os.makedirs(input_folder, exist_ok=True) - file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + file_path = os.path.join( + input_folder, + f'{year}.zip' if zip_filename else f'{year}.csv') with open(file_path, 'wb') as f: f.write(response.content) logger.info(f"File downloaded and saved as {file_path}") - return + return else: - logger.fatal(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}") + logger.fatal( + f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code}" + ) retry_number += 1 - time.sleep(5) + time.sleep(5) except Exception as e: logger.error(f"Error downloading file for {year}: {e}") retry_number += 1 - time.sleep(5) + time.sleep(5) # If we reached max retries and failed, log the fatal error - logger.fatal(f"Failed to download file for {year} after {max_retry} retries.") - + logger.fatal( + f"Failed to download file for {year} after {max_retry} retries.") # Data processing function @@ -86,7 +94,8 @@ def write_csv(data, outfilename): full_df = full_df.rename(columns={'ID': 'FIPS'}) full_df = full_df.sort_values(by=['FIPS'], ignore_index=True) - full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12)) + full_df['FIPS'] = 'dcid:geoId/' + ( + full_df['FIPS'].astype(str).str.zfill(12)) full_df = full_df.fillna('') full_df = full_df.replace('None', '') full_df.to_csv(outfilename, index=False) @@ -132,10 +141,13 @@ def main(_): zip_filename = ZIP_FILENAMES.get(year, None) input_folder = os.path.join(_MODULE_DIR, 'input') - file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + file_path = os.path.join( + input_folder, + f'{year}.zip' if zip_filename else f'{year}.csv') if not os.path.exists(file_path): - logger.info(f"File for {year} not found. Downloading...") + logger.info( + f"File for {year} not found. 
Downloading...") url = build_url(year, zip_filename) download_file(url, year, zip_filename) @@ -152,15 +164,23 @@ def main(_): zip_filename = ZIP_FILENAMES.get(year, None) input_folder = os.path.join(_MODULE_DIR, 'input') - file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + file_path = os.path.join( + input_folder, + f'{year}.zip' if zip_filename else f'{year}.csv') # Process the downloaded file if zip_filename: with zipfile.ZipFile(file_path, 'r') as zfile: - with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - dfs[year] = pd.read_csv(newfile, engine='python', encoding='latin1', usecols=columns) + with zfile.open(f'{FILENAMES[year]}.csv', + 'r') as newfile: + dfs[year] = pd.read_csv(newfile, + engine='python', + encoding='latin1', + usecols=columns) else: - dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns) + dfs[year] = pd.read_csv(file_path, + sep=',', + usecols=columns) logger.info(f"File processed for {year} successfully") @@ -187,7 +207,6 @@ def main(_): except Exception as e: logger.fatal(f"Unexpected error in the main process: {e}") - if __name__ == '__main__': From ba95b9a07d5ec94657c83e4776ca3119114eb5a1 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Fri, 24 Jan 2025 06:57:36 +0000 Subject: [PATCH 20/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen.py | 63 ++++++++++++++--------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 389f1acc0a..494f115f66 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -20,14 +20,15 @@ import json from absl import logging, flags, app import sys -import time # Import time for delay in retries +import time +from retry import retry _MODULE_DIR = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(_MODULE_DIR, '../../../util/')) import file_util logging.set_verbosity(logging.INFO) -logger = 
logging + _FLAGS = flags.FLAGS flags.DEFINE_string('config_path', @@ -50,16 +51,16 @@ def build_url(year, zip_filename=None): url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv' return url +@retry(tries=5, delay=5, backoff=5) +def download_with_retry(url): + logging.info(f"Downloading URL : {url}") + return requests.get(url=url, verify=False) # Download the file and save it in the input folder -def download_file(url, year, zip_filename=None): - max_retry = 5 - retry_number = 0 - while retry_number < max_retry: +def download_file(url, year, input_folder, zip_filename=None): try: - response = requests.get(url, verify=False) + response = download_with_retry(url) if response.status_code == 200: - input_folder = os.path.join(_MODULE_DIR, 'input') os.makedirs(input_folder, exist_ok=True) file_path = os.path.join( @@ -67,23 +68,19 @@ def download_file(url, year, zip_filename=None): f'{year}.zip' if zip_filename else f'{year}.csv') with open(file_path, 'wb') as f: f.write(response.content) - logger.info(f"File downloaded and saved as {file_path}") + logging.info(f"File downloaded and saved as {file_path}") return else: - logger.fatal( - f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" + logging.fatal( + f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code} URL : {url}" ) - retry_number += 1 - time.sleep(5) except Exception as e: - logger.error(f"Error downloading file for {year}: {e}") - retry_number += 1 - time.sleep(5) + logging.fatal( + f"Failed to download file for {year} after {url} .") - # If we reached max retries and failed, log the fatal error - logger.fatal( - f"Failed to download file for {year} after {max_retry} retries.") + + # Data processing function def write_csv(data, outfilename): @@ -131,39 +128,39 @@ def main(_): RENAME_COLUMNS_YEARS = config["RENAME_COLUMNS_YEARS"] dfs = {} + input_folder = os.path.join(_MODULE_DIR, 'input') # Download files if the mode is 'download' or if no mode is specified if _FLAGS.mode == "" or _FLAGS.mode == "download": for year in YEARS: try: - logger.info(f"Processing year: {year}") + logging.info(f"Processing year: {year}") columns = CSV_COLUMNS_BY_YEAR[year] zip_filename = ZIP_FILENAMES.get(year, None) - input_folder = os.path.join(_MODULE_DIR, 'input') file_path = os.path.join( input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') if not os.path.exists(file_path): - logger.info( + logging.info( f"File for {year} not found. 
Downloading...") url = build_url(year, zip_filename) - download_file(url, year, zip_filename) + download_file(url, year, input_folder,zip_filename) except Exception as e: - logger.fatal(f"Error processing data for year {year}: {e}") + logging.fatal(f"Error processing data for year {year}: {e}") continue # Process files if the mode is 'process' or if no mode is specified if _FLAGS.mode == "" or _FLAGS.mode == "process": for year in YEARS: try: - logger.info(f"Processing data for year {year}") + logging.info(f"Processing data for year {year}") columns = CSV_COLUMNS_BY_YEAR[year] zip_filename = ZIP_FILENAMES.get(year, None) - input_folder = os.path.join(_MODULE_DIR, 'input') + file_path = os.path.join( input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') @@ -182,7 +179,7 @@ def main(_): sep=',', usecols=columns) - logger.info(f"File processed for {year} successfully") + logging.info(f"File processed for {year} successfully") if year in RENAME_COLUMNS_YEARS: cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) @@ -190,23 +187,23 @@ def main(_): cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) dfs[year] = dfs[year].rename(columns=cols_renamed) - logger.info(f"Columns renamed for {year} successfully") + logging.info(f"Columns renamed for {year} successfully") except Exception as e: - logger.fatal(f"Error processing data for year {year}: {e}") + logging.fatal(f"Error processing data for year {year}: {e}") continue # Write the combined data and template - logger.info("Writing data to CSV") + logging.info("Writing data to CSV") write_csv(dfs, 'ejscreen_airpollutants.csv') - logger.info("Writing template to TMCF") + logging.info("Writing template to TMCF") write_tmcf('ejscreen.tmcf') - logger.info("Process completed successfully") + logging.info("Process completed successfully") except Exception as e: - logger.fatal(f"Unexpected error in the main process: {e}") + logging.fatal(f"Unexpected error in the main process: {e}") if __name__ == '__main__': From 
6e9dbfae996e287381f5264bb936a419cc1a2749 Mon Sep 17 00:00:00 2001 From: Rohit Bhande Date: Fri, 24 Jan 2025 06:58:09 +0000 Subject: [PATCH 21/21] Ejscreen semiautomatic2 --- scripts/us_epa/ejscreen/ejscreen.py | 42 +++++++++++++---------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index 494f115f66..904f3a0291 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -51,36 +51,33 @@ def build_url(year, zip_filename=None): url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv' return url + @retry(tries=5, delay=5, backoff=5) def download_with_retry(url): logging.info(f"Downloading URL : {url}") return requests.get(url=url, verify=False) + # Download the file and save it in the input folder def download_file(url, year, input_folder, zip_filename=None): - try: - response = download_with_retry(url) - if response.status_code == 200: - os.makedirs(input_folder, exist_ok=True) - - file_path = os.path.join( - input_folder, - f'{year}.zip' if zip_filename else f'{year}.csv') - with open(file_path, 'wb') as f: - f.write(response.content) - logging.info(f"File downloaded and saved as {file_path}") - return - else: - logging.fatal( - f"Failed to download file for {year}. HTTP Status Code: {response.status_code} URL : {url}" - ) - except Exception as e: + try: + response = download_with_retry(url) + if response.status_code == 200: + os.makedirs(input_folder, exist_ok=True) + + file_path = os.path.join( + input_folder, f'{year}.zip' if zip_filename else f'{year}.csv') + with open(file_path, 'wb') as f: + f.write(response.content) + logging.info(f"File downloaded and saved as {file_path}") + return + else: logging.fatal( - f"Failed to download file for {year} after {url} .") - + f"Failed to download file for {year}. 
HTTP Status Code: {response.status_code} URL : {url}" + ) + except Exception as e: + logging.fatal(f"Failed to download file for {year} after {url} .") - - # Data processing function def write_csv(data, outfilename): @@ -146,7 +143,7 @@ def main(_): logging.info( f"File for {year} not found. Downloading...") url = build_url(year, zip_filename) - download_file(url, year, input_folder,zip_filename) + download_file(url, year, input_folder, zip_filename) except Exception as e: logging.fatal(f"Error processing data for year {year}: {e}") @@ -160,7 +157,6 @@ def main(_): columns = CSV_COLUMNS_BY_YEAR[year] zip_filename = ZIP_FILENAMES.get(year, None) - file_path = os.path.join( input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')