datacommonsorg · Rohit231998 · Jan 15, 2025 · Jan 15, 2025 · Jan 15, 2025 · Jan 15, 2025
diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json
@@ -0,0 +1,98 @@
+{
+  "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
+  "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+  "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
+  "CSV_COLUMNS_BY_YEAR": {
+    "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
+    "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2024": ["ID", "DSLPM", "OZONE", "PM25"]
+  },
+  "ZIP_FILENAMES": {
+    "2015": "EJSCREEN_20150505.csv",
+    "2016": "EJSCREEN_V3_USPR_090216_CSV",
+    "2017": null,
+    "2018": "EJSCREEN_2018_USPR_csv",
+    "2019": "EJSCREEN_2019_USPR.csv",
+    "2020": "EJSCREEN_2020_USPR.csv",
+    "2021": "EJSCREEN_2021_USPR.csv",
+    "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
+    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
+    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
+  },
+  "FILENAMES": {
+    "2015": "EJSCREEN_20150505",
+    "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
+    "2017": "EJSCREEN_2017_USPR_Public",
+    "2018": "EJSCREEN_Full_USPR_2018",
+    "2019": "EJSCREEN_2019_USPR",
+    "2020": "EJSCREEN_2020_USPR",
+    "2021": "EJSCREEN_2021_USPR",
+    "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
+    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
+    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
+  },
+  "TEMPLATE_MCF": [
+    {
+      "Node": "E:ejscreen_airpollutants->E0",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->DSLPM",
+      "unit": "dcs:MicrogramsPerCubicMeter"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E1",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:AirPollutant_Cancer_Risk",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->CANCER",
+      "unit": "dcs:PerMillionPerson"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E2",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->RESP"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E3",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->OZONE",
+      "unit": "dcs:PartsPerBillion"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E4",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->PM25",
+      "unit": "dcs:MicrogramsPerCubicMeter"
+    }
+  ]
+  ,
+  "BASE_URL": "https://gaftp.epa.gov/EJSCREEN",
+  "URL_SUFFIX": {
+    "2023": "2.22_September_UseMe",
+    "2024": "2.32_August_UseMe"
+  }
+}
diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
@@ -1,103 +1,78 @@
-'''
-Generates cleaned CSV for the EPA EJSCREEN data and TMCF.
-Usage: python3 ejscreen.py
-'''
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import io
+import os
 import zipfile
 import requests
 import pandas as pd
+import json
+from absl import logging, flags, app
+import sys
+
+_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Extend sys.path with the util/ directory three levels up so that
+# `file_util` can be imported when this file is run as a script.
+sys.path.append(os.path.join(_MODULE_DIR, '../../../util/'))
+# NOTE(review): file_util is imported but not referenced anywhere in the
+# visible code -- confirm whether it is still needed.
+import file_util
+
+logging.set_verbosity(logging.INFO)
+logger = logging
+_FLAGS = flags.FLAGS
+# NOTE(review): this flag is defined but never read in the visible code; the
+# configuration below is always loaded from the config.json that sits next to
+# this script. Confirm whether the GCS path is meant to be used instead.
+flags.DEFINE_string('config_path',
+                    'gs://unresolved_mcf/epa/ejscreen/config.json',
+                    'Path to config file')
+
+_CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json')
+
+# Load configuration from config.json
+with open(_CONFIG_PATH, 'r', encoding='utf-8') as f:
+    config = json.load(f)
+
+# Module-level constants, one per top-level key of config.json.
+YEARS = config["YEARS"]
+NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
+NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
+CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
+ZIP_FILENAMES = config["ZIP_FILENAMES"]
+FILENAMES = config["FILENAMES"]
+TEMPLATE_MCF = config["TEMPLATE_MCF"]
+BASE_URL = config["BASE_URL"]
+URL_SUFFIX = config["URL_SUFFIX"]
+
+
+def build_url(year, zip_filename=None):
+    """Return the download URL for one year's data file.
+
+    Args:
+        year: Year key (a string such as '2020') used in the URL path and to
+            look up FILENAMES / URL_SUFFIX.
+        zip_filename: Name of the zip archive without the '.zip' extension.
+            When falsy (None or empty), the URL of the plain CSV named by
+            FILENAMES[year] is returned instead.
+
+    Returns:
+        The URL string rooted at BASE_URL.
+
+    Raises:
+        KeyError: if no zip name is given and `year` is missing from
+            FILENAMES.
+    """
+    if not zip_filename:
+        # No archive for this year: point straight at the CSV file.
+        return f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'
+    if year in URL_SUFFIX:
+        # Years listed in URL_SUFFIX keep their archive in a sub-directory.
+        return f'{BASE_URL}/{year}/{URL_SUFFIX[year]}/{zip_filename}.zip'
+    return f'{BASE_URL}/{year}/{zip_filename}.zip'
+
 
-YEARS = ['2015', '2016', '2017', '2018', '2019', '2020']
-
-NORM_CSV_COLUMNS = ['ID', 'DSLPM', 'CANCER', 'RESP', 'OZONE', 'PM25']
-
-# 2015 has different csv column names
-CSV_COLUMNS_BY_YEAR = {
-    '2015': ['FIPS', 'dpm', 'cancer', 'resp', 'o3', 'pm'],
-    '2016': NORM_CSV_COLUMNS,
-    '2017': NORM_CSV_COLUMNS,
-    '2018': NORM_CSV_COLUMNS,
-    '2019': NORM_CSV_COLUMNS,
-    '2020': NORM_CSV_COLUMNS
-}
-
-ZIP_FILENAMES = {
-    '2015': 'EJSCREEN_20150505.csv',
-    '2016': 'EJSCREEN_V3_USPR_090216_CSV',
-    '2017': None,
-    '2018': 'EJSCREEN_2018_USPR_csv',
-    '2019': 'EJSCREEN_2019_USPR.csv',
-    '2020': 'EJSCREEN_2020_USPR.csv'
-}
-
-FILENAMES = {
-    '2015': 'EJSCREEN_20150505',
-    '2016': 'EJSCREEN_Full_V3_USPR_TSDFupdate',
-    '2017': 'EJSCREEN_2017_USPR_Public',
-    '2018': 'EJSCREEN_Full_USPR_2018',
-    '2019': 'EJSCREEN_2019_USPR',
-    '2020': 'EJSCREEN_2020_USPR'
-}
-
-TEMPLATE_MCF = '''
-Node: E:ejscreen_airpollutants->E0
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_DieselPM
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->DSLPM
-unit: dcs:MicrogramsPerCubicMeter
-
-Node: E:ejscreen_airpollutants->E1
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:AirPollutant_Cancer_Risk
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->CANCER
-
-Node: E:ejscreen_airpollutants->E2
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:AirPollutant_Respiratory_Hazard
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->RESP
-
-Node: E:ejscreen_airpollutants->E3
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_Ozone
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->OZONE
-unit: dcs:PartsPerBillion
-
-Node: E:ejscreen_airpollutants->E4
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_PM2.5
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->PM25
-unit: dcs:MicrogramsPerCubicMeter
-'''
-
-
-# data: dictionary of dataframes in the format {year: dataframe}
-# outfilename: name of the csv that data will be written to
-# write_csv concatenates the dataframe from each year together
+# Data processing function
 def write_csv(data, outfilename):
     full_df = pd.DataFrame()
     for curr_year, one_year_df in data.items():
-        one_year_df['year'] = curr_year  # add year column
-        full_df = pd.concat(
-            [full_df, one_year_df],
-            ignore_index=True)  # concatenate year onto larger dataframe
+        one_year_df['year'] = curr_year
+        full_df = pd.concat([full_df, one_year_df], ignore_index=True)
 
-    # sort by FIPS and make into dcid
+    # Sort by FIPS and make into dcid
     full_df = full_df.rename(columns={'ID': 'FIPS'})
     full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
     full_df['FIPS'] = 'dcid:geoId/' + (
@@ -108,32 +83,65 @@ def write_csv(data, outfilename):
 
 
 def write_tmcf(outfilename):
+    """Write the template MCF nodes from the config to `outfilename`.
+
+    TEMPLATE_MCF is loaded from config.json as a list of dicts, one per node,
+    mapping property name to value. Each dict is written as `property: value`
+    lines, and nodes are separated by a blank line. Writing `str(dict)` would
+    emit a Python dict literal instead of TMCF text.
+
+    Args:
+        outfilename: Path of the TMCF file to create or overwrite.
+    """
+    if isinstance(TEMPLATE_MCF, list):
+        node_blocks = []
+        for node in TEMPLATE_MCF:
+            if isinstance(node, dict):
+                # Dicts loaded from JSON keep their key order, so "Node"
+                # stays the first line of each block.
+                node_blocks.append("\n".join(
+                    f"{prop}: {value}" for prop, value in node.items()))
+            else:
+                # Already-formatted text (or any other value): write as-is.
+                node_blocks.append(str(node))
+        template_content = "\n\n".join(node_blocks) + "\n"
+    else:
+        # Not a list (e.g. a pre-formatted TMCF string): write it unchanged.
+        template_content = str(TEMPLATE_MCF)
+
     with open(outfilename, 'w') as f_out:
-        f_out.write(TEMPLATE_MCF)
+        f_out.write(template_content)
 
 
-if __name__ == '__main__':
+def main(_):
+    """Download each year's EJSCREEN file and write the combined CSV and TMCF.
+
+    For every year in YEARS the file is fetched (zip archive or plain CSV),
+    only that year's configured columns are read, and the columns are renamed
+    to the normalized names. A year whose download does not return HTTP 200
+    is logged and skipped. The outputs are written once, after all years
+    have been processed.
+
+    Args:
+        _: Positional argument supplied by absl's app.run; unused.
+
+    Raises:
+        RuntimeError: if no year could be downloaded at all.
+    """
     dfs = {}
     for year in YEARS:
-        print(year)
+        logger.info(f"Processing year: {year}")
         columns = CSV_COLUMNS_BY_YEAR[year]
-        # request file
-        zip_filename = ZIP_FILENAMES[year]
-        if zip_filename is not None:
-            response = requests.get(
-                f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip')
-            with zipfile.ZipFile(io.BytesIO(response.content())) as zfile:
-                with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                    dfs[year] = pd.read_csv(newfile, usecols=columns)
-        # some years are not zipped
+        zip_filename = ZIP_FILENAMES.get(year, None)
+
+        url = build_url(year, zip_filename)
+
+        logger.info(f"Requesting file: {url}")
+        # NOTE(review): verify=False disables TLS certificate verification for
+        # this download. Confirm it is really required for this host and
+        # remove it if the certificate chain validates.
+        response = requests.get(url, verify=False)
+
+        if response.status_code != 200:
+            logger.error(
+                f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+            )
+            # Nothing was stored in dfs[year]; skip the rename below, which
+            # would otherwise raise KeyError for this year.
+            continue
+
+        if zip_filename:
+            with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
+                with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
+                    dfs[year] = pd.read_csv(newfile,
+                                            engine='python',
+                                            encoding='latin1',
+                                            usecols=columns)
+        else:
+            dfs[year] = pd.read_csv(io.StringIO(response.text),
+                                    sep=',',
+                                    usecols=columns)
+        logger.info(f"File downloaded and processed for {year} successfully")
+
+        # Rename columns to match other years. 2024 only has the reduced
+        # column set, so it is zipped against NORM_CSV_COLUMNS1.
+        if year == '2024':
+            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
         else:
-            response = requests.get(
-                f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv')
-            dfs[year] = pd.read_csv(response, usecols=columns)
-        # rename weird column names to match other years
-        if columns != NORM_CSV_COLUMNS:
             cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
-            dfs[year] = dfs[year].rename(columns=cols_renamed)
+        dfs[year] = dfs[year].rename(columns=cols_renamed)
+        logger.info(f"Columns renamed for {year} successfully")
 
-    write_csv(dfs, 'ejscreen_airpollutants.csv')
-    write_tmcf('ejscreen.tmcf')
+    if not dfs:
+        # Every download failed; there is no data to write.
+        raise RuntimeError("No EJSCREEN data could be downloaded for any year")
+
+    # Write the outputs once, after the loop, instead of on every iteration.
+    logger.info("Writing data to CSV")
+    write_csv(dfs, 'ejscreen_airpollutants.csv')
+    logger.info("Writing template to TMCF")
+    write_tmcf('ejscreen.tmcf')
+    logger.info("Process completed successfully")
+
+
+# Script entry point: absl's app.run parses command-line flags and then
+# calls main().
+if __name__ == '__main__':
+    app.run(main)