From cf20c177ff8263ffee53380535b12272f2605841 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 15 Jan 2025 11:09:31 +0000
Subject: [PATCH 01/21] Ejscreen semi-automatic

---
 scripts/us_epa/ejscreen/config.json      |  93 ++++++++
 scripts/us_epa/ejscreen/ejscreen.py      | 266 ++++++++++++++---------
 scripts/us_epa/ejscreen/ejscreen_test.py |   2 +-
 3 files changed, 261 insertions(+), 100 deletions(-)
 create mode 100644 scripts/us_epa/ejscreen/config.json

diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json
new file mode 100644
index 0000000000..b84673a204
--- /dev/null
+++ b/scripts/us_epa/ejscreen/config.json
@@ -0,0 +1,93 @@
+{
+    "YEARS": ["2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
+    "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
+    "CSV_COLUMNS_BY_YEAR": {
+      "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
+      "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+      "2024": ["ID", "DSLPM", "OZONE", "PM25"]
+    },
+    "ZIP_FILENAMES": {
+      "2015": "EJSCREEN_20150505.csv",
+      "2016": "EJSCREEN_V3_USPR_090216_CSV",
+      "2017": null,
+      "2018": "EJSCREEN_2018_USPR_csv",
+      "2019": "EJSCREEN_2019_USPR.csv",
+      "2020": "EJSCREEN_2020_USPR.csv",
+      "2021": "EJSCREEN_2021_USPR.csv",
+      "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
+      "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
+      "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
+    },
+    "FILENAMES": {
+      "2015": "EJSCREEN_20150505",
+      "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
+      "2017": "EJSCREEN_2017_USPR_Public",
+      "2018": "EJSCREEN_Full_USPR_2018",
+      "2019": "EJSCREEN_2019_USPR",
+      "2020": "EJSCREEN_2020_USPR",
+      "2021": "EJSCREEN_2021_USPR",
+      "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
+      "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
+      "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
+    },
+    "TEMPLATE_MCF": [
+      {
+        "Node": "E:ejscreen_airpollutants->E0",
+        "typeOf": "dcs:StatVarObservation",
+        "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
+        "observationDate": "C:ejscreen_airpollutants->year",
+        "observationAbout": "C:ejscreen_airpollutants->FIPS",
+        "observationPeriod": "dcs:P1Y",
+        "value": "C:ejscreen_airpollutants->DSLPM",
+        "unit": "dcs:MicrogramsPerCubicMeter"
+      },
+      {
+        "Node": "E:ejscreen_airpollutants->E1",
+        "typeOf": "dcs:StatVarObservation",
+        "variableMeasured": "dcs:AirPollutant_Cancer_Risk",
+        "observationDate": "C:ejscreen_airpollutants->year",
+        "observationAbout": "C:ejscreen_airpollutants->FIPS",
+        "observationPeriod": "dcs:P1Y",
+        "value": "C:ejscreen_airpollutants->CANCER",
+        "unit": "dcs:PerMillionPerson"
+      },
+      {
+        "Node": "E:ejscreen_airpollutants->E2",
+        "typeOf": "dcs:StatVarObservation",
+        "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
+        "observationDate": "C:ejscreen_airpollutants->year",
+        "observationAbout": "C:ejscreen_airpollutants->FIPS",
+        "observationPeriod": "dcs:P1Y",
+        "value": "C:ejscreen_airpollutants->RESP"
+      },
+      {
+        "Node": "E:ejscreen_airpollutants->E3",
+        "typeOf": "dcs:StatVarObservation",
+        "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
+        "observationDate": "C:ejscreen_airpollutants->year",
+        "observationAbout": "C:ejscreen_airpollutants->FIPS",
+        "observationPeriod": "dcs:P1Y",
+        "value": "C:ejscreen_airpollutants->OZONE",
+        "unit": "dcs:PartsPerBillion"
+      },
+      {
+        "Node": "E:ejscreen_airpollutants->E4",
+        "typeOf": "dcs:StatVarObservation",
+        "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
+        "observationDate": "C:ejscreen_airpollutants->year",
+        "observationAbout": "C:ejscreen_airpollutants->FIPS",
+        "observationPeriod": "dcs:P1Y",
+        "value": "C:ejscreen_airpollutants->PM25",
+        "unit": "dcs:MicrogramsPerCubicMeter"
+      }
+    ]
+  }
+  
\ No newline at end of file
diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index ea9f15ed3b..b52462e42c 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -1,101 +1,35 @@
-'''
-Generates cleaned CSV for the EPA EJSCREEN data and TMCF.
-Usage: python3 ejscreen.py
-'''
-
 import io
 import zipfile
 import requests
 import pandas as pd
+import json
+from absl import logging
+
+logging.set_verbosity(logging.INFO)
+logger = logging
 
-YEARS = ['2015', '2016', '2017', '2018', '2019', '2020']
-
-NORM_CSV_COLUMNS = ['ID', 'DSLPM', 'CANCER', 'RESP', 'OZONE', 'PM25']
-
-# 2015 has different csv column names
-CSV_COLUMNS_BY_YEAR = {
-    '2015': ['FIPS', 'dpm', 'cancer', 'resp', 'o3', 'pm'],
-    '2016': NORM_CSV_COLUMNS,
-    '2017': NORM_CSV_COLUMNS,
-    '2018': NORM_CSV_COLUMNS,
-    '2019': NORM_CSV_COLUMNS,
-    '2020': NORM_CSV_COLUMNS
-}
-
-ZIP_FILENAMES = {
-    '2015': 'EJSCREEN_20150505.csv',
-    '2016': 'EJSCREEN_V3_USPR_090216_CSV',
-    '2017': None,
-    '2018': 'EJSCREEN_2018_USPR_csv',
-    '2019': 'EJSCREEN_2019_USPR.csv',
-    '2020': 'EJSCREEN_2020_USPR.csv'
-}
-
-FILENAMES = {
-    '2015': 'EJSCREEN_20150505',
-    '2016': 'EJSCREEN_Full_V3_USPR_TSDFupdate',
-    '2017': 'EJSCREEN_2017_USPR_Public',
-    '2018': 'EJSCREEN_Full_USPR_2018',
-    '2019': 'EJSCREEN_2019_USPR',
-    '2020': 'EJSCREEN_2020_USPR'
-}
-
-TEMPLATE_MCF = '''
-Node: E:ejscreen_airpollutants->E0
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_DieselPM
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->DSLPM
-unit: dcs:MicrogramsPerCubicMeter
-
-Node: E:ejscreen_airpollutants->E1
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:AirPollutant_Cancer_Risk
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->CANCER
-
-Node: E:ejscreen_airpollutants->E2
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:AirPollutant_Respiratory_Hazard
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->RESP
-
-Node: E:ejscreen_airpollutants->E3
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_Ozone
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->OZONE
-unit: dcs:PartsPerBillion
-
-Node: E:ejscreen_airpollutants->E4
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_PM2.5
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->PM25
-unit: dcs:MicrogramsPerCubicMeter
-'''
+# Load configuration from config.json
+with open('config.json', 'r') as f:
+    config = json.load(f)
 
+YEARS = config["YEARS"]
+NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
+NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
+CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
+ZIP_FILENAMES = config["ZIP_FILENAMES"]
+FILENAMES = config["FILENAMES"]
+TEMPLATE_MCF = config["TEMPLATE_MCF"]
 
 # data: dictionary of dataframes in the format {year: dataframe}
 # outfilename: name of the csv that data will be written to
 # write_csv concatenates the dataframe from each year together
+
+
 def write_csv(data, outfilename):
     full_df = pd.DataFrame()
     for curr_year, one_year_df in data.items():
-        one_year_df['year'] = curr_year  # add year column
-        full_df = pd.concat(
-            [full_df, one_year_df],
-            ignore_index=True)  # concatenate year onto larger dataframe
+        one_year_df['year'] = curr_year
+        full_df = pd.concat([full_df, one_year_df], ignore_index=True)
 
     # sort by FIPS and make into dcid
     full_df = full_df.rename(columns={'ID': 'FIPS'})
@@ -115,25 +49,159 @@ def write_tmcf(outfilename):
 if __name__ == '__main__':
     dfs = {}
     for year in YEARS:
-        print(year)
+        logger.info(year)
         columns = CSV_COLUMNS_BY_YEAR[year]
-        # request file
         zip_filename = ZIP_FILENAMES[year]
+
         if zip_filename is not None:
-            response = requests.get(
-                f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip')
-            with zipfile.ZipFile(io.BytesIO(response.content())) as zfile:
-                with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                    dfs[year] = pd.read_csv(newfile, usecols=columns)
-        # some years are not zipped
+            if year == '2024':
+                url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip'
+            elif year == '2023':
+                url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip'
+            else:
+                url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip'
+
+            logger.info(f"Requesting file: {url}")
+            response = requests.get(url, verify=False)
+
+            if response.status_code == 200:
+                with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
+                    with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
+                        dfs[year] = pd.read_csv(newfile,
+                                                engine='python',
+                                                encoding='latin1',
+                                                usecols=columns)
+            else:
+                logger.error(
+                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+                )
+
+        else:
+            url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv'
+            logger.info(f"Requesting CSV file: {url}")
+            response = requests.get(url, verify=False)
+
+            if response.status_code == 200:
+                dfs[year] = pd.read_csv(io.StringIO(response.text),
+                                        sep=',',
+                                        usecols=columns)
+            else:
+                logger.error(
+                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
+                )
+
+        # Rename weird column names to match other years
+        if year == '2024':
+            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
         else:
-            response = requests.get(
-                f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv')
-            dfs[year] = pd.read_csv(response, usecols=columns)
-        # rename weird column names to match other years
-        if columns != NORM_CSV_COLUMNS:
             cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
-            dfs[year] = dfs[year].rename(columns=cols_renamed)
 
+        dfs[year] = dfs[year].rename(columns=cols_renamed)
+
+    write_csv(dfs, 'ejscreen_airpollutants.csv')
+    write_tmcf('ejscreen.tmcf')
+logger.info("Loading configuration from config.json")
+with open('config.json', 'r') as f:
+    config = json.load(f)
+logger.info("Configuration loaded successfully")
+
+YEARS = config["YEARS"]
+logger.info(f"Processing years: {YEARS}")
+
+NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
+NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
+CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
+ZIP_FILENAMES = config["ZIP_FILENAMES"]
+FILENAMES = config["FILENAMES"]
+TEMPLATE_MCF = config["TEMPLATE_MCF"]
+
+logger.info("Dataframes initialized")
+
+
+def write_csv(data, outfilename):
+    logger.info(f"Writing data to {outfilename}")
+    full_df = pd.DataFrame()
+    for curr_year, one_year_df in data.items():
+        one_year_df['year'] = curr_year
+        full_df = pd.concat([full_df, one_year_df], ignore_index=True)
+
+    # sort by FIPS and make into dcid
+    full_df = full_df.rename(columns={'ID': 'FIPS'})
+    full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
+    full_df['FIPS'] = 'dcid:geoId/' + (
+        full_df['FIPS'].astype(str).str.zfill(12))
+    full_df = full_df.fillna('')
+    full_df = full_df.replace('None', '')
+    full_df.to_csv(outfilename, index=False)
+    logger.info(f"Data written to {outfilename} successfully")
+
+
+def write_tmcf(outfilename):
+    logger.info(f"Writing template to {outfilename}")
+    with open(outfilename, 'w') as f_out:
+        f_out.write(TEMPLATE_MCF)
+    logger.info(f"Template written to {outfilename} successfully")
+
+
+if __name__ == '__main__':
+    dfs = {}
+    for year in YEARS:
+        logger.info(f"Processing year: {year}")
+        columns = CSV_COLUMNS_BY_YEAR[year]
+        zip_filename = ZIP_FILENAMES[year]
+
+        if zip_filename is not None:
+            if year == '2024':
+                url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip'
+            elif year == '2023':
+                url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip'
+            else:
+                url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip'
+
+            logger.info(f"Requesting file: {url}")
+            response = requests.get(url, verify=False)
+
+            if response.status_code == 200:
+                with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
+                    with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
+                        dfs[year] = pd.read_csv(newfile,
+                                                engine='python',
+                                                encoding='latin1',
+                                                usecols=columns)
+                logger.info(
+                    f"File downloaded and processed for {year} successfully")
+            else:
+                logger.error(
+                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+                )
+
+        else:
+            url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv'
+            logger.info(f"Requesting CSV file: {url}")
+            response = requests.get(url, verify=False)
+
+            if response.status_code == 200:
+                dfs[year] = pd.read_csv(io.StringIO(response.text),
+                                        sep=',',
+                                        usecols=columns)
+                logger.info(
+                    f"CSV downloaded and processed for {year} successfully")
+            else:
+                logger.error(
+                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
+                )
+
+        # Rename weird column names to match other years
+        if year == '2024':
+            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
+        else:
+            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
+
+        dfs[year] = dfs[year].rename(columns=cols_renamed)
+        logger.info(f"Columns renamed for {year} successfully")
+
+    logger.info("Writing data to csv")
     write_csv(dfs, 'ejscreen_airpollutants.csv')
+    logger.info("Writing template to tmcf")
     write_tmcf('ejscreen.tmcf')
+    logger.info("Process completed successfully")
diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 7965671493..b6bb505f5d 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -7,7 +7,7 @@
 import os
 import tempfile
 import pandas as pd
-from .ejscreen import write_csv
+from ejscreen import write_csv
 
 module_dir_ = os.path.dirname(__file__)
 

From 1b0c74d2c3e8be811a6bcef82cc40e1c3560c5cd Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 15 Jan 2025 11:21:23 +0000
Subject: [PATCH 02/21] Ejscreen semiautomatic

---
 scripts/us_epa/ejscreen/ejscreen_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index b6bb505f5d..7965671493 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -7,7 +7,7 @@
 import os
 import tempfile
 import pandas as pd
-from ejscreen import write_csv
+from .ejscreen import write_csv
 
 module_dir_ = os.path.dirname(__file__)
 

From 9f2cf3c5618b8aa1fa0c584f2f0fab092837f1b1 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 15 Jan 2025 11:33:00 +0000
Subject: [PATCH 03/21] Ejscreen semiautomatic

---
 scripts/us_epa/ejscreen/ejscreen_test.py | 26 ++++++++++++++----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 7965671493..c7e154863f 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -2,33 +2,37 @@
 Unit tests for ejscreen.py
 Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py"
 '''
-
 import unittest
 import os
 import tempfile
 import pandas as pd
-from .ejscreen import write_csv
+from ejscreen import write_csv  
 
 module_dir_ = os.path.dirname(__file__)
 
-
 class TestEjscreen(unittest.TestCase):
 
     def test_write_csv(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
+            # Ensure test data file exists in the expected directory
+            test_data_file = os.path.join(module_dir_, 'test_data/test_data.csv')
+            expected_data_file = os.path.join(module_dir_, 'test_data/test_data_expected.csv')
+            
+            if not os.path.exists(test_data_file) or not os.path.exists(expected_data_file):
+                raise FileNotFoundError(f"Test data files are missing: {test_data_file}, {expected_data_file}")
+
             dfs = {}
-            dfs['2020'] = pd.read_csv(os.path.join(module_dir_,
-                                                   'test_data/test_data.csv'),
-                                      float_precision='high')
+            dfs['2020'] = pd.read_csv(test_data_file, float_precision='high')
             test_csv = os.path.join(tmp_dir, 'test_csv.csv')
             write_csv(dfs, test_csv)
-            expected_csv = os.path.join(module_dir_,
-                                        'test_data/test_data_expected.csv')
+            
             with open(test_csv, 'r') as test:
-                test_str: str = test.read()
-                with open(expected_csv, 'r') as expected:
-                    expected_str: str = expected.read()
+                test_str = test.read()
+                with open(expected_data_file, 'r') as expected:
+                    expected_str = expected.read()
                     self.assertEqual(test_str, expected_str)
+
+            # Remove temporary test file after assertion
             os.remove(test_csv)
 
 

From c0073bce9d7a674f6c2aa29911cbcd3ca28eee96 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 15 Jan 2025 11:40:37 +0000
Subject: [PATCH 04/21] Ejscreen semiautomatic

---
 scripts/us_epa/ejscreen/ejscreen_test.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index c7e154863f..6ef215db44 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -6,26 +6,32 @@
 import os
 import tempfile
 import pandas as pd
-from ejscreen import write_csv  
+from ejscreen import write_csv
 
 module_dir_ = os.path.dirname(__file__)
 
+
 class TestEjscreen(unittest.TestCase):
 
     def test_write_csv(self):
         with tempfile.TemporaryDirectory() as tmp_dir:
             # Ensure test data file exists in the expected directory
-            test_data_file = os.path.join(module_dir_, 'test_data/test_data.csv')
-            expected_data_file = os.path.join(module_dir_, 'test_data/test_data_expected.csv')
-            
-            if not os.path.exists(test_data_file) or not os.path.exists(expected_data_file):
-                raise FileNotFoundError(f"Test data files are missing: {test_data_file}, {expected_data_file}")
+            test_data_file = os.path.join(module_dir_,
+                                          'test_data/test_data.csv')
+            expected_data_file = os.path.join(
+                module_dir_, 'test_data/test_data_expected.csv')
+
+            if not os.path.exists(test_data_file) or not os.path.exists(
+                    expected_data_file):
+                raise FileNotFoundError(
+                    f"Test data files are missing: {test_data_file}, {expected_data_file}"
+                )
 
             dfs = {}
             dfs['2020'] = pd.read_csv(test_data_file, float_precision='high')
             test_csv = os.path.join(tmp_dir, 'test_csv.csv')
             write_csv(dfs, test_csv)
-            
+
             with open(test_csv, 'r') as test:
                 test_str = test.read()
                 with open(expected_data_file, 'r') as expected:

From 32b47d4427ac737c6f165d3acfb945e271db2f5a Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 15 Jan 2025 12:09:27 +0000
Subject: [PATCH 05/21] Ejscreen semiautomatic

---
 scripts/us_epa/ejscreen/ejscreen_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 6ef215db44..3e1e80af70 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -6,7 +6,7 @@
 import os
 import tempfile
 import pandas as pd
-from ejscreen import write_csv
+from .ejscreen import write_csv
 
 module_dir_ = os.path.dirname(__file__)
 

From 644c0fe2eaec14a43f3cbb648b968883cb401d06 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 15 Jan 2025 12:31:28 +0000
Subject: [PATCH 06/21] Ejscreen semiautomatic

---
 scripts/us_epa/ejscreen/ejscreen.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index b52462e42c..cf1bbd7bfd 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -24,6 +24,12 @@
 # outfilename: name of the csv that data will be written to
 # write_csv concatenates the dataframe from each year together
 
+# def read_config():
+#     # Load configuration from config.json
+#     with open('config.json', 'r') as f:
+#         config = json.load(f)
+#     return config
+
 
 def write_csv(data, outfilename):
     full_df = pd.DataFrame()
@@ -100,10 +106,6 @@ def write_tmcf(outfilename):
 
     write_csv(dfs, 'ejscreen_airpollutants.csv')
     write_tmcf('ejscreen.tmcf')
-logger.info("Loading configuration from config.json")
-with open('config.json', 'r') as f:
-    config = json.load(f)
-logger.info("Configuration loaded successfully")
 
 YEARS = config["YEARS"]
 logger.info(f"Processing years: {YEARS}")

From c968571de172ce3034eeb01b426190a17090eb97 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 15 Jan 2025 13:04:47 +0000
Subject: [PATCH 07/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/ejscreen.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index cf1bbd7bfd..264c5eb2a0 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -1,4 +1,5 @@
 import io
+import os
 import zipfile
 import requests
 import pandas as pd
@@ -8,8 +9,11 @@
 logging.set_verbosity(logging.INFO)
 logger = logging
 
+_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
+_CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json')
+
 # Load configuration from config.json
-with open('config.json', 'r') as f:
+with open(_CONFIG_PATH, 'r') as f:
     config = json.load(f)
 
 YEARS = config["YEARS"]
@@ -24,12 +28,6 @@
 # outfilename: name of the csv that data will be written to
 # write_csv concatenates the dataframe from each year together
 
-# def read_config():
-#     # Load configuration from config.json
-#     with open('config.json', 'r') as f:
-#         config = json.load(f)
-#     return config
-
 
 def write_csv(data, outfilename):
     full_df = pd.DataFrame()

From ebbd9a46f1860d53006ba4f021db4e1a09306073 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Thu, 16 Jan 2025 10:03:47 +0000
Subject: [PATCH 08/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/config.json      |   2 +
 scripts/us_epa/ejscreen/ejscreen.py      | 154 +++++------------------
 scripts/us_epa/ejscreen/ejscreen_test.py |  16 ++-
 scripts/us_epa/ejscreen/manifest.json    |  23 ++++
 4 files changed, 69 insertions(+), 126 deletions(-)
 create mode 100644 scripts/us_epa/ejscreen/manifest.json

diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json
index b84673a204..9375ee2504 100644
--- a/scripts/us_epa/ejscreen/config.json
+++ b/scripts/us_epa/ejscreen/config.json
@@ -38,6 +38,8 @@
       "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
       "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
     },
+    "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip",
+    "URL_TEMPLATE_NON_ZIPPED": "https://gaftp.epa.gov/EJSCREEN/{year}/{filename}.csv",
     "TEMPLATE_MCF": [
       {
         "Node": "E:ejscreen_airpollutants->E0",
diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 264c5eb2a0..83c1e3bfde 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -1,3 +1,18 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
 import io
 import os
 import zipfile
@@ -23,12 +38,13 @@
 ZIP_FILENAMES = config["ZIP_FILENAMES"]
 FILENAMES = config["FILENAMES"]
 TEMPLATE_MCF = config["TEMPLATE_MCF"]
+URL_TEMPLATE = config["URL_TEMPLATE"]
+URL_TEMPLATE_NON_ZIPPED = config["URL_TEMPLATE_NON_ZIPPED"]
 
 # data: dictionary of dataframes in the format {year: dataframe}
 # outfilename: name of the csv that data will be written to
 # write_csv concatenates the dataframe from each year together
 
-
 def write_csv(data, outfilename):
     full_df = pd.DataFrame()
     for curr_year, one_year_df in data.items():
@@ -44,152 +60,40 @@ def write_csv(data, outfilename):
     full_df = full_df.replace('None', '')
     full_df.to_csv(outfilename, index=False)
 
-
 def write_tmcf(outfilename):
     with open(outfilename, 'w') as f_out:
         f_out.write(TEMPLATE_MCF)
 
-
-if __name__ == '__main__':
-    dfs = {}
-    for year in YEARS:
-        logger.info(year)
-        columns = CSV_COLUMNS_BY_YEAR[year]
-        zip_filename = ZIP_FILENAMES[year]
-
-        if zip_filename is not None:
-            if year == '2024':
-                url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip'
-            elif year == '2023':
-                url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip'
-            else:
-                url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip'
-
-            logger.info(f"Requesting file: {url}")
-            response = requests.get(url, verify=False)
-
-            if response.status_code == 200:
-                with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
-                    with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                        dfs[year] = pd.read_csv(newfile,
-                                                engine='python',
-                                                encoding='latin1',
-                                                usecols=columns)
-            else:
-                logger.error(
-                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
-                )
-
-        else:
-            url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv'
-            logger.info(f"Requesting CSV file: {url}")
-            response = requests.get(url, verify=False)
-
-            if response.status_code == 200:
-                dfs[year] = pd.read_csv(io.StringIO(response.text),
-                                        sep=',',
-                                        usecols=columns)
-            else:
-                logger.error(
-                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
-                )
-
-        # Rename weird column names to match other years
-        if year == '2024':
-            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
-        else:
-            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
-
-        dfs[year] = dfs[year].rename(columns=cols_renamed)
-
-    write_csv(dfs, 'ejscreen_airpollutants.csv')
-    write_tmcf('ejscreen.tmcf')
-
-YEARS = config["YEARS"]
-logger.info(f"Processing years: {YEARS}")
-
-NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
-NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
-CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
-ZIP_FILENAMES = config["ZIP_FILENAMES"]
-FILENAMES = config["FILENAMES"]
-TEMPLATE_MCF = config["TEMPLATE_MCF"]
-
-logger.info("Dataframes initialized")
-
-
-def write_csv(data, outfilename):
-    logger.info(f"Writing data to {outfilename}")
-    full_df = pd.DataFrame()
-    for curr_year, one_year_df in data.items():
-        one_year_df['year'] = curr_year
-        full_df = pd.concat([full_df, one_year_df], ignore_index=True)
-
-    # sort by FIPS and make into dcid
-    full_df = full_df.rename(columns={'ID': 'FIPS'})
-    full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
-    full_df['FIPS'] = 'dcid:geoId/' + (
-        full_df['FIPS'].astype(str).str.zfill(12))
-    full_df = full_df.fillna('')
-    full_df = full_df.replace('None', '')
-    full_df.to_csv(outfilename, index=False)
-    logger.info(f"Data written to {outfilename} successfully")
-
-
-def write_tmcf(outfilename):
-    logger.info(f"Writing template to {outfilename}")
-    with open(outfilename, 'w') as f_out:
-        f_out.write(TEMPLATE_MCF)
-    logger.info(f"Template written to {outfilename} successfully")
-
-
 if __name__ == '__main__':
     dfs = {}
     for year in YEARS:
         logger.info(f"Processing year: {year}")
         columns = CSV_COLUMNS_BY_YEAR[year]
-        zip_filename = ZIP_FILENAMES[year]
-
-        if zip_filename is not None:
-            if year == '2024':
-                url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip'
-            elif year == '2023':
-                url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip'
-            else:
-                url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip'
+        zip_filename = ZIP_FILENAMES.get(year, None)
 
+        # Check if the year has a zip file or not
+        if zip_filename:
+            url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename)
             logger.info(f"Requesting file: {url}")
             response = requests.get(url, verify=False)
 
             if response.status_code == 200:
                 with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                     with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                        dfs[year] = pd.read_csv(newfile,
-                                                engine='python',
-                                                encoding='latin1',
-                                                usecols=columns)
-                logger.info(
-                    f"File downloaded and processed for {year} successfully")
+                        dfs[year] = pd.read_csv(newfile, usecols=columns)
+                logger.info(f"File downloaded and processed for {year} successfully")
             else:
-                logger.error(
-                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
-                )
-
+                logger.error(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
         else:
-            url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv'
+            url = URL_TEMPLATE_NON_ZIPPED.format(year=year, filename=FILENAMES[year])
             logger.info(f"Requesting CSV file: {url}")
             response = requests.get(url, verify=False)
 
             if response.status_code == 200:
-                dfs[year] = pd.read_csv(io.StringIO(response.text),
-                                        sep=',',
-                                        usecols=columns)
-                logger.info(
-                    f"CSV downloaded and processed for {year} successfully")
+                dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns)
+                logger.info(f"CSV downloaded and processed for {year} successfully")
             else:
-                logger.error(
-                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
-                )
+                logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}")
 
         # Rename weird column names to match other years
         if year == '2024':
@@ -204,4 +108,4 @@ def write_tmcf(outfilename):
     write_csv(dfs, 'ejscreen_airpollutants.csv')
     logger.info("Writing template to tmcf")
     write_tmcf('ejscreen.tmcf')
-    logger.info("Process completed successfully")
+    logger.info("Process completed successfully")
\ No newline at end of file
diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 3e1e80af70..4d9b386700 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -1,3 +1,17 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 '''
 Unit tests for ejscreen.py
 Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py"
@@ -6,7 +20,7 @@
 import os
 import tempfile
 import pandas as pd
-from .ejscreen import write_csv
+from ejscreen import write_csv
 
 module_dir_ = os.path.dirname(__file__)
 
diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json
new file mode 100644
index 0000000000..6938eaa11f
--- /dev/null
+++ b/scripts/us_epa/ejscreen/manifest.json
@@ -0,0 +1,23 @@
+{
+    "import_specifications": [
+        {
+            "import_name": "EPA_EJSCREEN",
+            "curator_emails": [
+                "rbhande@google.com"
+            ],
+            "provenance_url": "https://gaftp.epa.gov/EJSCREEN/",
+            "provenance_description": "The Census Bureau's Ejscreen data",
+            "scripts": [
+                "ejscreen.py"
+            ],
+            "import_inputs": [
+                {
+                    "template_mcf": "us_epa/ejscreen/ejscreen.tmcf",
+                    "cleaned_csv": "us_epa/ejscreen/ejscreen_airpollutants.csv"
+                }
+            ],
+            "cron_schedule": "0 07 * * 1"
+        }
+    ]
+}
+

From 1702ace085694fbd673ea6415d1b5a3ba856ed80 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Thu, 16 Jan 2025 10:14:42 +0000
Subject: [PATCH 09/21] Ejscreen: reformat ejscreen.py and empty the checked-in ejscreen.tmcf

---
 scripts/us_epa/ejscreen/ejscreen.py      | 27 ++++++++++-----
 scripts/us_epa/ejscreen/ejscreen.tmcf    | 42 ------------------------
 scripts/us_epa/ejscreen/ejscreen_test.py |  1 -
 3 files changed, 19 insertions(+), 51 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 83c1e3bfde..bce739901c 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import io
 import os
 import zipfile
@@ -45,6 +44,7 @@
 # outfilename: name of the csv that data will be written to
 # write_csv concatenates the dataframe from each year together
 
+
 def write_csv(data, outfilename):
     full_df = pd.DataFrame()
     for curr_year, one_year_df in data.items():
@@ -60,10 +60,12 @@ def write_csv(data, outfilename):
     full_df = full_df.replace('None', '')
     full_df.to_csv(outfilename, index=False)
 
+
 def write_tmcf(outfilename):
     with open(outfilename, 'w') as f_out:
         f_out.write(TEMPLATE_MCF)
 
+
 if __name__ == '__main__':
     dfs = {}
     for year in YEARS:
@@ -81,19 +83,28 @@ def write_tmcf(outfilename):
                 with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                     with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
                         dfs[year] = pd.read_csv(newfile, usecols=columns)
-                logger.info(f"File downloaded and processed for {year} successfully")
+                logger.info(
+                    f"File downloaded and processed for {year} successfully")
             else:
-                logger.error(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
+                logger.error(
+                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+                )
         else:
-            url = URL_TEMPLATE_NON_ZIPPED.format(year=year, filename=FILENAMES[year])
+            url = URL_TEMPLATE_NON_ZIPPED.format(year=year,
+                                                 filename=FILENAMES[year])
             logger.info(f"Requesting CSV file: {url}")
             response = requests.get(url, verify=False)
 
             if response.status_code == 200:
-                dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns)
-                logger.info(f"CSV downloaded and processed for {year} successfully")
+                dfs[year] = pd.read_csv(io.StringIO(response.text),
+                                        sep=',',
+                                        usecols=columns)
+                logger.info(
+                    f"CSV downloaded and processed for {year} successfully")
             else:
-                logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}")
+                logger.error(
+                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
+                )
 
         # Rename weird column names to match other years
         if year == '2024':
@@ -108,4 +119,4 @@ def write_tmcf(outfilename):
     write_csv(dfs, 'ejscreen_airpollutants.csv')
     logger.info("Writing template to tmcf")
     write_tmcf('ejscreen.tmcf')
-    logger.info("Process completed successfully")
\ No newline at end of file
+    logger.info("Process completed successfully")
diff --git a/scripts/us_epa/ejscreen/ejscreen.tmcf b/scripts/us_epa/ejscreen/ejscreen.tmcf
index 21ef79d3f3..e69de29bb2 100644
--- a/scripts/us_epa/ejscreen/ejscreen.tmcf
+++ b/scripts/us_epa/ejscreen/ejscreen.tmcf
@@ -1,42 +0,0 @@
-Node: E:ejscreen_airpollutants->E0
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_DieselPM
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->DSLPM
-unit: dcs:MicrogramsPerCubicMeter
-
-Node: E:ejscreen_airpollutants->E1
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:AirPollutant_Cancer_Risk
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->CANCER
-
-Node: E:ejscreen_airpollutants->E2
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:AirPollutant_Respiratory_Hazard
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->RESP
-
-Node: E:ejscreen_airpollutants->E3
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_Ozone
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->OZONE
-unit: dcs:PartsPerBillion
-
-Node: E:ejscreen_airpollutants->E4
-typeOf: dcs:StatVarObservation
-variableMeasured: dcs:Mean_Concentration_AirPollutant_PM2.5
-observationDate: C:ejscreen_airpollutants->year
-observationAbout: C:ejscreen_airpollutants->FIPS
-observationPeriod: dcs:P1Y
-value: C:ejscreen_airpollutants->PM25
-unit: dcs:MicrogramsPerCubicMeter
\ No newline at end of file
diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 4d9b386700..7a6ae60392 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 '''
 Unit tests for ejscreen.py
 Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py"

From 664968513baa3ab97901eea6ff09304b811fd1f8 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Thu, 16 Jan 2025 10:23:18 +0000
Subject: [PATCH 10/21] Ejscreen: restore relative import of write_csv in ejscreen_test.py

---
 scripts/us_epa/ejscreen/ejscreen_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 7a6ae60392..12c98be7a9 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -19,7 +19,7 @@
 import os
 import tempfile
 import pandas as pd
-from ejscreen import write_csv
+from .ejscreen import write_csv
 
 module_dir_ = os.path.dirname(__file__)
 

From 46fe8af81eb1911a4dac37b805d6d70d2a07bf8f Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Thu, 16 Jan 2025 10:28:20 +0000
Subject: [PATCH 11/21] Ejscreen: add blank line after license header in ejscreen_test.py

---
 scripts/us_epa/ejscreen/ejscreen_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 12c98be7a9..8e97d841be 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 '''
 Unit tests for ejscreen.py
 Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py"

From dcb84fd9555a0d245b0e175d434039a7f917ca2d Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Thu, 16 Jan 2025 10:37:03 +0000
Subject: [PATCH 12/21] Ejscreen: remove blank line after license header in ejscreen_test.py

---
 scripts/us_epa/ejscreen/ejscreen_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen_test.py b/scripts/us_epa/ejscreen/ejscreen_test.py
index 8e97d841be..12c98be7a9 100644
--- a/scripts/us_epa/ejscreen/ejscreen_test.py
+++ b/scripts/us_epa/ejscreen/ejscreen_test.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 '''
 Unit tests for ejscreen.py
 Usage: python3 -m unittest discover -v -s ../ -p "ejscreen_test.py"

From 9793b3324ab69c9102b579ef541bdb5a2db3f45f Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Mon, 20 Jan 2025 05:22:47 +0000
Subject: [PATCH 13/21] Ejscreen: add 2023/2024 URL templates and read CSVs as latin1

---
 scripts/us_epa/ejscreen/config.json |  5 ++-
 scripts/us_epa/ejscreen/ejscreen.py | 68 ++++++++++++++++-------------
 2 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json
index 9375ee2504..828cc4a7ae 100644
--- a/scripts/us_epa/ejscreen/config.json
+++ b/scripts/us_epa/ejscreen/config.json
@@ -1,5 +1,5 @@
 {
-    "YEARS": ["2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
+    "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
     "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
     "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
     "CSV_COLUMNS_BY_YEAR": {
@@ -40,6 +40,9 @@
     },
     "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip",
     "URL_TEMPLATE_NON_ZIPPED": "https://gaftp.epa.gov/EJSCREEN/{year}/{filename}.csv",
+    "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip",
+    "URL_TEMPLATE_2023": "https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip",
+    "URL_TEMPLATE_2024": "https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip",
     "TEMPLATE_MCF": [
       {
         "Node": "E:ejscreen_airpollutants->E0",
diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index bce739901c..38f6d46704 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -1,16 +1,16 @@
-# Copyright 2023 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# # Copyright 2023 Google LLC
+# #
+# # Licensed under the Apache License, Version 2.0 (the "License");
+# # you may not use this file except in compliance with the License.
+# # You may obtain a copy of the License at
+# #
+# #     https://www.apache.org/licenses/LICENSE-2.0
+# #
+# # Unless required by applicable law or agreed to in writing, software
+# # distributed under the License is distributed on an "AS IS" BASIS,
+# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# # See the License for the specific language governing permissions and
+# # limitations under the License.
 
 import io
 import os
@@ -38,7 +38,8 @@
 FILENAMES = config["FILENAMES"]
 TEMPLATE_MCF = config["TEMPLATE_MCF"]
 URL_TEMPLATE = config["URL_TEMPLATE"]
-URL_TEMPLATE_NON_ZIPPED = config["URL_TEMPLATE_NON_ZIPPED"]
+URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE)  
+URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE)  
 
 # data: dictionary of dataframes in the format {year: dataframe}
 # outfilename: name of the csv that data will be written to
@@ -75,36 +76,36 @@ def write_tmcf(outfilename):
 
         # Check if the year has a zip file or not
         if zip_filename:
-            url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename)
+            # Select the appropriate URL template based on the year
+            if year == '2023':
+                url = URL_TEMPLATE_2023.format(year=year, zip_filename=zip_filename)
+            elif year == '2024':
+                url = URL_TEMPLATE_2024.format(year=year, zip_filename=zip_filename)
+            else:
+                url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename)
+
             logger.info(f"Requesting file: {url}")
             response = requests.get(url, verify=False)
 
             if response.status_code == 200:
                 with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                     with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                        dfs[year] = pd.read_csv(newfile, usecols=columns)
-                logger.info(
-                    f"File downloaded and processed for {year} successfully")
+                        # Specify encoding to handle special characters
+                        dfs[year] = pd.read_csv(newfile, usecols=columns, encoding='latin1')  # Added encoding='latin1'
+                logger.info(f"File downloaded and processed for {year} successfully")
             else:
-                logger.error(
-                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
-                )
+                logger.error(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
         else:
-            url = URL_TEMPLATE_NON_ZIPPED.format(year=year,
-                                                 filename=FILENAMES[year])
+            url = URL_TEMPLATE.format(year=year, filename=FILENAMES[year])
             logger.info(f"Requesting CSV file: {url}")
             response = requests.get(url, verify=False)
 
             if response.status_code == 200:
-                dfs[year] = pd.read_csv(io.StringIO(response.text),
-                                        sep=',',
-                                        usecols=columns)
-                logger.info(
-                    f"CSV downloaded and processed for {year} successfully")
+                # Specify encoding to handle special characters
+                dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns, encoding='latin1')  # Added encoding='latin1'
+                logger.info(f"CSV downloaded and processed for {year} successfully")
             else:
-                logger.error(
-                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
-                )
+                logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}")
 
         # Rename weird column names to match other years
         if year == '2024':
@@ -120,3 +121,8 @@ def write_tmcf(outfilename):
     logger.info("Writing template to tmcf")
     write_tmcf('ejscreen.tmcf')
     logger.info("Process completed successfully")
+
+
+
+
+

From 6e4669234e1fe149de231905420334f6fd8946d1 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Mon, 20 Jan 2025 05:29:28 +0000
Subject: [PATCH 14/21] Ejscreen: reformat ejscreen.py (line wrapping, trailing whitespace)

---
 scripts/us_epa/ejscreen/ejscreen.py | 39 ++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 38f6d46704..716e97f3ed 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -38,8 +38,8 @@
 FILENAMES = config["FILENAMES"]
 TEMPLATE_MCF = config["TEMPLATE_MCF"]
 URL_TEMPLATE = config["URL_TEMPLATE"]
-URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE)  
-URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE)  
+URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE)
+URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE)
 
 # data: dictionary of dataframes in the format {year: dataframe}
 # outfilename: name of the csv that data will be written to
@@ -78,9 +78,11 @@ def write_tmcf(outfilename):
         if zip_filename:
             # Select the appropriate URL template based on the year
             if year == '2023':
-                url = URL_TEMPLATE_2023.format(year=year, zip_filename=zip_filename)
+                url = URL_TEMPLATE_2023.format(year=year,
+                                               zip_filename=zip_filename)
             elif year == '2024':
-                url = URL_TEMPLATE_2024.format(year=year, zip_filename=zip_filename)
+                url = URL_TEMPLATE_2024.format(year=year,
+                                               zip_filename=zip_filename)
             else:
                 url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename)
 
@@ -91,10 +93,15 @@ def write_tmcf(outfilename):
                 with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                     with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
                         # Specify encoding to handle special characters
-                        dfs[year] = pd.read_csv(newfile, usecols=columns, encoding='latin1')  # Added encoding='latin1'
-                logger.info(f"File downloaded and processed for {year} successfully")
+                        dfs[year] = pd.read_csv(
+                            newfile, usecols=columns,
+                            encoding='latin1')  # Added encoding='latin1'
+                logger.info(
+                    f"File downloaded and processed for {year} successfully")
             else:
-                logger.error(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
+                logger.error(
+                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+                )
         else:
             url = URL_TEMPLATE.format(year=year, filename=FILENAMES[year])
             logger.info(f"Requesting CSV file: {url}")
@@ -102,10 +109,17 @@ def write_tmcf(outfilename):
 
             if response.status_code == 200:
                 # Specify encoding to handle special characters
-                dfs[year] = pd.read_csv(io.StringIO(response.text), sep=',', usecols=columns, encoding='latin1')  # Added encoding='latin1'
-                logger.info(f"CSV downloaded and processed for {year} successfully")
+                dfs[year] = pd.read_csv(
+                    io.StringIO(response.text),
+                    sep=',',
+                    usecols=columns,
+                    encoding='latin1')  # Added encoding='latin1'
+                logger.info(
+                    f"CSV downloaded and processed for {year} successfully")
             else:
-                logger.error(f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}")
+                logger.error(
+                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
+                )
 
         # Rename weird column names to match other years
         if year == '2024':
@@ -121,8 +135,3 @@ def write_tmcf(outfilename):
     logger.info("Writing template to tmcf")
     write_tmcf('ejscreen.tmcf')
     logger.info("Process completed successfully")
-
-
-
-
-

From 218133a8f652b8fd4f37b94a2b6d34a9d7205c8a Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Tue, 21 Jan 2025 06:06:10 +0000
Subject: [PATCH 15/21] Ejscreen: build URLs from BASE_URL/URL_SUFFIX and add config_path flag

---
 scripts/us_epa/ejscreen/config.json   | 186 +++++++++++++-------------
 scripts/us_epa/ejscreen/ejscreen.py   | 148 ++++++++++----------
 scripts/us_epa/ejscreen/ejscreen.tmcf |   5 +
 scripts/us_epa/ejscreen/manifest.json |   8 +-
 4 files changed, 180 insertions(+), 167 deletions(-)

diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json
index 828cc4a7ae..0826dde403 100644
--- a/scripts/us_epa/ejscreen/config.json
+++ b/scripts/us_epa/ejscreen/config.json
@@ -1,98 +1,98 @@
 {
-    "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
-    "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
-    "CSV_COLUMNS_BY_YEAR": {
-      "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
-      "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-      "2024": ["ID", "DSLPM", "OZONE", "PM25"]
+  "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
+  "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+  "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
+  "CSV_COLUMNS_BY_YEAR": {
+    "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
+    "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2024": ["ID", "DSLPM", "OZONE", "PM25"]
+  },
+  "ZIP_FILENAMES": {
+    "2015": "EJSCREEN_20150505.csv",
+    "2016": "EJSCREEN_V3_USPR_090216_CSV",
+    "2017": null,
+    "2018": "EJSCREEN_2018_USPR_csv",
+    "2019": "EJSCREEN_2019_USPR.csv",
+    "2020": "EJSCREEN_2020_USPR.csv",
+    "2021": "EJSCREEN_2021_USPR.csv",
+    "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
+    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
+    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
+  },
+  "FILENAMES": {
+    "2015": "EJSCREEN_20150505",
+    "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
+    "2017": "EJSCREEN_2017_USPR_Public",
+    "2018": "EJSCREEN_Full_USPR_2018",
+    "2019": "EJSCREEN_2019_USPR",
+    "2020": "EJSCREEN_2020_USPR",
+    "2021": "EJSCREEN_2021_USPR",
+    "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
+    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
+    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
+  },
+  "TEMPLATE_MCF": [
+    {
+      "Node": "E:ejscreen_airpollutants->E0",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->DSLPM",
+      "unit": "dcs:MicrogramsPerCubicMeter"
     },
-    "ZIP_FILENAMES": {
-      "2015": "EJSCREEN_20150505.csv",
-      "2016": "EJSCREEN_V3_USPR_090216_CSV",
-      "2017": null,
-      "2018": "EJSCREEN_2018_USPR_csv",
-      "2019": "EJSCREEN_2019_USPR.csv",
-      "2020": "EJSCREEN_2020_USPR.csv",
-      "2021": "EJSCREEN_2021_USPR.csv",
-      "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
-      "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
-      "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
+    {
+      "Node": "E:ejscreen_airpollutants->E1",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:AirPollutant_Cancer_Risk",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->CANCER",
+      "unit": "dcs:PerMillionPerson"
     },
-    "FILENAMES": {
-      "2015": "EJSCREEN_20150505",
-      "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
-      "2017": "EJSCREEN_2017_USPR_Public",
-      "2018": "EJSCREEN_Full_USPR_2018",
-      "2019": "EJSCREEN_2019_USPR",
-      "2020": "EJSCREEN_2020_USPR",
-      "2021": "EJSCREEN_2021_USPR",
-      "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
-      "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
-      "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
+    {
+      "Node": "E:ejscreen_airpollutants->E2",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->RESP"
     },
-    "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip",
-    "URL_TEMPLATE_NON_ZIPPED": "https://gaftp.epa.gov/EJSCREEN/{year}/{filename}.csv",
-    "URL_TEMPLATE": "https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip",
-    "URL_TEMPLATE_2023": "https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip",
-    "URL_TEMPLATE_2024": "https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip",
-    "TEMPLATE_MCF": [
-      {
-        "Node": "E:ejscreen_airpollutants->E0",
-        "typeOf": "dcs:StatVarObservation",
-        "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
-        "observationDate": "C:ejscreen_airpollutants->year",
-        "observationAbout": "C:ejscreen_airpollutants->FIPS",
-        "observationPeriod": "dcs:P1Y",
-        "value": "C:ejscreen_airpollutants->DSLPM",
-        "unit": "dcs:MicrogramsPerCubicMeter"
-      },
-      {
-        "Node": "E:ejscreen_airpollutants->E1",
-        "typeOf": "dcs:StatVarObservation",
-        "variableMeasured": "dcs:AirPollutant_Cancer_Risk",
-        "observationDate": "C:ejscreen_airpollutants->year",
-        "observationAbout": "C:ejscreen_airpollutants->FIPS",
-        "observationPeriod": "dcs:P1Y",
-        "value": "C:ejscreen_airpollutants->CANCER",
-        "unit": "dcs:PerMillionPerson"
-      },
-      {
-        "Node": "E:ejscreen_airpollutants->E2",
-        "typeOf": "dcs:StatVarObservation",
-        "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
-        "observationDate": "C:ejscreen_airpollutants->year",
-        "observationAbout": "C:ejscreen_airpollutants->FIPS",
-        "observationPeriod": "dcs:P1Y",
-        "value": "C:ejscreen_airpollutants->RESP"
-      },
-      {
-        "Node": "E:ejscreen_airpollutants->E3",
-        "typeOf": "dcs:StatVarObservation",
-        "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
-        "observationDate": "C:ejscreen_airpollutants->year",
-        "observationAbout": "C:ejscreen_airpollutants->FIPS",
-        "observationPeriod": "dcs:P1Y",
-        "value": "C:ejscreen_airpollutants->OZONE",
-        "unit": "dcs:PartsPerBillion"
-      },
-      {
-        "Node": "E:ejscreen_airpollutants->E4",
-        "typeOf": "dcs:StatVarObservation",
-        "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
-        "observationDate": "C:ejscreen_airpollutants->year",
-        "observationAbout": "C:ejscreen_airpollutants->FIPS",
-        "observationPeriod": "dcs:P1Y",
-        "value": "C:ejscreen_airpollutants->PM25",
-        "unit": "dcs:MicrogramsPerCubicMeter"
-      }
-    ]
+    {
+      "Node": "E:ejscreen_airpollutants->E3",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->OZONE",
+      "unit": "dcs:PartsPerBillion"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E4",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->PM25",
+      "unit": "dcs:MicrogramsPerCubicMeter"
+    }
+  ]
+  ,
+  "BASE_URL": "https://gaftp.epa.gov/EJSCREEN",
+  "URL_SUFFIX": {
+    "2023": "2.22_September_UseMe",
+    "2024": "2.32_August_UseMe"
   }
-  
\ No newline at end of file
+}
diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 716e97f3ed..87673aa491 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -1,16 +1,16 @@
-# # Copyright 2023 Google LLC
-# #
-# # Licensed under the Apache License, Version 2.0 (the "License");
-# # you may not use this file except in compliance with the License.
-# # You may obtain a copy of the License at
-# #
-# #     https://www.apache.org/licenses/LICENSE-2.0
-# #
-# # Unless required by applicable law or agreed to in writing, software
-# # distributed under the License is distributed on an "AS IS" BASIS,
-# # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# # See the License for the specific language governing permissions and
-# # limitations under the License.
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import io
 import os
@@ -18,10 +18,20 @@
 import requests
 import pandas as pd
 import json
-from absl import logging
+from absl import logging, flags, app
+import sys
+
+_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.join(_MODULE_DIR, '../../../util/'))
+print(_MODULE_DIR)
+import file_util
 
 logging.set_verbosity(logging.INFO)
 logger = logging
+_FLAGS = flags.FLAGS
+flags.DEFINE_string('config_path',
+                    'gs://unresolved_mcf/epa/ejscreen/config.json',
+                    'Path to config file')
 
 _MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
 _CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json')
@@ -37,22 +47,32 @@
 ZIP_FILENAMES = config["ZIP_FILENAMES"]
 FILENAMES = config["FILENAMES"]
 TEMPLATE_MCF = config["TEMPLATE_MCF"]
-URL_TEMPLATE = config["URL_TEMPLATE"]
-URL_TEMPLATE_2023 = config.get("URL_TEMPLATE_2023", URL_TEMPLATE)
-URL_TEMPLATE_2024 = config.get("URL_TEMPLATE_2024", URL_TEMPLATE)
+BASE_URL = config["BASE_URL"]
+URL_SUFFIX = config["URL_SUFFIX"]
+
 
-# data: dictionary of dataframes in the format {year: dataframe}
-# outfilename: name of the csv that data will be written to
-# write_csv concatenates the dataframe from each year together
+# Function to build the correct URL for each year
+def build_url(year, zip_filename=None):
+    if zip_filename:
+        # Construct the URL for the zip file
+        if year in URL_SUFFIX:
+            url = f'{BASE_URL}/{year}/{URL_SUFFIX[year]}/{zip_filename}.zip'
+        else:
+            url = f'{BASE_URL}/{year}/{zip_filename}.zip'
+    else:
+        # Construct the URL for the CSV file
+        url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'
+    return url
 
 
+# Data processing function
 def write_csv(data, outfilename):
     full_df = pd.DataFrame()
     for curr_year, one_year_df in data.items():
         one_year_df['year'] = curr_year
         full_df = pd.concat([full_df, one_year_df], ignore_index=True)
 
-    # sort by FIPS and make into dcid
+    # Sort by FIPS and make into dcid
     full_df = full_df.rename(columns={'ID': 'FIPS'})
     full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
     full_df['FIPS'] = 'dcid:geoId/' + (
@@ -63,65 +83,51 @@ def write_csv(data, outfilename):
 
 
 def write_tmcf(outfilename):
+    # Convert each item in TEMPLATE_MCF to a string, even if it's a dictionary
+    if isinstance(TEMPLATE_MCF, list):
+        # Convert each element to a string if it's not already
+        template_content = "\n".join(str(item) for item in TEMPLATE_MCF)
+    else:
+        template_content = str(
+            TEMPLATE_MCF
+        )  # In case it's not a list, just convert it to a string
+
     with open(outfilename, 'w') as f_out:
-        f_out.write(TEMPLATE_MCF)
+        f_out.write(template_content)
 
 
-if __name__ == '__main__':
+def main(_):
     dfs = {}
     for year in YEARS:
         logger.info(f"Processing year: {year}")
         columns = CSV_COLUMNS_BY_YEAR[year]
         zip_filename = ZIP_FILENAMES.get(year, None)
 
-        # Check if the year has a zip file or not
-        if zip_filename:
-            # Select the appropriate URL template based on the year
-            if year == '2023':
-                url = URL_TEMPLATE_2023.format(year=year,
-                                               zip_filename=zip_filename)
-            elif year == '2024':
-                url = URL_TEMPLATE_2024.format(year=year,
-                                               zip_filename=zip_filename)
-            else:
-                url = URL_TEMPLATE.format(year=year, zip_filename=zip_filename)
+        url = build_url(year, zip_filename)
 
-            logger.info(f"Requesting file: {url}")
-            response = requests.get(url, verify=False)
+        logger.info(f"Requesting file: {url}")
+        response = requests.get(url, verify=False)
 
-            if response.status_code == 200:
+        if response.status_code == 200:
+            if zip_filename:
                 with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                     with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                        # Specify encoding to handle special characters
-                        dfs[year] = pd.read_csv(
-                            newfile, usecols=columns,
-                            encoding='latin1')  # Added encoding='latin1'
-                logger.info(
-                    f"File downloaded and processed for {year} successfully")
+                        dfs[year] = pd.read_csv(newfile,
+                                                engine='python',
+                                                encoding='latin1',
+                                                usecols=columns)
             else:
-                logger.error(
-                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
-                )
+                dfs[year] = pd.read_csv(io.StringIO(response.text),
+                                        sep=',',
+                                        usecols=columns)
+            logger.info(
+                f"File downloaded and processed for {year} successfully")
         else:
-            url = URL_TEMPLATE.format(year=year, filename=FILENAMES[year])
-            logger.info(f"Requesting CSV file: {url}")
-            response = requests.get(url, verify=False)
-
-            if response.status_code == 200:
-                # Specify encoding to handle special characters
-                dfs[year] = pd.read_csv(
-                    io.StringIO(response.text),
-                    sep=',',
-                    usecols=columns,
-                    encoding='latin1')  # Added encoding='latin1'
-                logger.info(
-                    f"CSV downloaded and processed for {year} successfully")
-            else:
-                logger.error(
-                    f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}"
-                )
+            logger.error(
+                f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+            )
 
-        # Rename weird column names to match other years
+        # Rename columns to match other years
         if year == '2024':
             cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
         else:
@@ -130,8 +136,12 @@ def write_tmcf(outfilename):
         dfs[year] = dfs[year].rename(columns=cols_renamed)
         logger.info(f"Columns renamed for {year} successfully")
 
-    logger.info("Writing data to csv")
-    write_csv(dfs, 'ejscreen_airpollutants.csv')
-    logger.info("Writing template to tmcf")
-    write_tmcf('ejscreen.tmcf')
-    logger.info("Process completed successfully")
+        logger.info("Writing data to CSV")
+        write_csv(dfs, 'ejscreen_airpollutants.csv')
+        logger.info("Writing template to TMCF")
+        write_tmcf('ejscreen.tmcf')
+        logger.info("Process completed successfully")
+
+
+if __name__ == '__main__':
+    app.run(main)
diff --git a/scripts/us_epa/ejscreen/ejscreen.tmcf b/scripts/us_epa/ejscreen/ejscreen.tmcf
index e69de29bb2..785c1a0a8b 100644
--- a/scripts/us_epa/ejscreen/ejscreen.tmcf
+++ b/scripts/us_epa/ejscreen/ejscreen.tmcf
@@ -0,0 +1,5 @@
+{'Node': 'E:ejscreen_airpollutants->E0', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:Mean_Concentration_AirPollutant_DieselPM', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->DSLPM', 'unit': 'dcs:MicrogramsPerCubicMeter'}
+{'Node': 'E:ejscreen_airpollutants->E1', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:AirPollutant_Cancer_Risk', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->CANCER', 'unit': 'dcs:PerMillionPerson'}
+{'Node': 'E:ejscreen_airpollutants->E2', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:AirPollutant_Respiratory_Hazard', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->RESP'}
+{'Node': 'E:ejscreen_airpollutants->E3', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:Mean_Concentration_AirPollutant_Ozone', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->OZONE', 'unit': 'dcs:PartsPerBillion'}
+{'Node': 'E:ejscreen_airpollutants->E4', 'typeOf': 'dcs:StatVarObservation', 'variableMeasured': 'dcs:Mean_Concentration_AirPollutant_PM2.5', 'observationDate': 'C:ejscreen_airpollutants->year', 'observationAbout': 'C:ejscreen_airpollutants->FIPS', 'observationPeriod': 'dcs:P1Y', 'value': 'C:ejscreen_airpollutants->PM25', 'unit': 'dcs:MicrogramsPerCubicMeter'}
\ No newline at end of file
diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json
index 6938eaa11f..dbb7caa53c 100644
--- a/scripts/us_epa/ejscreen/manifest.json
+++ b/scripts/us_epa/ejscreen/manifest.json
@@ -2,9 +2,7 @@
     "import_specifications": [
         {
             "import_name": "EPA_EJSCREEN",
-            "curator_emails": [
-                "rbhande@google.com"
-            ],
+            "curator_emails": [],
             "provenance_url": "https://gaftp.epa.gov/EJSCREEN/",
             "provenance_description": "The Census Bureau's Ejscreen data",
             "scripts": [
@@ -12,8 +10,8 @@
             ],
             "import_inputs": [
                 {
-                    "template_mcf": "us_epa/ejscreen/ejscreen.tmcf",
-                    "cleaned_csv": "us_epa/ejscreen/ejscreen_airpollutants.csv"
+                    "template_mcf": "ejscreen.tmcf",
+                    "cleaned_csv": "ejscreen_airpollutants.csv"
                 }
             ],
             "cron_schedule": "0 07 * * 1"

From 6a650927e1c817b2d7c168cbb0f730d66f4c019b Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 22 Jan 2025 09:40:40 +0000
Subject: [PATCH 16/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/config.json   |  98 ----------------
 scripts/us_epa/ejscreen/ejscreen.py   | 163 ++++++++++++++------------
 scripts/us_epa/ejscreen/manifest.json |   4 +-
 3 files changed, 91 insertions(+), 174 deletions(-)
 delete mode 100644 scripts/us_epa/ejscreen/config.json

diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json
deleted file mode 100644
index 0826dde403..0000000000
--- a/scripts/us_epa/ejscreen/config.json
+++ /dev/null
@@ -1,98 +0,0 @@
-{
-  "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
-  "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-  "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
-  "CSV_COLUMNS_BY_YEAR": {
-    "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
-    "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
-    "2024": ["ID", "DSLPM", "OZONE", "PM25"]
-  },
-  "ZIP_FILENAMES": {
-    "2015": "EJSCREEN_20150505.csv",
-    "2016": "EJSCREEN_V3_USPR_090216_CSV",
-    "2017": null,
-    "2018": "EJSCREEN_2018_USPR_csv",
-    "2019": "EJSCREEN_2019_USPR.csv",
-    "2020": "EJSCREEN_2020_USPR.csv",
-    "2021": "EJSCREEN_2021_USPR.csv",
-    "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
-    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
-    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
-  },
-  "FILENAMES": {
-    "2015": "EJSCREEN_20150505",
-    "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
-    "2017": "EJSCREEN_2017_USPR_Public",
-    "2018": "EJSCREEN_Full_USPR_2018",
-    "2019": "EJSCREEN_2019_USPR",
-    "2020": "EJSCREEN_2020_USPR",
-    "2021": "EJSCREEN_2021_USPR",
-    "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
-    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
-    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
-  },
-  "TEMPLATE_MCF": [
-    {
-      "Node": "E:ejscreen_airpollutants->E0",
-      "typeOf": "dcs:StatVarObservation",
-      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
-      "observationDate": "C:ejscreen_airpollutants->year",
-      "observationAbout": "C:ejscreen_airpollutants->FIPS",
-      "observationPeriod": "dcs:P1Y",
-      "value": "C:ejscreen_airpollutants->DSLPM",
-      "unit": "dcs:MicrogramsPerCubicMeter"
-    },
-    {
-      "Node": "E:ejscreen_airpollutants->E1",
-      "typeOf": "dcs:StatVarObservation",
-      "variableMeasured": "dcs:AirPollutant_Cancer_Risk",
-      "observationDate": "C:ejscreen_airpollutants->year",
-      "observationAbout": "C:ejscreen_airpollutants->FIPS",
-      "observationPeriod": "dcs:P1Y",
-      "value": "C:ejscreen_airpollutants->CANCER",
-      "unit": "dcs:PerMillionPerson"
-    },
-    {
-      "Node": "E:ejscreen_airpollutants->E2",
-      "typeOf": "dcs:StatVarObservation",
-      "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
-      "observationDate": "C:ejscreen_airpollutants->year",
-      "observationAbout": "C:ejscreen_airpollutants->FIPS",
-      "observationPeriod": "dcs:P1Y",
-      "value": "C:ejscreen_airpollutants->RESP"
-    },
-    {
-      "Node": "E:ejscreen_airpollutants->E3",
-      "typeOf": "dcs:StatVarObservation",
-      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
-      "observationDate": "C:ejscreen_airpollutants->year",
-      "observationAbout": "C:ejscreen_airpollutants->FIPS",
-      "observationPeriod": "dcs:P1Y",
-      "value": "C:ejscreen_airpollutants->OZONE",
-      "unit": "dcs:PartsPerBillion"
-    },
-    {
-      "Node": "E:ejscreen_airpollutants->E4",
-      "typeOf": "dcs:StatVarObservation",
-      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
-      "observationDate": "C:ejscreen_airpollutants->year",
-      "observationAbout": "C:ejscreen_airpollutants->FIPS",
-      "observationPeriod": "dcs:P1Y",
-      "value": "C:ejscreen_airpollutants->PM25",
-      "unit": "dcs:MicrogramsPerCubicMeter"
-    }
-  ]
-  ,
-  "BASE_URL": "https://gaftp.epa.gov/EJSCREEN",
-  "URL_SUFFIX": {
-    "2023": "2.22_September_UseMe",
-    "2024": "2.32_August_UseMe"
-  }
-}
diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 87673aa491..c38d7a0527 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -1,19 +1,19 @@
 # Copyright 2023 Google LLC
-#
+
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+
 #     https://www.apache.org/licenses/LICENSE-2.0
-#
+
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
 import os
+import io
 import zipfile
 import requests
 import pandas as pd
@@ -23,47 +23,40 @@
 
 _MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.join(_MODULE_DIR, '../../../util/'))
-print(_MODULE_DIR)
 import file_util
 
 logging.set_verbosity(logging.INFO)
 logger = logging
 _FLAGS = flags.FLAGS
+
 flags.DEFINE_string('config_path',
                     'gs://unresolved_mcf/epa/ejscreen/config.json',
                     'Path to config file')
 
-_MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
-_CONFIG_PATH = os.path.join(_MODULE_DIR, 'config.json')
-
-# Load configuration from config.json
-with open(_CONFIG_PATH, 'r') as f:
-    config = json.load(f)
-
-YEARS = config["YEARS"]
-NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
-NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
-CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
-ZIP_FILENAMES = config["ZIP_FILENAMES"]
-FILENAMES = config["FILENAMES"]
-TEMPLATE_MCF = config["TEMPLATE_MCF"]
-BASE_URL = config["BASE_URL"]
-URL_SUFFIX = config["URL_SUFFIX"]
-
-
 # Function to build the correct URL for each year
 def build_url(year, zip_filename=None):
     if zip_filename:
-        # Construct the URL for the zip file
         if year in URL_SUFFIX:
             url = f'{BASE_URL}/{year}/{URL_SUFFIX[year]}/{zip_filename}.zip'
         else:
             url = f'{BASE_URL}/{year}/{zip_filename}.zip'
     else:
-        # Construct the URL for the CSV file
         url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'
     return url
 
+# Download the file and save it in the input folder
+def download_file(url, year, zip_filename=None):
+    response = requests.get(url, verify=False)
+    if response.status_code == 200:
+        input_folder = os.path.join(_MODULE_DIR, 'input')
+        os.makedirs(input_folder, exist_ok=True)  # Create the folder if it doesn't exist
+        
+        file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+        with open(file_path, 'wb') as f:
+            f.write(response.content)
+        logger.info(f"File downloaded and saved as {file_path}")
+    else:
+        logger.fatal(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
 
 # Data processing function
 def write_csv(data, outfilename):
@@ -72,76 +65,98 @@ def write_csv(data, outfilename):
         one_year_df['year'] = curr_year
         full_df = pd.concat([full_df, one_year_df], ignore_index=True)
 
-    # Sort by FIPS and make into dcid
     full_df = full_df.rename(columns={'ID': 'FIPS'})
     full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
-    full_df['FIPS'] = 'dcid:geoId/' + (
-        full_df['FIPS'].astype(str).str.zfill(12))
+    full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12))
     full_df = full_df.fillna('')
     full_df = full_df.replace('None', '')
     full_df.to_csv(outfilename, index=False)
 
-
 def write_tmcf(outfilename):
-    # Convert each item in TEMPLATE_MCF to a string, even if it's a dictionary
     if isinstance(TEMPLATE_MCF, list):
-        # Convert each element to a string if it's not already
         template_content = "\n".join(str(item) for item in TEMPLATE_MCF)
     else:
-        template_content = str(
-            TEMPLATE_MCF
-        )  # In case it's not a list, just convert it to a string
+        template_content = str(TEMPLATE_MCF)
 
     with open(outfilename, 'w') as f_out:
         f_out.write(template_content)
 
-
 def main(_):
-    dfs = {}
-    for year in YEARS:
-        logger.info(f"Processing year: {year}")
-        columns = CSV_COLUMNS_BY_YEAR[year]
-        zip_filename = ZIP_FILENAMES.get(year, None)
-
-        url = build_url(year, zip_filename)
-
-        logger.info(f"Requesting file: {url}")
-        response = requests.get(url, verify=False)
-
-        if response.status_code == 200:
-            if zip_filename:
-                with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
-                    with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                        dfs[year] = pd.read_csv(newfile,
-                                                engine='python',
-                                                encoding='latin1',
-                                                usecols=columns)
-            else:
-                dfs[year] = pd.read_csv(io.StringIO(response.text),
-                                        sep=',',
-                                        usecols=columns)
-            logger.info(
-                f"File downloaded and processed for {year} successfully")
-        else:
-            logger.error(
-                f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
-            )
-
-        # Rename columns to match other years
-        if year == '2024':
-            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
-        else:
-            cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
-
-        dfs[year] = dfs[year].rename(columns=cols_renamed)
-        logger.info(f"Columns renamed for {year} successfully")
-
+    global URL_SUFFIX, BASE_URL, TEMPLATE_MCF, FILENAMES
+
+    try:
+        # Load configuration from config.json
+        with file_util.FileIO(_FLAGS.config_path, 'r') as f:
+            config = json.load(f)
+
+        YEARS = config["YEARS"]
+        NORM_CSV_COLUMNS = config["NORM_CSV_COLUMNS"]
+        NORM_CSV_COLUMNS1 = config["NORM_CSV_COLUMNS1"]
+        CSV_COLUMNS_BY_YEAR = config["CSV_COLUMNS_BY_YEAR"]
+        ZIP_FILENAMES = config["ZIP_FILENAMES"]
+        FILENAMES = config["FILENAMES"]
+        TEMPLATE_MCF = config["TEMPLATE_MCF"]
+        BASE_URL = config["BASE_URL"]
+        URL_SUFFIX = config["URL_SUFFIX"]
+        RENAME_COLUMNS_YEARS = config["RENAME_COLUMNS_YEARS"]
+
+        dfs = {}
+
+        for year in YEARS:
+            try:
+                logger.info(f"Processing year: {year}")
+                columns = CSV_COLUMNS_BY_YEAR[year]
+                zip_filename = ZIP_FILENAMES.get(year, None)
+
+                # If the file for the current year is not already downloaded, download it
+                input_folder = os.path.join(_MODULE_DIR, 'input')
+                file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+
+                # Download if the file is missing
+                if not os.path.exists(file_path):
+                    logger.info(f"File for {year} not found. Downloading...")
+                    url = build_url(year, zip_filename)
+                    download_file(url, year, zip_filename)
+
+                # Process the downloaded file
+                if zip_filename:
+                    with zipfile.ZipFile(file_path, 'r') as zfile:
+                        with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
+                            dfs[year] = pd.read_csv(newfile,
+                                                    engine='python',
+                                                    encoding='latin1',
+                                                    usecols=columns)
+                else:
+                    dfs[year] = pd.read_csv(file_path,
+                                            sep=',',
+                                            usecols=columns)
+
+                logger.info(f"File processed for {year} successfully")
+
+                if year in RENAME_COLUMNS_YEARS:
+                    cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
+                else:
+                    cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
+
+                dfs[year] = dfs[year].rename(columns=cols_renamed)
+                logger.info(f"Columns renamed for {year} successfully")
+
+            except Exception as e:
+                logger.fatal(f"Error processing data for year {year}: {e}")
+                continue
+
+        # Write the combined data and template
         logger.info("Writing data to CSV")
         write_csv(dfs, 'ejscreen_airpollutants.csv')
+
         logger.info("Writing template to TMCF")
         write_tmcf('ejscreen.tmcf')
+
         logger.info("Process completed successfully")
 
+    except Exception as e:
+        logger.fatal(f"Unexpected error in the main process: {e}")
+        sys.exit(1)
 
 if __name__ == '__main__':
     app.run(main)
diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json
index dbb7caa53c..d68f2e67d5 100644
--- a/scripts/us_epa/ejscreen/manifest.json
+++ b/scripts/us_epa/ejscreen/manifest.json
@@ -2,7 +2,7 @@
     "import_specifications": [
         {
             "import_name": "EPA_EJSCREEN",
-            "curator_emails": [],
+            "curator_emails": ["rbhande@google.com"],
             "provenance_url": "https://gaftp.epa.gov/EJSCREEN/",
             "provenance_description": "The Census Bureau's Ejscreen data",
             "scripts": [
@@ -14,7 +14,7 @@
                     "cleaned_csv": "ejscreen_airpollutants.csv"
                 }
             ],
-            "cron_schedule": "0 07 * * 1"
+            "cron_schedule": "0 7 * * 1"
         }
     ]
 }

From 1e71abb60f648423c6a22f6b51d83d816f4342b7 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Wed, 22 Jan 2025 09:46:57 +0000
Subject: [PATCH 17/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/ejscreen.py | 32 ++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index c38d7a0527..58f4f2bd37 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -33,6 +33,7 @@
                     'gs://unresolved_mcf/epa/ejscreen/config.json',
                     'Path to config file')
 
+
 # Function to build the correct URL for each year
 def build_url(year, zip_filename=None):
     if zip_filename:
@@ -44,19 +45,25 @@ def build_url(year, zip_filename=None):
         url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'
     return url
 
+
 # Download the file and save it in the input folder
 def download_file(url, year, zip_filename=None):
     response = requests.get(url, verify=False)
     if response.status_code == 200:
         input_folder = os.path.join(_MODULE_DIR, 'input')
-        os.makedirs(input_folder, exist_ok=True)  # Create the folder if it doesn't exist
-        
-        file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+        os.makedirs(input_folder,
+                    exist_ok=True)  # Create the folder if it doesn't exist
+
+        file_path = os.path.join(
+            input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
         with open(file_path, 'wb') as f:
             f.write(response.content)
         logger.info(f"File downloaded and saved as {file_path}")
     else:
-        logger.fatal(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
+        logger.fatal(
+            f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+        )
+
 
 # Data processing function
 def write_csv(data, outfilename):
@@ -67,11 +74,13 @@ def write_csv(data, outfilename):
 
     full_df = full_df.rename(columns={'ID': 'FIPS'})
     full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
-    full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12))
+    full_df['FIPS'] = 'dcid:geoId/' + (
+        full_df['FIPS'].astype(str).str.zfill(12))
     full_df = full_df.fillna('')
     full_df = full_df.replace('None', '')
     full_df.to_csv(outfilename, index=False)
 
+
 def write_tmcf(outfilename):
     if isinstance(TEMPLATE_MCF, list):
         template_content = "\n".join(str(item) for item in TEMPLATE_MCF)
@@ -81,6 +90,7 @@ def write_tmcf(outfilename):
     with open(outfilename, 'w') as f_out:
         f_out.write(template_content)
 
+
 def main(_):
     global URL_SUFFIX, BASE_URL, TEMPLATE_MCF, FILENAMES
 
@@ -110,7 +120,9 @@ def main(_):
 
                 # If the file for the current year is not already downloaded, download it
                 input_folder = os.path.join(_MODULE_DIR, 'input')
-                file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+                file_path = os.path.join(
+                    input_folder,
+                    f'{year}.zip' if zip_filename else f'{year}.csv')
 
                 # Download if the file is missing
                 if not os.path.exists(file_path):
@@ -121,15 +133,14 @@ def main(_):
                 # Process the downloaded file
                 if zip_filename:
                     with zipfile.ZipFile(file_path, 'r') as zfile:
-                        with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
+                        with zfile.open(f'{FILENAMES[year]}.csv',
+                                        'r') as newfile:
                             dfs[year] = pd.read_csv(newfile,
                                                     engine='python',
                                                     encoding='latin1',
                                                     usecols=columns)
                 else:
-                    dfs[year] = pd.read_csv(file_path,
-                                            sep=',',
-                                            usecols=columns)
+                    dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns)
 
                 logger.info(f"File processed for {year} successfully")
 
@@ -158,5 +169,6 @@ def main(_):
         logger.fatal(f"Unexpected error in the main process: {e}")
         sys.exit(1)
 
+
 if __name__ == '__main__':
     app.run(main)

From a9083f10cc366fff3ad9c6af60996c1d427d103b Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Fri, 24 Jan 2025 06:05:05 +0000
Subject: [PATCH 18/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/README.md     |   7 +-
 scripts/us_epa/ejscreen/config.json   |  99 ++++++++++++++++
 scripts/us_epa/ejscreen/ejscreen.py   | 160 +++++++++++++++-----------
 scripts/us_epa/ejscreen/manifest.json |   2 +-
 4 files changed, 195 insertions(+), 73 deletions(-)
 create mode 100644 scripts/us_epa/ejscreen/config.json

diff --git a/scripts/us_epa/ejscreen/README.md b/scripts/us_epa/ejscreen/README.md
index e531f63ac3..10ddec5e20 100644
--- a/scripts/us_epa/ejscreen/README.md
+++ b/scripts/us_epa/ejscreen/README.md
@@ -20,9 +20,12 @@ which are a small subset of the available EJSCREEN variables.
 
 To generate `ejscreen_airpollutants.csv` and `ejscreen.tmcf` run the following:  
 
-    `python3 ejscreen.py`
+#### Downloading and Processing Data
+To perform "download and process", run the below command: `python3 ejscreen.py`. Running this command generates the input_files and the csv, mcf, and tmcf files.
 
-As of July, 2021 this includes data through the end of 2020.
+If you want to perform "only process", run the below command: `python3 ejscreen.py --mode=process`
+
+If you want to perform "only download", run the below command: `python3 ejscreen.py --mode=download`
 
 ### Unit Tests
 
diff --git a/scripts/us_epa/ejscreen/config.json b/scripts/us_epa/ejscreen/config.json
new file mode 100644
index 0000000000..9bed0a5556
--- /dev/null
+++ b/scripts/us_epa/ejscreen/config.json
@@ -0,0 +1,99 @@
+{
+  "YEARS": [ "2015", "2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024"],
+  "NORM_CSV_COLUMNS": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+  "NORM_CSV_COLUMNS1": ["ID", "DSLPM", "OZONE", "PM25"],
+  "CSV_COLUMNS_BY_YEAR": {
+    "2015": ["FIPS", "dpm", "cancer", "resp", "o3", "pm"],
+    "2016": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2017": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2018": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2019": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2020": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2021": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2022": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2023": ["ID", "DSLPM", "CANCER", "RESP", "OZONE", "PM25"],
+    "2024": ["ID", "DSLPM", "OZONE", "PM25"]
+  },
+  "ZIP_FILENAMES": {
+    "2015": "EJSCREEN_20150505.csv",
+    "2016": "EJSCREEN_V3_USPR_090216_CSV",
+    "2017": null,
+    "2018": "EJSCREEN_2018_USPR_csv",
+    "2019": "EJSCREEN_2019_USPR.csv",
+    "2020": "EJSCREEN_2020_USPR.csv",
+    "2021": "EJSCREEN_2021_USPR.csv",
+    "2022": "EJSCREEN_2022_with_AS_CNMI_GU_VI.csv",
+    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv",
+    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv"
+  },
+  "FILENAMES": {
+    "2015": "EJSCREEN_20150505",
+    "2016": "EJSCREEN_Full_V3_USPR_TSDFupdate",
+    "2017": "EJSCREEN_2017_USPR_Public",
+    "2018": "EJSCREEN_Full_USPR_2018",
+    "2019": "EJSCREEN_2019_USPR",
+    "2020": "EJSCREEN_2020_USPR",
+    "2021": "EJSCREEN_2021_USPR",
+    "2022": "EJSCREEN_2022_Full_with_AS_CNMI_GU_VI",
+    "2023": "EJSCREEN_2023_BG_with_AS_CNMI_GU_VI",
+    "2024": "EJScreen_2024_Tract_with_AS_CNMI_GU_VI"
+  },
+  "TEMPLATE_MCF": [
+    {
+      "Node": "E:ejscreen_airpollutants->E0",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_DieselPM",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->DSLPM",
+      "unit": "dcs:MicrogramsPerCubicMeter"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E1",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:AirPollutant_Cancer_Risk",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->CANCER",
+      "unit": "dcs:PerMillionPerson"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E2",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:AirPollutant_Respiratory_Hazard",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->RESP"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E3",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_Ozone",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->OZONE",
+      "unit": "dcs:PartsPerBillion"
+    },
+    {
+      "Node": "E:ejscreen_airpollutants->E4",
+      "typeOf": "dcs:StatVarObservation",
+      "variableMeasured": "dcs:Mean_Concentration_AirPollutant_PM2.5",
+      "observationDate": "C:ejscreen_airpollutants->year",
+      "observationAbout": "C:ejscreen_airpollutants->FIPS",
+      "observationPeriod": "dcs:P1Y",
+      "value": "C:ejscreen_airpollutants->PM25",
+      "unit": "dcs:MicrogramsPerCubicMeter"
+    }
+  ]
+  ,
+  "BASE_URL": "https://gaftp.epa.gov/EJSCREEN",
+  "URL_SUFFIX": {
+    "2023": "2.22_September_UseMe",
+    "2024": "2.32_August_UseMe"
+  },
+  "RENAME_COLUMNS_YEARS": ["2024"]
+}
diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 58f4f2bd37..04d093c5cb 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -20,6 +20,7 @@
 import json
 from absl import logging, flags, app
 import sys
+import time  # Import time for delay in retries
 
 _MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.join(_MODULE_DIR, '../../../util/'))
@@ -32,7 +33,7 @@
 flags.DEFINE_string('config_path',
                     'gs://unresolved_mcf/epa/ejscreen/config.json',
                     'Path to config file')
-
+flags.DEFINE_string('mode', '', 'Mode of operation: "download" to only download, "process" to only process, leave empty for both.')
 
 # Function to build the correct URL for each year
 def build_url(year, zip_filename=None):
@@ -48,21 +49,32 @@ def build_url(year, zip_filename=None):
 
 # Download the file and save it in the input folder
 def download_file(url, year, zip_filename=None):
-    response = requests.get(url, verify=False)
-    if response.status_code == 200:
-        input_folder = os.path.join(_MODULE_DIR, 'input')
-        os.makedirs(input_folder,
-                    exist_ok=True)  # Create the folder if it doesn't exist
-
-        file_path = os.path.join(
-            input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
-        with open(file_path, 'wb') as f:
-            f.write(response.content)
-        logger.info(f"File downloaded and saved as {file_path}")
-    else:
-        logger.fatal(
-            f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
-        )
+    max_retry = 5 
+    retry_number = 0
+    while retry_number < max_retry:
+        try:
+            response = requests.get(url, verify=False)
+            if response.status_code == 200:
+                input_folder = os.path.join(_MODULE_DIR, 'input')
+                os.makedirs(input_folder, exist_ok=True)  
+
+                file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+                with open(file_path, 'wb') as f:
+                    f.write(response.content)
+                logger.info(f"File downloaded and saved as {file_path}")
+                return  
+            else:
+                logger.fatal(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
+                retry_number += 1
+                time.sleep(5)  
+        except Exception as e:
+            logger.error(f"Error downloading file for {year}: {e}")
+            retry_number += 1
+            time.sleep(5) 
+
+    # If we reached max retries and failed, log the fatal error
+    logger.fatal(f"Failed to download file for {year} after {max_retry} retries.")
+    
 
 
 # Data processing function
@@ -74,8 +86,7 @@ def write_csv(data, outfilename):
 
     full_df = full_df.rename(columns={'ID': 'FIPS'})
     full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
-    full_df['FIPS'] = 'dcid:geoId/' + (
-        full_df['FIPS'].astype(str).str.zfill(12))
+    full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12))
     full_df = full_df.fillna('')
     full_df = full_df.replace('None', '')
     full_df.to_csv(outfilename, index=False)
@@ -112,62 +123,71 @@ def main(_):
 
         dfs = {}
 
-        for year in YEARS:
-            try:
-                logger.info(f"Processing year: {year}")
-                columns = CSV_COLUMNS_BY_YEAR[year]
-                zip_filename = ZIP_FILENAMES.get(year, None)
+        # Download files if the mode is 'download' or if no mode is specified
+        if _FLAGS.mode == "" or _FLAGS.mode == "download":
+            for year in YEARS:
+                try:
+                    logger.info(f"Processing year: {year}")
+                    columns = CSV_COLUMNS_BY_YEAR[year]
+                    zip_filename = ZIP_FILENAMES.get(year, None)
 
-                # If the file for the current year is not already downloaded, download it
-                input_folder = os.path.join(_MODULE_DIR, 'input')
-                file_path = os.path.join(
-                    input_folder,
-                    f'{year}.zip' if zip_filename else f'{year}.csv')
-
-                # Download if the file is missing
-                if not os.path.exists(file_path):
-                    logger.info(f"File for {year} not found. Downloading...")
-                    url = build_url(year, zip_filename)
-                    download_file(url, year, zip_filename)
-
-                # Process the downloaded file
-                if zip_filename:
-                    with zipfile.ZipFile(file_path, 'r') as zfile:
-                        with zfile.open(f'{FILENAMES[year]}.csv',
-                                        'r') as newfile:
-                            dfs[year] = pd.read_csv(newfile,
-                                                    engine='python',
-                                                    encoding='latin1',
-                                                    usecols=columns)
-                else:
-                    dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns)
-
-                logger.info(f"File processed for {year} successfully")
-
-                if year in RENAME_COLUMNS_YEARS:
-                    cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
-                else:
-                    cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
-
-                dfs[year] = dfs[year].rename(columns=cols_renamed)
-                logger.info(f"Columns renamed for {year} successfully")
-
-            except Exception as e:
-                logger.fatal(f"Error processing data for year {year}: {e}")
-                continue
-
-        # Write the combined data and template
-        logger.info("Writing data to CSV")
-        write_csv(dfs, 'ejscreen_airpollutants.csv')
-
-        logger.info("Writing template to TMCF")
-        write_tmcf('ejscreen.tmcf')
-
-        logger.info("Process completed successfully")
+                    input_folder = os.path.join(_MODULE_DIR, 'input')
+                    file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+
+                    if not os.path.exists(file_path):
+                        logger.info(f"File for {year} not found. Downloading...")
+                        url = build_url(year, zip_filename)
+                        download_file(url, year, zip_filename)
+
+                except Exception as e:
+                    logger.fatal(f"Error processing data for year {year}: {e}")
+                    continue
+
+        # Process files if the mode is 'process' or if no mode is specified
+        if _FLAGS.mode == "" or _FLAGS.mode == "process":
+            for year in YEARS:
+                try:
+                    logger.info(f"Processing data for year {year}")
+                    columns = CSV_COLUMNS_BY_YEAR[year]
+                    zip_filename = ZIP_FILENAMES.get(year, None)
+
+                    input_folder = os.path.join(_MODULE_DIR, 'input')
+                    file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+
+                    # Process the downloaded file
+                    if zip_filename:
+                        with zipfile.ZipFile(file_path, 'r') as zfile:
+                            with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
+                                dfs[year] = pd.read_csv(newfile, engine='python', encoding='latin1', usecols=columns)
+                    else:
+                        dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns)
+
+                    logger.info(f"File processed for {year} successfully")
+
+                    if year in RENAME_COLUMNS_YEARS:
+                        cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
+                    else:
+                        cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
+
+                    dfs[year] = dfs[year].rename(columns=cols_renamed)
+                    logger.info(f"Columns renamed for {year} successfully")
+
+                except Exception as e:
+                    logger.fatal(f"Error processing data for year {year}: {e}")
+                    continue
+
+            # Write the combined data and template
+            logger.info("Writing data to CSV")
+            write_csv(dfs, 'ejscreen_airpollutants.csv')
+
+            logger.info("Writing template to TMCF")
+            write_tmcf('ejscreen.tmcf')
+
+            logger.info("Process completed successfully")
 
     except Exception as e:
         logger.fatal(f"Unexpected error in the main process: {e}")
-        sys.exit(1)
+        
 
 
 if __name__ == '__main__':
diff --git a/scripts/us_epa/ejscreen/manifest.json b/scripts/us_epa/ejscreen/manifest.json
index d68f2e67d5..d2bd898d21 100644
--- a/scripts/us_epa/ejscreen/manifest.json
+++ b/scripts/us_epa/ejscreen/manifest.json
@@ -2,7 +2,7 @@
     "import_specifications": [
         {
             "import_name": "EPA_EJSCREEN",
-            "curator_emails": ["rbhande@google.com"],
+            "curator_emails": [],
             "provenance_url": "https://gaftp.epa.gov/EJSCREEN/",
             "provenance_description": "The Census Bureau's Ejscreen data",
             "scripts": [

From 0865095109038189bb1514c174436349382b7271 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Fri, 24 Jan 2025 06:09:12 +0000
Subject: [PATCH 19/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/ejscreen.py | 55 +++++++++++++++++++----------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 04d093c5cb..389f1acc0a 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -33,7 +33,11 @@
 flags.DEFINE_string('config_path',
                     'gs://unresolved_mcf/epa/ejscreen/config.json',
                     'Path to config file')
-flags.DEFINE_string('mode', '', 'Mode of operation: "download" to only download, "process" to only process, leave empty for both.')
+flags.DEFINE_string(
+    'mode', '',
+    'Mode of operation: "download" to only download, "process" to only process, leave empty for both.'
+)
+
 
 # Function to build the correct URL for each year
 def build_url(year, zip_filename=None):
@@ -49,32 +53,36 @@ def build_url(year, zip_filename=None):
 
 # Download the file and save it in the input folder
 def download_file(url, year, zip_filename=None):
-    max_retry = 5 
+    max_retry = 5
     retry_number = 0
     while retry_number < max_retry:
         try:
             response = requests.get(url, verify=False)
             if response.status_code == 200:
                 input_folder = os.path.join(_MODULE_DIR, 'input')
-                os.makedirs(input_folder, exist_ok=True)  
+                os.makedirs(input_folder, exist_ok=True)
 
-                file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+                file_path = os.path.join(
+                    input_folder,
+                    f'{year}.zip' if zip_filename else f'{year}.csv')
                 with open(file_path, 'wb') as f:
                     f.write(response.content)
                 logger.info(f"File downloaded and saved as {file_path}")
-                return  
+                return
             else:
-                logger.fatal(f"Failed to download file for {year}. HTTP Status Code: {response.status_code}")
+                logger.fatal(
+                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+                )
                 retry_number += 1
-                time.sleep(5)  
+                time.sleep(5)
         except Exception as e:
             logger.error(f"Error downloading file for {year}: {e}")
             retry_number += 1
-            time.sleep(5) 
+            time.sleep(5)
 
     # If we reached max retries and failed, log the fatal error
-    logger.fatal(f"Failed to download file for {year} after {max_retry} retries.")
-    
+    logger.fatal(
+        f"Failed to download file for {year} after {max_retry} retries.")
 
 
 # Data processing function
@@ -86,7 +94,8 @@ def write_csv(data, outfilename):
 
     full_df = full_df.rename(columns={'ID': 'FIPS'})
     full_df = full_df.sort_values(by=['FIPS'], ignore_index=True)
-    full_df['FIPS'] = 'dcid:geoId/' + (full_df['FIPS'].astype(str).str.zfill(12))
+    full_df['FIPS'] = 'dcid:geoId/' + (
+        full_df['FIPS'].astype(str).str.zfill(12))
     full_df = full_df.fillna('')
     full_df = full_df.replace('None', '')
     full_df.to_csv(outfilename, index=False)
@@ -132,10 +141,13 @@ def main(_):
                     zip_filename = ZIP_FILENAMES.get(year, None)
 
                     input_folder = os.path.join(_MODULE_DIR, 'input')
-                    file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+                    file_path = os.path.join(
+                        input_folder,
+                        f'{year}.zip' if zip_filename else f'{year}.csv')
 
                     if not os.path.exists(file_path):
-                        logger.info(f"File for {year} not found. Downloading...")
+                        logger.info(
+                            f"File for {year} not found. Downloading...")
                         url = build_url(year, zip_filename)
                         download_file(url, year, zip_filename)
 
@@ -152,15 +164,23 @@ def main(_):
                     zip_filename = ZIP_FILENAMES.get(year, None)
 
                     input_folder = os.path.join(_MODULE_DIR, 'input')
-                    file_path = os.path.join(input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+                    file_path = os.path.join(
+                        input_folder,
+                        f'{year}.zip' if zip_filename else f'{year}.csv')
 
                     # Process the downloaded file
                     if zip_filename:
                         with zipfile.ZipFile(file_path, 'r') as zfile:
-                            with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile:
-                                dfs[year] = pd.read_csv(newfile, engine='python', encoding='latin1', usecols=columns)
+                            with zfile.open(f'{FILENAMES[year]}.csv',
+                                            'r') as newfile:
+                                dfs[year] = pd.read_csv(newfile,
+                                                        engine='python',
+                                                        encoding='latin1',
+                                                        usecols=columns)
                     else:
-                        dfs[year] = pd.read_csv(file_path, sep=',', usecols=columns)
+                        dfs[year] = pd.read_csv(file_path,
+                                                sep=',',
+                                                usecols=columns)
 
                     logger.info(f"File processed for {year} successfully")
 
@@ -187,7 +207,6 @@ def main(_):
 
     except Exception as e:
         logger.fatal(f"Unexpected error in the main process: {e}")
-        
 
 
 if __name__ == '__main__':

From ba95b9a07d5ec94657c83e4776ca3119114eb5a1 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Fri, 24 Jan 2025 06:57:36 +0000
Subject: [PATCH 20/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/ejscreen.py | 63 ++++++++++++++---------------
 1 file changed, 30 insertions(+), 33 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 389f1acc0a..494f115f66 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -20,14 +20,15 @@
 import json
 from absl import logging, flags, app
 import sys
-import time  # Import time for delay in retries
+import time
+from retry import retry
 
 _MODULE_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(os.path.join(_MODULE_DIR, '../../../util/'))
 import file_util
 
 logging.set_verbosity(logging.INFO)
-logger = logging
+
 _FLAGS = flags.FLAGS
 
 flags.DEFINE_string('config_path',
@@ -50,16 +51,16 @@ def build_url(year, zip_filename=None):
         url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'
     return url
 
+@retry(tries=5, delay=5, backoff=5)
+def download_with_retry(url):
+    logging.info(f"Downloading URL : {url}")
+    return requests.get(url=url, verify=False)
 
 # Download the file and save it in the input folder
-def download_file(url, year, zip_filename=None):
-    max_retry = 5
-    retry_number = 0
-    while retry_number < max_retry:
+def download_file(url, year, input_folder, zip_filename=None):
         try:
-            response = requests.get(url, verify=False)
+            response = download_with_retry(url)
             if response.status_code == 200:
-                input_folder = os.path.join(_MODULE_DIR, 'input')
                 os.makedirs(input_folder, exist_ok=True)
 
                 file_path = os.path.join(
@@ -67,23 +68,19 @@ def download_file(url, year, zip_filename=None):
                     f'{year}.zip' if zip_filename else f'{year}.csv')
                 with open(file_path, 'wb') as f:
                     f.write(response.content)
-                logger.info(f"File downloaded and saved as {file_path}")
+                logging.info(f"File downloaded and saved as {file_path}")
                 return
             else:
-                logger.fatal(
-                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code}"
+                logging.fatal(
+                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code} URL : {url}"
                 )
-                retry_number += 1
-                time.sleep(5)
         except Exception as e:
-            logger.error(f"Error downloading file for {year}: {e}")
-            retry_number += 1
-            time.sleep(5)
+            logging.fatal(
+        f"Failed to download file for {year} after {url} .")
 
-    # If we reached max retries and failed, log the fatal error
-    logger.fatal(
-        f"Failed to download file for {year} after {max_retry} retries.")
 
+   
+    
 
 # Data processing function
 def write_csv(data, outfilename):
@@ -131,39 +128,39 @@ def main(_):
         RENAME_COLUMNS_YEARS = config["RENAME_COLUMNS_YEARS"]
 
         dfs = {}
+        input_folder = os.path.join(_MODULE_DIR, 'input')
 
         # Download files if the mode is 'download' or if no mode is specified
         if _FLAGS.mode == "" or _FLAGS.mode == "download":
             for year in YEARS:
                 try:
-                    logger.info(f"Processing year: {year}")
+                    logging.info(f"Processing year: {year}")
                     columns = CSV_COLUMNS_BY_YEAR[year]
                     zip_filename = ZIP_FILENAMES.get(year, None)
 
-                    input_folder = os.path.join(_MODULE_DIR, 'input')
                     file_path = os.path.join(
                         input_folder,
                         f'{year}.zip' if zip_filename else f'{year}.csv')
 
                     if not os.path.exists(file_path):
-                        logger.info(
+                        logging.info(
                             f"File for {year} not found. Downloading...")
                         url = build_url(year, zip_filename)
-                        download_file(url, year, zip_filename)
+                        download_file(url, year, input_folder,zip_filename)
 
                 except Exception as e:
-                    logger.fatal(f"Error processing data for year {year}: {e}")
+                    logging.fatal(f"Error processing data for year {year}: {e}")
                     continue
 
         # Process files if the mode is 'process' or if no mode is specified
         if _FLAGS.mode == "" or _FLAGS.mode == "process":
             for year in YEARS:
                 try:
-                    logger.info(f"Processing data for year {year}")
+                    logging.info(f"Processing data for year {year}")
                     columns = CSV_COLUMNS_BY_YEAR[year]
                     zip_filename = ZIP_FILENAMES.get(year, None)
 
-                    input_folder = os.path.join(_MODULE_DIR, 'input')
+                    
                     file_path = os.path.join(
                         input_folder,
                         f'{year}.zip' if zip_filename else f'{year}.csv')
@@ -182,7 +179,7 @@ def main(_):
                                                 sep=',',
                                                 usecols=columns)
 
-                    logger.info(f"File processed for {year} successfully")
+                    logging.info(f"File processed for {year} successfully")
 
                     if year in RENAME_COLUMNS_YEARS:
                         cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1))
@@ -190,23 +187,23 @@ def main(_):
                         cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS))
 
                     dfs[year] = dfs[year].rename(columns=cols_renamed)
-                    logger.info(f"Columns renamed for {year} successfully")
+                    logging.info(f"Columns renamed for {year} successfully")
 
                 except Exception as e:
-                    logger.fatal(f"Error processing data for year {year}: {e}")
+                    logging.fatal(f"Error processing data for year {year}: {e}")
                     continue
 
             # Write the combined data and template
-            logger.info("Writing data to CSV")
+            logging.info("Writing data to CSV")
             write_csv(dfs, 'ejscreen_airpollutants.csv')
 
-            logger.info("Writing template to TMCF")
+            logging.info("Writing template to TMCF")
             write_tmcf('ejscreen.tmcf')
 
-            logger.info("Process completed successfully")
+            logging.info("Process completed successfully")
 
     except Exception as e:
-        logger.fatal(f"Unexpected error in the main process: {e}")
+        logging.fatal(f"Unexpected error in the main process: {e}")
 
 
 if __name__ == '__main__':

From 6e9dbfae996e287381f5264bb936a419cc1a2749 Mon Sep 17 00:00:00 2001
From: Rohit Bhande <rbhande@google.com>
Date: Fri, 24 Jan 2025 06:58:09 +0000
Subject: [PATCH 21/21] Ejscreen semiautomatic2

---
 scripts/us_epa/ejscreen/ejscreen.py | 42 +++++++++++++----------------
 1 file changed, 19 insertions(+), 23 deletions(-)

diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py
index 494f115f66..904f3a0291 100644
--- a/scripts/us_epa/ejscreen/ejscreen.py
+++ b/scripts/us_epa/ejscreen/ejscreen.py
@@ -51,36 +51,33 @@ def build_url(year, zip_filename=None):
         url = f'{BASE_URL}/{year}/{FILENAMES[year]}.csv'
     return url
 
+
 @retry(tries=5, delay=5, backoff=5)
 def download_with_retry(url):
     logging.info(f"Downloading URL : {url}")
     return requests.get(url=url, verify=False)
 
+
 # Download the file and save it in the input folder
 def download_file(url, year, input_folder, zip_filename=None):
-        try:
-            response = download_with_retry(url)
-            if response.status_code == 200:
-                os.makedirs(input_folder, exist_ok=True)
-
-                file_path = os.path.join(
-                    input_folder,
-                    f'{year}.zip' if zip_filename else f'{year}.csv')
-                with open(file_path, 'wb') as f:
-                    f.write(response.content)
-                logging.info(f"File downloaded and saved as {file_path}")
-                return
-            else:
-                logging.fatal(
-                    f"Failed to download file for {year}. HTTP Status Code: {response.status_code} URL : {url}"
-                )
-        except Exception as e:
+    try:
+        response = download_with_retry(url)
+        if response.status_code == 200:
+            os.makedirs(input_folder, exist_ok=True)
+
+            file_path = os.path.join(
+                input_folder, f'{year}.zip' if zip_filename else f'{year}.csv')
+            with open(file_path, 'wb') as f:
+                f.write(response.content)
+            logging.info(f"File downloaded and saved as {file_path}")
+            return
+        else:
             logging.fatal(
-        f"Failed to download file for {year} after {url} .")
-
+                f"Failed to download file for {year}. HTTP Status Code: {response.status_code} URL : {url}"
+            )
+    except Exception as e:
+        logging.fatal(f"Failed to download file for {year} after {url} .")
 
-   
-    
 
 # Data processing function
 def write_csv(data, outfilename):
@@ -146,7 +143,7 @@ def main(_):
                         logging.info(
                             f"File for {year} not found. Downloading...")
                         url = build_url(year, zip_filename)
-                        download_file(url, year, input_folder,zip_filename)
+                        download_file(url, year, input_folder, zip_filename)
 
                 except Exception as e:
                     logging.fatal(f"Error processing data for year {year}: {e}")
@@ -160,7 +157,6 @@ def main(_):
                     columns = CSV_COLUMNS_BY_YEAR[year]
                     zip_filename = ZIP_FILENAMES.get(year, None)
 
-                    
                     file_path = os.path.join(
                         input_folder,
                         f'{year}.zip' if zip_filename else f'{year}.csv')