Better file organization + demo #17

Merged: 10 commits, Jun 5, 2024
3 changes: 2 additions & 1 deletion .gitignore
@@ -9,4 +9,5 @@ logs/*
nohup.out
*.pyc
nexrad/__pycache__/
.mypy_cache/
.mypy_cache/
temp/
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none"
}
8 changes: 4 additions & 4 deletions README.md
@@ -89,7 +89,7 @@ When you're just rapidly prototyping things, this lag time is annoying.
To speed up, you can batch files like this:

```shell
snakemake all --use-conda --cores 1 --batch all=1/1000
nohup snakemake /home/jd82/RDF/jd82/NEXRAD/2020-11.nc --use-conda --cores all --rerun-incomplete &
```

See [dealing with very large workflows](https://snakemake.readthedocs.io/en/stable/executing/cli.html#dealing-with-very-large-workflows) for more details.
@@ -99,19 +99,19 @@ See [dealing with very large workflows](https://snakemake.readthedocs.io/en/stable/executing/cli.html#dealing-with-very-large-workflows)
If you just want to build the dataset, a good default command to use is

```shell
snakemake all --use-conda --cores all --rerun-incomplete --keep-going
nohup snakemake all --use-conda --cores all --rerun-incomplete --keep-going
```

Note:

* `nohup`: Keep running on a remote machine via `ssh`, even if the connection closes (see [`nohup`](https://www.computerhope.com/unix/unohup.htm) docs).
* `--use-conda`: use anaconda for environments
* `--cores 10`: use 10 cores (out of 12)
* `--cores all`: use all cores (workstation has 12)
* `--rerun-incomplete`: reduces errors if a job was canceled earlier
* `--keep-going`: if a file causes an error, don't give up

If you are on a different machine, you can check how many cores are available with the `lscpu` command.

If you are running on a remote machine via `ssh`, then prepending the command above with [`nohup`](https://www.computerhope.com/unix/unohup.htm) may be a good idea.
In a nutshell, this will keep the process running even after you close your `ssh` session.

### Linters
2 changes: 1 addition & 1 deletion Snakefile
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timedelta
import platform
import os

7 changes: 6 additions & 1 deletion environment.yml
@@ -6,13 +6,18 @@ channels:
dependencies:
- python=3.10 # specify python version
- black # clean code and make it pretty
- cartopy
- dask # needed for open_mfdataset
- ipykernel # for playing around locally
- matplotlib
- mypy # type checking
- netcdf4
- pandas
- pylint # catch errors before you make them
- snakemake # workflow management engine
- snakefmt # format snakefiles correctly
- xarray # N-d labeled array library
- zstandard
- pip # install local packages
- pip:
- -e .
- ipython
27 changes: 15 additions & 12 deletions era5/download_era5_pressure.py
@@ -44,25 +44,28 @@ def main() -> None:
parser.add_argument("--latmax", type=float, default=CONUS[1][1])
args = parser.parse_args()

if args.year >= 1959:
dataset = "reanalysis-era5-pressure-levels"
else:
dataset = "reanalysis-era5-pressure-levels-preliminary-back-extension"
dataset = "reanalysis-era5-pressure-levels"
product_type = "reanalysis"
months = [f"{month:02d}" for month in range(1, 13)] # 01, 02, ..., 12
days = [f"{day}" for day in np.arange(1, 32)] # 1, 2, ..., 31
hours = [f"{hour:02d}:00" for hour in range(24)] # 00:00, 01:00, ... 23:00
bbox = [args.latmax, args.lonmin, args.latmin, args.lonmax]
grid = [args.resolution, args.resolution]

ecmwf_client = cdsapi.Client()
ecmwf_client.retrieve(
dataset,
{
"product_type": "reanalysis",
"product_type": product_type,
"format": "netcdf",
"variable": args.variable,
"pressure_level": args.pressure_level,
"pressure_level": [args.pressure_level],
"year": [args.year],
"month": [f"{month:02d}" for month in range(1, 13)],
"day": [f"{day}" for day in np.arange(1, 31 + 1)],
"time": [f"{hour:02d}:00" for hour in range(24)], # 00:00, 01:00, ... 23:00
"area": [args.latmax, args.lonmin, args.latmin, args.lonmax],
"format": "netcdf",
"grid": [args.resolution, args.resolution],
"month": months,
"day": days,
"time": hours,
"area": bbox,
"grid": grid,
},
args.outfile,
)
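The refactor in `download_era5_pressure.py` pulls the request fields into named locals before the `cdsapi` call. A minimal standalone sketch of what those comprehensions produce (recomputed here without `cdsapi`; the original uses `np.arange` for the days, which yields the same strings as `range`, and the CONUS bounds below are illustrative values, not the script's actual constants):

```python
# Sketch of the request fields assembled before the cdsapi retrieve() call.
months = [f"{month:02d}" for month in range(1, 13)]  # "01", "02", ..., "12"
days = [f"{day}" for day in range(1, 32)]            # "1", "2", ..., "31"
hours = [f"{hour:02d}:00" for hour in range(24)]     # "00:00", "01:00", ..., "23:00"

# The CDS API expects the bounding box ordered [north, west, south, east].
latmin, latmax, lonmin, lonmax = 25.0, 50.0, -125.0, -65.0  # illustrative CONUS-like bounds
bbox = [latmax, lonmin, latmin, lonmax]

resolution = 0.25
grid = [resolution, resolution]

print(months[0], days[-1], hours[-1], bbox)
```

Naming these lists once and reusing them in both download scripts keeps the request dicts short and makes the two scripts easier to diff against each other.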
31 changes: 17 additions & 14 deletions era5/download_era5_single_level.py
@@ -3,8 +3,8 @@
(defaults to CONUS). Storing one year of data in one file makes it easier to work with
and more resilient to errors in downloading.

You need to specify the year, the output file name, and the variable. Optionally, specify
the domain. The default domain is CONUS.
You need to specify the year, the output file name, the variable, and the resolution.
Optionally, specify the domain. The default domain is CONUS.

The variable should follow the ERA5 documentation. Some common ones:
- 2m_temperature
@@ -33,34 +33,37 @@ def main() -> None:

# parse command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--year", type=int)
parser.add_argument("-o", "--outfile", type=str)
parser.add_argument("--variable", type=str)
parser.add_argument("--year", type=int)
parser.add_argument("--resolution", type=float)
parser.add_argument("--lonmin", type=float, default=CONUS[0][0])
parser.add_argument("--lonmax", type=float, default=CONUS[0][1])
parser.add_argument("--latmin", type=float, default=CONUS[1][0])
parser.add_argument("--latmax", type=float, default=CONUS[1][1])
args = parser.parse_args()

if args.year >= 1959:
dataset = "reanalysis-era5-single-levels"
else:
dataset = "reanalysis-era5-single-levels-preliminary-back-extension"
dataset = "reanalysis-era5-single-levels"
product_type = "reanalysis"
months = [f"{month:02d}" for month in range(1, 13)] # 01, 02, ..., 12
days = [f"{day}" for day in np.arange(1, 32)] # 1, 2, ..., 31
hours = [f"{hour:02d}:00" for hour in range(24)] # 00:00, 01:00, ... 23:00
bbox = [args.latmax, args.lonmin, args.latmin, args.lonmax]
grid = [args.resolution, args.resolution]

ecmwf_client = cdsapi.Client()
ecmwf_client.retrieve(
dataset,
{
"product_type": "reanalysis",
"product_type": product_type,
"format": "netcdf",
"variable": args.variable,
"year": [args.year],
"month": [f"{month:02d}" for month in range(1, 13)],
"day": [f"{day}" for day in np.arange(1, 31 + 1)],
"time": [f"{hour:02d}:00" for hour in range(24)], # 00:00, 01:00, ... 23:00
"area": [args.latmax, args.lonmin, args.latmin, args.lonmax],
"format": "netcdf",
"grid": [args.resolution, args.resolution],
"month": months,
"day": days,
"time": hours,
"area": bbox,
"grid": grid,
},
args.outfile,
)
15 changes: 8 additions & 7 deletions era5/era5.smk
@@ -73,18 +73,19 @@ rule era5_single_level:
era5_years = range(config["era5"]["first_year"], config["era5"]["last_year"] + 1)

# specify the files to download
pressure_vars = config["era5"]["vars"]["pressure_level"]
pressure_files = [
os.path.join(ERA5_DATA_DIR, "pressure_level", f"{var}_500_{year}.nc")
for year in era5_years
for var in pressure_vars
] # ugly hack, currently forcing 500hPa
pressure_files = []
for year in era5_years:
for var in config["era5"]["vars"]["pressure_level"]:
varname = var["name"]
levels = var["levels"]
for level in levels:
pressure_files.append(os.path.join(ERA5_DATA_DIR, "pressure_level", f"{varname}_{level}_{year}.nc"))

single_level_vars = config["era5"]["vars"]["single_level"]
single_level_files = [
os.path.join(ERA5_DATA_DIR, "single_level", f"{var}_{year}.nc")
for year in era5_years
for var in pressure_vars
for var in single_level_vars
]


10 changes: 7 additions & 3 deletions era5/era5_config.yml
@@ -5,12 +5,16 @@ era5:
latmin: 25
latmax: 50
resolution: 0.25
first_year: 2015
first_year: 1940
last_year: 2022
vars:
pressure_level:
- u_component_of_wind
- v_component_of_wind
- name: u_component_of_wind
levels:
- 500
- name: v_component_of_wind
levels:
- 500
single_level:
- vertical_integral_of_eastward_water_vapour_flux
- vertical_integral_of_northward_water_vapour_flux
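The config change above turns each pressure-level variable into a dict with a `name` and a list of `levels`, which the new loop in `era5.smk` consumes to build one target file per (variable, level, year). A small sketch with an inline stand-in for the parsed YAML (a real run would load it with `yaml.safe_load`; `ERA5_DATA_DIR` and the two-year range are hypothetical values for illustration):

```python
import os

# Stand-in for the parsed era5_config.yml, matching the new nested structure.
config = {
    "era5": {
        "first_year": 1940,
        "last_year": 1941,  # shortened range for this sketch
        "vars": {
            "pressure_level": [
                {"name": "u_component_of_wind", "levels": [500]},
                {"name": "v_component_of_wind", "levels": [500]},
            ]
        },
    }
}

ERA5_DATA_DIR = "data/era5"  # hypothetical path
era5_years = range(config["era5"]["first_year"], config["era5"]["last_year"] + 1)

# One file per (year, variable, level), as in the era5.smk loop.
pressure_files = [
    os.path.join(ERA5_DATA_DIR, "pressure_level", f"{var['name']}_{level}_{year}.nc")
    for year in era5_years
    for var in config["era5"]["vars"]["pressure_level"]
    for level in var["levels"]
]

print(len(pressure_files))  # 2 years x 2 variables x 1 level = 4
```

Compared with the old flat list of variable names, this shape lets each variable request a different set of pressure levels without the "ugly hack" of hard-coding 500 hPa into the filename template.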