diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ff510b18..01acf9e1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -31,3 +31,4 @@ repos: hooks: - id: isort name: isort (python) + args: ["--profile", "black"] diff --git a/config/catfim_template.env b/config/catfim_template.env new file mode 100644 index 00000000..f6e2cdd0 --- /dev/null +++ b/config/catfim_template.env @@ -0,0 +1,6 @@ +API_BASE_URL= Enter Words API path here +EVALUATED_SITES_CSV=/data/inputs/ahp_sites/evaluated_ahps_sites.csv +WBD_LAYER=/data/inputs/wbd/WBD_National.gpkg +NWM_FLOWS_MS=/data/inputs/nwm_hydrofabric/nwm_flows_ms_wrds.gpkg +USGS_METADATA_URL=https://fimnew.wim.usgs.gov +USGS_DOWNLOAD_URL=https://s3.amazonaws.com/wimcloud.usgs.gov diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 18d83ad4..900119b1 100755 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,6 +1,44 @@ All notable changes to this project will be documented in this file. We follow the [Semantic Versioning 2.0.0](http://semver.org/) format. +## v4.5.8.0 - 2024-09-13 - [PR#1165](https://github.com/NOAA-OWP/inundation-mapping/pull/1165) + +This PR was originally intended to incorporate Alaska HUCs into CatFIM, but a large number of problems left the tool unable to run. We have made some major modifications, and many more will come in the near future. Partial hooks and commented-out code for Alaska integration are included but temporarily disabled; they will be handled by a separate branch / PR. + +One of the biggest improvements was adding a logging system to track what is breaking and where. Previously, errors occurred in a very large number of places but were suppressed and never recorded anywhere. A few were printed to the screen, but this is a very long-running tool, which can take two days, and screen messages are lost. Now all errors and warnings are caught and logged in the master log as well as in a separate "warning" or "error" log to help them stand out. Many of the warnings are legitimate rejections, but at least we now know when and why they occur. When we started working with CatFIM again a few months back, there were show-stopping errors and we did not know where or why; now we can find and diagnose them. + +All three of the core "generate_catfim...py" files include major changes to improve variable and function names, improve flow and readability, relocate functions for a better understanding of the product, and add new inline commenting. There is still a lot to do, but the tool is on a better footing, is fairly stable, and should be easier to continue upgrading in the near future. + +CatFIM is still considered a WIP, but it is fully functional again, and further adjustments should go much more quickly and smoothly. + +Also added a system where a config file can be passed into the CatFIM tools instead of assuming a file named simply ".env" in the tools directory. + +This update also relaxes the coordinate accuracy requirements for stage-based CatFIM, which will result in stage-based CatFIM being generated for more sites. + +#### Informally, this is now known as CatFIM 2.0 + + +### Additions +- `config/catfim_template.env`: Template version of the required CatFIM env file. The template keeps all non-sensitive values and removes the one that is sensitive. The true catfim.env for OWP can be found in our .. data/config/catfim.env. Example pathing here is based on Docker mounts.
+ +- `src/utils/fim_logger.py`: A new multi-proc custom logging system, modeled directly on the proven ras2fim logging system. The reason for this custom system is that native Python logging is not stable in multi-proc environments and tends to lose data. This new logger can relatively easily be bolted into almost any of our py scripts if required (see the usage sketch at the end of this entry). + +### Changes +- `.pre-commit-config.yaml`: A linting config adjustment. +- `pyproject.toml`: Linting config adjustments +- `src/utils/shared_variables.py`: Added a comment +- `tools` + - `generate_categorical_fim.py`: As mentioned above + - `generate_categorical_fim_flows.py`: As mentioned above + - `generate_categorical_fim_mapping.py`: As mentioned above + - `generate_nws_lid.py`: No real changes, but Git thinks something changed. It is the same as in current dev. + - `mosaic_inundation.py`: Added a comment + - `tools_shared_functions.py` + - Added some better error handling in a few places, plus some commenting and cleanup. + - Added a feature to the `aggregate_wbd_hucs` function to optionally accept a list of HUCs for filtering results. + +
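Below is a rough, hedged usage sketch of how `FIM_logger` can be bolted into a script, based only on the API added in `src/utils/fim_logger.py` in this PR. The log folder, worker function, and HUC values are hypothetical placeholders, not part of the diff.

```python
from concurrent.futures import ProcessPoolExecutor

import utils.fim_logger as fl

FLOG = fl.FIM_logger()    # parent (non multi-proc) logger
MP_LOG = fl.FIM_logger()  # logger used inside worker processes


def my_worker(huc, parent_log_output_file, child_log_file_prefix):
    # Hypothetical worker: each process writes to its own temp log file
    # in the same folder as the parent log.
    MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix)
    MP_LOG.lprint(f"processing {huc}")
    MP_LOG.warning(f"{huc}: example warning (also copied to the _warnings log)")


if __name__ == "__main__":
    log_dir = "/data/outputs/my_run/logs"  # hypothetical path
    log_file = FLOG.calc_log_name_and_path(log_dir, "my_tool")
    FLOG.setup(log_file)  # also derives the _warnings and _errors log paths
    FLOG.lprint("starting")

    child_prefix = FLOG.MP_calc_prefix_name(log_file, "MP_my_worker")
    with ProcessPoolExecutor(max_workers=4) as executor:
        for huc in ["12090301", "02020005"]:  # hypothetical HUCs
            executor.submit(my_worker, huc, log_file, child_prefix)

    # Roll the per-process temp logs up into the parent log (and its
    # warning/error companions), then remove the temp files.
    FLOG.merge_log_files(log_file, child_prefix)
    FLOG.lprint("done")
```

This mirrors how `generate_categorical_fim.py` uses `FLOG` in the parent process and `MP_LOG` inside multi-processed functions, merging the per-process temp logs back into the parent log afterwards.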

+ ## v4.5.7.2 - 2024-09-13 - [PR#1149](https://github.com/NOAA-OWP/inundation-mapping/pull/1149) This PR adds scripts that can identify areas within produced inundation rasters where glasswalling of inundation occurs due to catchment boundaries, know as catchment boundary issues. diff --git a/pyproject.toml b/pyproject.toml index ee3862fd..c91682ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,8 +46,8 @@ skip-magic-trailing-comma = true [tool.isort] profile = 'black' -multi_line_output = 3 line_length = 110 +multi_line_output = 3 lines_after_imports = 2 [tool.flake8] @@ -69,12 +69,16 @@ per-file-ignores = """ src/src_roughness_optimization.py: E712, F841 src/agreedem.py: E712 src/build_stream_traversal.py: E722 - tools/vary_mannings_n_composite.py: E712 - tools/tools_shared_functions.py: F821, F841, E711 + + tools/check_deep_flooding.py: E712 + tools/eval_alt_catfim.py: F841 + tools/generate_categorical_fim.py: C901, E712 + tools/generate_categorical_fim_mapping.py: E712 + tools/inundation.py: F821 tools/rating_curve_comparison.py: F821, F841 tools/run_test_case.py: E711 - tools/inundation.py: F821 - tools/eval_alt_catfim.py: F841 - tools/check_deep_flooding.py: E712 + tools/tools_shared_functions.py: F821, F841, E711 + tools/vary_mannings_n_composite.py: E712 + data/usgs/rating_curve_get_usgs_curves.py: F841 """ diff --git a/src/utils/fim_logger.py b/src/utils/fim_logger.py new file mode 100644 index 00000000..1b9a77ec --- /dev/null +++ b/src/utils/fim_logger.py @@ -0,0 +1,414 @@ +#!/usr/bin/env python3 + +import datetime as dt +import os +import random +import traceback +from pathlib import Path + + +# Careful... You might not put shared_functions here as it can create circular references, +# so shared_functions is put inside functions only/as is need + +# Why does this class exist instead of useing standard python logging tools or other python packes? +# Some of those loggers cache the log file output until the end which can be a problem for crashes. +# Also, none of them are very good at handling multi-processing including file collisions and mixed +# multi-proc output data into the log file. + +# Our answer.. A bit weird, but highly effective. Write to the log file as it goes along. +# For multi-processing, create a temp log for each process, then when the total process is done, merge +# those outputs in the master log file. + + +# This is not a perfect system. When rolling multi-proc logs into a rollup log, there can still be +# collisions but it is rare. + + +class FIM_logger: + CUSTOM_LOG_FILES_PATHS = {} + LOG_SYSTEM_IS_SETUP = False + + LOG_DEFAULT_FOLDER = "" + LOG_FILE_PATH = "" # full path and file name + LOG_WARNING_FILE_PATH = "" + LOG_ERROR_FILE_PATH = "" + + LOG_SYS_NOT_SETUP_MSG = "****** Logging to the file system not yet setup.\n" + "****** Sometimes this is not setup until after initial validation.\n" + + """ + Levels available for use + trace - does not show in console but goes to the default log file + lprint - goes to console and default log file + (use use "print" if you want console only) + notice - goes to console and default log file (Adds word "NOTICE" to front of output) + success - goes to console and default log file (Adds word "SUCCESS" to front of output) + warning - goes to console, log file and warning log file (Adds word "WARNING" to front of output) + error - goes to console, log file and error log file. Normally used when the error + does not kill the application. 
(Adds word "ERROR" to front of output) + critical - goes to console, log file and error log file. Normally used when the + program is aborted. (Adds word "CRITICAL" to front of output) + + + NOTE: While unconfirmed in the Linux world, special backslashs + combo's like \r, \t, \n can get create problems. + + """ + + # ------------------------------------------------- + def __get_dt(self): + cur_dt = dt.datetime.now() + ret_dt = f"{cur_dt.strftime('%Y-%m-%d')} {cur_dt.strftime('%H:%M:%S')}" + return ret_dt + + # ------------------------------------------------- + def calc_log_name_and_path(self, output_log_dir, file_prefix): + # setup general logger + os.makedirs(output_log_dir, exist_ok=True) + start_time = dt.datetime.now(dt.timezone.utc) + file_dt_string = start_time.strftime("%Y_%m_%d-%H_%M_%S") + log_file_name = f"{file_prefix}_{file_dt_string}.log" + log_output_file = os.path.join(output_log_dir, log_file_name) + + return log_output_file + + # ------------------------------------------------- + def setup(self, log_file_path: str): + """ + During this process, a second log file will be created as an error file which will + duplicate all log message with the levels of ERROR, and CRITICAL. + + input: + - log_file_path : ie) /data/catfim/test/logs/gen_catfim.log + """ + + # ----------- + # Validation + if log_file_path is None: + raise ValueError("Error: log_file_path not defined") + log_file_path = log_file_path.strip() + if log_file_path == "": + raise ValueError("Error: log_file_path can not be empty") + + folder_path = os.path.dirname(log_file_path) + log_file_name = os.path.basename(log_file_path) + + os.makedirs(folder_path, exist_ok=True) + + self.LOG_DEFAULT_FOLDER = folder_path + + # pull out the file name without extension + file_name_parts = os.path.splitext(log_file_name) + if len(file_name_parts) != 2: + raise ValueError("The submitted log_file_name appears to be an invalid file name") + + self.__calc_warning_error_file_names(log_file_path) + self.LOG_FILE_PATH = log_file_path + + # We need to remove the older ones if they already exist. Why? one attempt of running an script + # might trigger and log file and an error file. So, it is correct, run again and now we have an + # old invalid error file + # if os.path.isfile(log_file_path): + # os.remove(log_file_path) + + # if os.path.isfile(self.LOG_ERROR_FILE_PATH): + # os.remove(self.LOG_ERROR_FILE_PATH) + + # if os.path.isfile(self.LOG_WARNING_FILE_PATH): + # os.remove(self.LOG_WARNING_FILE_PATH) + + self.LOG_SYSTEM_IS_SETUP = True + return + + # ------------------------------------------------- + def MP_calc_prefix_name(self, parent_log_output_file, file_prefix, huc: str = ""): + """_summary_ + Uses the file name of the parent log to create the new prefix name (without MP date) + + You don't need to use this method if you don't want to prepend the parent file name prefix + """ + + parent_log_file_name = os.path.basename(parent_log_output_file).replace(".log", "") + + if huc != "": + prefix = f"{parent_log_file_name}--{huc}--{file_prefix}" + else: + prefix = f"{parent_log_file_name}--{file_prefix}" + return prefix + + # ------------------------------------------------- + def MP_Log_setup(self, parent_log_output_file, file_prefix): + """ + Overview: + This is for logs used inside code that is multi-processing, aka. inside the actual functions + of the call from Pool + + This method is sort of a wrapper in that it just manually creates a file name + using a defined file path. 
+ + As this is an MP file, the parent_log_output_file may have a date in it + The file name is calculated as such {file_prefix}-{random 12 digit key}.log() + ie) catfim_2024_07_09-16_30_02__012345678901.log + + The extra file portion is added as in MultiProc, you can have dozens of processes + and each are loggign to their own file. At then end of an MP, you call a function called merge_log_files + which will merge them into a parent log file if requested. + + Inputs: + file_prefix (str): a value to prepend to the file names. Often is the name of the function + that called this method. Note: Later, when a person has these MP_Logs cleaned up + they will use this file_prefix again to search and remove the temp MP_log files as they + get rolled back up to the master log file. + parent_log_output_file (str): folder location for the files to be created. Note: it has to be in + the same folder as the master log file. + """ + # ----------------- + log_folder = os.path.dirname(parent_log_output_file) + + random_id = random.randrange(1000000000, 99999999999) + log_file_name = f"{file_prefix}___{random_id}.log" + log_file_path = os.path.join(log_folder, log_file_name) + + self.setup(log_file_path) + return + + # ------------------------------------------------- + def __calc_warning_error_file_names(self, log_file_and_path): + """ + Process: + Parses the log_file_and_path to add either the name of _warnings or _errors + into the file name. + Why not update LOG_WARNING_FILE_PATH and LOG_ERROR_FILE_PATH + Input: + log_file_and_path: ie) /data/outputs/rob_test/logs/catfim.log + Output: + Updates LOG_WARNING_FILE_PATH and LOG_ERROR_FILE_PATH variables + """ + + folder_path = os.path.dirname(log_file_and_path) + log_file_name = os.path.basename(log_file_and_path) + + # pull out the file name without extension + file_name_parts = os.path.splitext(log_file_name) + if len(file_name_parts) != 2: + raise ValueError("The submitted log_file_name appears to be an invalid file name") + + # now calc the warning log file + self.LOG_WARNING_FILE_PATH = os.path.join( + folder_path, file_name_parts[0] + "_warnings" + file_name_parts[1] + ) + + # now calc the error log file + self.LOG_ERROR_FILE_PATH = os.path.join( + folder_path, file_name_parts[0] + "_errors" + file_name_parts[1] + ) + return + + # ------------------------------------------------- + def merge_log_files(self, parent_log_output_file, file_prefix, remove_old_files=True): + """ + Overview: + This tool is mostly for merging log files during multi processing which each had their own file. + + This will search all of the files in directory in the same folder as the + incoming log_file_and_path. It then looks for all files starting with the + file_prefix and adds them to the log file (via log_file_and_path) + Inputs: + - log_file_and_path: ie) /data/outputs/rob_test/logs/catfim.log + - file_prefix: This value must be the start of file names. 
ie) mp_create_gdf_of_points + as in /data/outputs/rob_test/logs/mp_generate_categorical_fim(_231122_1407444333_12345).log + """ + + # ----------- + # Validation + if parent_log_output_file is None: + raise ValueError("Error: parent_log_file_and_path not defined") + + parent_log_output_file = parent_log_output_file.strip() + + if parent_log_output_file == "": + raise ValueError("Error: parent log_file_and_path can not be empty") + + folder_path = os.path.dirname(parent_log_output_file) + os.makedirs(folder_path, exist_ok=True) + + log_file_list_paths = list(Path(folder_path).rglob(f"{file_prefix}*")) + log_file_list = [str(x) for x in log_file_list_paths] + + if len(log_file_list) > 0: + log_file_list.sort() + + # we are merging them in order (reg files, then warnings, then errors) + + # open and write to the parent log + # This will write all logs including errors and warning + with open(parent_log_output_file, 'a') as main_log: + # Iterate through list + for temp_log_file in log_file_list: + # Open each file in read mode + with open(temp_log_file) as infile: + main_log.write(infile.read()) + if "warning" not in temp_log_file and "error" not in temp_log_file: + if remove_old_files: + os.remove(temp_log_file) + + # now the warning files if there are any + log_warning_file_list = list(Path(folder_path).rglob(f"{file_prefix}*_warnings*")) + if len(log_warning_file_list) > 0: + log_warning_file_list.sort() + parent_warning_file = parent_log_output_file.replace(".log", "_warnings.log") + with open(parent_warning_file, 'a') as warning_log: + # Iterate through list + for temp_log_file in log_warning_file_list: + # Open each file in read mode + with open(temp_log_file) as infile: + warning_log.write(infile.read()) + + if remove_old_files: + os.remove(temp_log_file) + + # now the warning files if there are any + log_error_file_list = list(Path(folder_path).rglob(f"{file_prefix}*_errors*")) + if len(log_error_file_list) > 0: + log_error_file_list.sort() + parent_error_file = parent_log_output_file.replace(".log", "_errors.log") + # doesn't yet exist, then create a blank one + with open(parent_error_file, 'a') as error_log: + # Iterate through list + for temp_log_file in log_error_file_list: + # Open each file in read mode + with open(temp_log_file) as infile: + error_log.write(infile.read()) + + if remove_old_files: + os.remove(temp_log_file) + + return + + # ------------------------------------------------- + def trace(self, msg): + # goes to file only, not console + level = "TRACE " # keeps spacing the same + if self.LOG_FILE_PATH == "": + print(self.LOG_SYS_NOT_SETUP_MSG) + return + + with open(self.LOG_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + return + + # ------------------------------------------------- + def lprint(self, msg): + # goes to console and log file + level = "LPRINT " # keeps spacing the same + print(f"{msg} ") + + if self.LOG_FILE_PATH == "": + print(self.LOG_SYS_NOT_SETUP_MSG) + return + + with open(self.LOG_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + return + + # ------------------------------------------------- + def notice(self, msg): + # goes to console and log file + level = "NOTICE " # keeps spacing the same + # print(f"{cl.fore.TURQUOISE_2}{msg}{cl.style.RESET}") + print(f"{level}: {msg}") + + if self.LOG_FILE_PATH == "": + print(self.LOG_SYS_NOT_SETUP_MSG) + return + + with open(self.LOG_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + return + + # 
------------------------------------------------- + def success(self, msg): + # goes to console and log file + level = "SUCCESS " # keeps spacing the same + + # c_msg_type = f"{cl.fore.SPRING_GREEN_2B}<{level}>{cl.style.RESET}" + # print(f"{self.__get_clog_dt()} {c_msg_type} : {msg}") + print(f"{level}: {msg}") + + if self.LOG_FILE_PATH == "": + print(self.LOG_SYS_NOT_SETUP_MSG) + return + + with open(self.LOG_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + return + + # ------------------------------------------------- + def warning(self, msg): + # goes to console and log file and warning log file + level = "WARNING " # keeps spacing the same + + # c_msg_type = f"{cl.fore.LIGHT_YELLOW}<{level}>{cl.style.RESET}" + # print(f"{self.__get_clog_dt()} {c_msg_type} : {msg}") + print(f"{level}: {msg}") + + if self.LOG_FILE_PATH == "": + print(self.LOG_SYS_NOT_SETUP_MSG) + return + + with open(self.LOG_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + # and also write to warning logs + with open(self.LOG_WARNING_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + return + + # ------------------------------------------------- + def error(self, msg): + # goes to console and log file and error log file + level = "ERROR " # keeps spacing the same + + # c_msg_type = f"{cl.fore.RED_1}<{level}>{cl.style.RESET}" + # print(f"{self.__get_clog_dt()} {c_msg_type} : {msg}") + print(f"{level}{msg}") + + if self.LOG_FILE_PATH == "": + print(self.LOG_SYS_NOT_SETUP_MSG) + return + + with open(self.LOG_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + # and also write to error logs + with open(self.LOG_ERROR_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + return + + # ------------------------------------------------- + def critical(self, msg): + level = "CRITICAL" # keeps spacing the same + + # c_msg_type = f"{cl.style.BOLD}{cl.fore.RED_3A}{cl.back.WHITE}{self.__get_dt()}" + # c_msg_type += f" <{level}>" + # print(f" {c_msg_type} : {msg} {cl.style.RESET}") + print(f"{level}{msg}") + + if self.LOG_FILE_PATH == "": + print(self.LOG_SYS_NOT_SETUP_MSG) + return + + with open(self.LOG_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + # and also write to error logs + with open(self.LOG_ERROR_FILE_PATH, "a") as f_log: + f_log.write(f"{self.__get_dt()} | {level} || {msg}\n") + + return diff --git a/src/utils/shared_variables.py b/src/utils/shared_variables.py index 72a28a66..8ea520e7 100644 --- a/src/utils/shared_variables.py +++ b/src/utils/shared_variables.py @@ -38,7 +38,7 @@ '0.0174532925199433]],PROJECTION["Mercator_Auxiliary_Sphere"],PARAMETER["False_Easting",0.0],' 'PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",0.0],PARAMETER["Standard_Parallel_1",0.0],' 'PARAMETER["Auxiliary_Sphere_Type",0.0],UNIT["Meter",1.0]]' -) +) # 3857 # -- Data URLs-- # NHD_URL_PARENT = r'https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/NHDPlusHR/Beta/GDB/' diff --git a/tools/generate_categorical_fim.py b/tools/generate_categorical_fim.py index e3d248be..5d2a3bcd 100755 --- a/tools/generate_categorical_fim.py +++ b/tools/generate_categorical_fim.py @@ -4,20 +4,28 @@ import csv import glob import os +import shutil import sys import time import traceback from concurrent.futures import ProcessPoolExecutor, as_completed, wait -from datetime import datetime + +# from logging.handlers import 
QueueHandler, QueueListener +from datetime import datetime, timezone +from itertools import repeat from pathlib import Path import geopandas as gpd import numpy as np import pandas as pd import rasterio -from generate_categorical_fim_flows import generate_catfim_flows -from generate_categorical_fim_mapping import manage_catfim_mapping, post_process_cat_fim_for_viz -from rasterio.warp import Resampling, calculate_default_transform, reproject +from dotenv import load_dotenv +from generate_categorical_fim_flows import generate_flows +from generate_categorical_fim_mapping import ( + manage_catfim_mapping, + post_process_cat_fim_for_viz, + produce_stage_based_catfim_tifs, +) from tools_shared_functions import ( filter_nwm_segments_by_stream_order, get_datum, @@ -33,198 +41,293 @@ acceptable_site_type_list, ) +import utils.fim_logger as fl from utils.shared_variables import VIZ_PROJECTION +# global RLOG +FLOG = fl.FIM_logger() # the non mp version +MP_LOG = fl.FIM_logger() # the Multi Proc version + gpd.options.io_engine = "pyogrio" +# TODO: Aug 2024: This script was upgraded significantly with lots of misc TODO's embedded. +# Lots of inline documenation needs updating as well + + +""" +Jun 17, 2024 +This system is continuing to mature over time. It has a number of optimizations that can still +be applied in areas such as logic, performance and error handling. + +In the interium there is still a consider amount of debug lines and tools embedded in that can +be commented on/off as required. + + +NOTE: For now.. all logs roll up to the parent log file. ie) catfim_2024_07_09-22-20-12.log +This creates a VERY large final log file, but the warnings and errors file should be manageable. +Later: Let's split this to seperate log files per huc. Easy to do that for Stage Based it has +"iterate_through_stage_based" function. Flow based? we have to think that one out a bit + +""" + + def process_generate_categorical_fim( fim_run_dir, + env_file, job_number_huc, job_number_inundate, - stage_based, + is_stage_based, output_folder, overwrite, search, lid_to_run, job_number_intervals, past_major_interval_cap, + nwm_metafile, ): - print("================================") - print("Start generate categorical fim") - overall_start_time = datetime.now() - dt_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S") - print(f"started: {dt_string}") - print() + # ================================ + # Validation and setup + + # Append option configuration (flow_based or stage_based) to output folder name. + if is_stage_based: + catfim_method = "stage_based" + else: + catfim_method = "flow_based" + + # Define output directories + if output_folder.endswith("/"): + output_folder = output_folder[:-1] + output_catfim_dir = output_folder + "_" + catfim_method + + output_flows_dir = os.path.join(output_catfim_dir, 'flows') + output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') + attributes_dir = os.path.join(output_catfim_dir, 'attributes') + + # The override is not for the parent folder as we want to keep logs around with or without override + if os.path.exists(output_catfim_dir) is False: + os.mkdir(output_catfim_dir) + + # Create output directories (check against maping only as a proxy for all three) + if os.path.exists(output_mapping_dir) is True: + if overwrite is False: + raise Exception( + f"The output mapping folder of {output_catfim_dir} already exists." + " If you want to overwrite it, please add the -o flag. 
Note: When overwritten, " + " the three folders of mapping, flows and attributes wil be deleted and rebuilt" + ) + + gpkg_dir = os.path.join(output_mapping_dir, 'gpkg') + shutil.rmtree(gpkg_dir, ignore_errors=True) + + shutil.rmtree(output_mapping_dir, ignore_errors=True) + shutil.rmtree(output_flows_dir, ignore_errors=True) + shutil.rmtree(attributes_dir, ignore_errors=True) + + # Keeps the logs folder + + if nwm_metafile != "": + if os.path.exists(nwm_metafile) is False: + raise Exception("The nwm_metadata (-me) file can not be found. Please remove or fix pathing.") + file_ext = os.path.splitext(nwm_metafile) + if file_ext.count == 0: + raise Exception("The nwm_metadata (-me) file appears to be invalid. It is missing an extension.") + if file_ext[1].lower() != ".pkl": + raise Exception("The nwm_metadata (-me) file appears to be invalid. The extention is not pkl.") + + # Define default arguments. Modify these if necessary + fim_version = os.path.split(fim_run_dir)[1] + + # TODO: Aug 2024: Job values are not well used. There are some times where not + # all three job values are not being used. This needs to be cleaned up. # Check job numbers and raise error if necessary - total_cpus_requested = job_number_huc * job_number_inundate * job_number_intervals - total_cpus_available = os.cpu_count() - 1 - if total_cpus_requested > total_cpus_available: + # Considering how we are using each CPU very well at all, we could experiment + # with either overclocking or chagnign to threading. Of course, if we change + # to threading we ahve to be super careful about file and thread collisions (locking) + + # commented out for now for some small overclocking tests (carefully of course) + # total_cpus_requested = job_number_huc * job_number_inundate * job_number_intervals + # total_cpus_available = os.cpu_count() - 2 + # if total_cpus_requested > total_cpus_available: + # raise ValueError( + # f"The HUC job number (jh) [{job_number_huc}]" + # f" multiplied by the inundate job number (jn) [{job_number_inundate}]" + # f" multiplied by the job number intervals (ji) [{job_number_intervals}]" + # " exceeds your machine\'s available CPU count minus one." + # " Please lower one or more of those values accordingly." + # ) + + # we are getting too many folders and files. We want just huc folders. + # output_flow_dir_list = os.listdir(fim_run_dir) + # looking for folders only starting with 0, 1, or 2 + # Code variation for dropping all Alaska HUCS: + valid_ahps_hucs = [ + x + for x in os.listdir(fim_run_dir) + if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ['0', '1', '2'] and x[:2] != "19" + ] + + # # Code variation for KEEPING Alaska HUCS: + # valid_ahps_hucs = [ + # x + # for x in os.listdir(fim_run_dir) + # if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ['0', '1', '2'] + # ] + + valid_ahps_hucs.sort() + + num_hucs = len(valid_ahps_hucs) + if num_hucs == 0: raise ValueError( - 'The HUC job number, {}, multiplied by the inundate job number, {}, ' - 'exceeds your machine\'s available CPU count minus one. ' - 'Please lower the job_number_huc or job_number_inundate ' - 'values accordingly.'.format(job_number_huc, job_number_inundate) + f'Output directory {fim_run_dir} is empty. Verify that you have the correct input folder.' ) + # End of Validation and setup + # ================================ - # Define default arguments. 
Modify these if necessary - fim_version = os.path.split(fim_run_dir)[1] + log_dir = os.path.join(output_catfim_dir, "logs") + log_output_file = FLOG.calc_log_name_and_path(log_dir, "catfim") + FLOG.setup(log_output_file) - # Append option configuration (flow_based or stage_based) to output folder name. - if stage_based: - file_handle_appendage, catfim_method = "_stage_based", "STAGE-BASED" - else: - file_handle_appendage, catfim_method = "_flow_based", "FLOW-BASED" + overall_start_time = datetime.now(timezone.utc) + dt_string = overall_start_time.strftime("%m/%d/%Y %H:%M:%S") - # Define output directories - output_catfim_dir_parent = output_folder + file_handle_appendage - output_flows_dir = os.path.join(output_catfim_dir_parent, 'flows') - output_mapping_dir = os.path.join(output_catfim_dir_parent, 'mapping') - attributes_dir = os.path.join(output_catfim_dir_parent, 'attributes') - - # Create output directories - if not os.path.exists(output_catfim_dir_parent): - os.mkdir(output_catfim_dir_parent) - if not os.path.exists(output_flows_dir): - os.mkdir(output_flows_dir) - if not os.path.exists(output_mapping_dir): - os.mkdir(output_mapping_dir) - if not os.path.exists(attributes_dir): - os.mkdir(attributes_dir) + # os.makedirs(output_flows_dir, exist_ok=True) # Stage doesn't use it + os.makedirs(output_mapping_dir, exist_ok=True) + os.makedirs(attributes_dir, exist_ok=True) + + FLOG.lprint("================================") + FLOG.lprint(f"Start generate categorical fim for {catfim_method} - (UTC): {dt_string}") + FLOG.lprint("") + + FLOG.lprint( + f"Processing {num_hucs} huc(s) with Alaska temporarily removed" + ) # Code variation for DROPPING Alaska HUCs + # FLOG.lprint(f"Processing {num_hucs} huc(s)") # Code variation for KEEPING Alaska HUCs + + load_dotenv(env_file) + API_BASE_URL = os.getenv('API_BASE_URL') + if API_BASE_URL is None: + raise ValueError( + 'API base url not found. ' + 'Ensure inundation_mapping/tools/ has an .env file with the following info: ' + 'API_BASE_URL, EVALUATED_SITES_CSV, WBD_LAYER, NWM_FLOWS_MS, ' + 'USGS_METADATA_URL, USGS_DOWNLOAD_URL' + ) + + # TODO: Add check for if lid_to_run and lst_hucs parameters conflict + + # Check that fim_inputs.csv exists and raise error if necessary + fim_inputs_csv_path = os.path.join(fim_run_dir, 'fim_inputs.csv') + if not os.path.exists(fim_inputs_csv_path): + raise ValueError(f'{fim_inputs_csv_path} not found. Verify that you have the correct input files.') + + # print() + + # FLOG.lprint("Filtering out HUCs that do not have related ahps site in them.") + # valid_ahps_hucs = __filter_hucs_to_ahps(lst_hucs) + + # num_valid_hucs = len(valid_ahps_hucs) + # if num_valid_hucs == 0: + # raise Exception("None of the HUCs supplied have ahps sites in them. 
Check your fim output folder") + # else: + # FLOG.lprint(f"Processing {num_valid_hucs} huc(s) with AHPS sites") # Define upstream and downstream search in miles nwm_us_search, nwm_ds_search = search, search - fim_dir = fim_version - - # Set up logging - log_dir = os.path.join(output_catfim_dir_parent, 'logs') - log_file = os.path.join(log_dir, 'errors.log') + nws_lid_gpkg_file_path = "" # STAGE-BASED - if stage_based: + if is_stage_based: # Generate Stage-Based CatFIM mapping - nws_sites_layer = generate_stage_based_categorical_fim( - output_mapping_dir, - fim_version, + # does flows and inundation (mapping) + nws_lid_gpkg_file_path = generate_stage_based_categorical_fim( + output_catfim_dir, fim_run_dir, nwm_us_search, nwm_ds_search, - job_number_inundate, lid_to_run, - attributes_dir, + env_file, + job_number_inundate, + job_number_huc, + valid_ahps_hucs, job_number_intervals, past_major_interval_cap, - job_number_huc, + nwm_metafile, ) - job_number_tif = job_number_inundate * job_number_intervals - print("Post-processing TIFs...") + # creates the gkpgs (tif's created above) + # TODO: Aug 2024, so we need to clean it up + # This step does not need a job_number_inundate as it can't really use use it. + # It processes primarily hucs and ahps in multiproc + # for now, we will manuall multiple the huc * 5 (max number of ahps types) + ahps_jobs = job_number_huc * 5 post_process_cat_fim_for_viz( - job_number_huc, - job_number_tif, - output_mapping_dir, - attributes_dir, - log_file=log_file, - fim_version=fim_version, + catfim_method, output_catfim_dir, ahps_jobs, fim_version, log_output_file ) - # Updating mapping status - print('Updating mapping status...') - update_mapping_status(str(output_mapping_dir), str(output_flows_dir), nws_sites_layer, stage_based) - # FLOW-BASED else: - fim_dir = "" - print('Creating flow files using the ' + catfim_method + ' technique...') + FLOG.lprint('Creating flow files using the ' + catfim_method + ' technique...') start = time.time() - nws_sites_layer = generate_catfim_flows( - output_flows_dir, + + # generate flows is only using one of the incoming job number params + # so let's multiply -jh (huc) and -jn (inundate) + job_flows = job_number_huc * job_number_inundate + nws_lid_gpkg_file_path = generate_flows( + output_catfim_dir, nwm_us_search, nwm_ds_search, - stage_based, - fim_dir, lid_to_run, - attributes_dir, - job_number_huc, + env_file, + job_flows, + is_stage_based, + valid_ahps_hucs, + nwm_metafile, + log_output_file, ) end = time.time() elapsed_time = (end - start) / 60 + FLOG.lprint(f"Finished creating flow files in {str(elapsed_time).split('.')[0]} minutes") - print(f'Finished creating flow files in {elapsed_time} minutes') - # Generate CatFIM mapping - print('Begin mapping') - start = time.time() + # Generate CatFIM mapping (both flow and stage based need it, but if different orders manage_catfim_mapping( fim_run_dir, output_flows_dir, - output_mapping_dir, - attributes_dir, + output_catfim_dir, + catfim_method, job_number_huc, job_number_inundate, - overwrite, - depthtif=False, + False, + log_output_file, ) - end = time.time() - elapsed_time = (end - start) / 60 - print(f'Finished mapping in {elapsed_time} minutes') - # Updating mapping status - print('Updating mapping status') - update_mapping_status(str(output_mapping_dir), str(output_flows_dir), nws_sites_layer, stage_based) + # end if else + + # Updating mapping status + FLOG.lprint('Updating mapping status...') + update_flow_mapping_status(output_mapping_dir, nws_lid_gpkg_file_path) # 
Create CSV versions of the final geopackages. - print('Creating CSVs. This may take several minutes.') - reformatted_catfim_method = catfim_method.lower().replace('-', '_') - create_csvs(output_mapping_dir, reformatted_catfim_method) + # FLOG.lprint('Creating CSVs. This may take several minutes.') + # create_csvs(output_mapping_dir, is_stage_based) - print("================================") - print("End generate categorical fim") + FLOG.lprint("================================") + FLOG.lprint("End generate categorical fim") - end_time = datetime.now() - dt_string = datetime.now().strftime("%m/%d/%Y %H:%M:%S") - print(f"ended: {dt_string}") + overall_end_time = datetime.now(timezone.utc) + dt_string = overall_end_time.strftime("%m/%d/%Y %H:%M:%S") + FLOG.lprint(f"Ended (UTC): {dt_string}") # calculate duration - time_duration = end_time - overall_start_time - print(f"Duration: {str(time_duration).split('.')[0]}") - print() - - -def create_csvs(output_mapping_dir, reformatted_catfim_method): - ''' - Produces CSV versions of desired geopackage in the output_mapping_dir. - - Parameters - ---------- - output_mapping_dir : STR - Path to the output directory of all inundation maps. - reformatted_catfim_method : STR - Text to append to CSV to communicate the type of CatFIM. - - Returns - ------- - None. - - ''' + time_duration = overall_end_time - overall_start_time + FLOG.lprint(f"Duration: {str(time_duration).split('.')[0]}") - # Convert any geopackage in the root level of output_mapping_dir to CSV and rename. - gpkg_list = glob.glob(os.path.join(output_mapping_dir, '*.gpkg')) - for gpkg in gpkg_list: - print(f"Creating CSV for {gpkg}") - gdf = gpd.read_file(gpkg) - parent_directory = os.path.split(gpkg)[0] - if 'catfim_library' in gpkg: - file_name = reformatted_catfim_method + '_catfim.csv' - if 'nws_lid_sites' in gpkg: - file_name = reformatted_catfim_method + '_catfim_sites.csv' - - csv_output_path = os.path.join(parent_directory, file_name) - gdf.to_csv(csv_output_path) + return -def update_mapping_status(output_mapping_dir, output_flows_dir, nws_sites_layer, stage_based): +def update_flow_mapping_status(output_mapping_dir, nws_lid_gpkg_file_path): ''' Updates the status for nws_lids from the flows subdirectory. Status is updated for sites where the inundation.py routine was not able to @@ -232,12 +335,14 @@ def update_mapping_status(output_mapping_dir, output_flows_dir, nws_sites_layer, an error occured in inundation.py that all flow files for a given site experienced the error as they all would have the same nwm segments. + If there is no valid mapping files, update the nws_lids record + Parameters ---------- output_mapping_dir : STR Path to the output directory of all inundation maps. - output_flows_dir : STR - Path to the directory containing all flows. + nws_lid_gpkg_file_path : STR + Returns ------- @@ -246,422 +351,291 @@ def update_mapping_status(output_mapping_dir, output_flows_dir, nws_sites_layer, ''' # Find all LIDs with empty mapping output folders subdirs = [str(i) for i in Path(output_mapping_dir).rglob('**/*') if i.is_dir()] + + print("") + empty_nws_lids = [Path(directory).name for directory in subdirs if not list(Path(directory).iterdir())] + if len(empty_nws_lids) > 0: + FLOG.warning(f"Empty_nws_lids are.. 
{empty_nws_lids}") # Write list of empty nws_lids to DataFrame, these are sites that failed in inundation.py mapping_df = pd.DataFrame({'nws_lid': empty_nws_lids}) + mapping_df['did_it_map'] = 'no' mapping_df['map_status'] = ' and all categories failed to map' # Import geopackage output from flows creation - flows_df = gpd.read_file(nws_sites_layer) + flows_gdf = gpd.read_file(nws_lid_gpkg_file_path, engine='fiona') + + if len(flows_gdf) == 0: + FLOG.critical(f"flows_gdf is empty. Path is {nws_lid_gpkg_file_path}. Program aborted.") + sys.exit(1) try: # Join failed sites to flows df - flows_df = flows_df.merge(mapping_df, how='left', on='nws_lid') + flows_gdf = flows_gdf.merge(mapping_df, how='left', on='nws_lid') # Switch mapped column to no for failed sites and update status - flows_df.loc[flows_df['did_it_map'] == 'no', 'mapped'] = 'no' - flows_df.loc[flows_df['did_it_map'] == 'no', 'status'] = flows_df['status'] + flows_df['map_status'] - - # # Perform pass for HUCs where mapping was skipped due to missing data #TODO check with Brian - # if stage_based: - # missing_mapping_hucs = - # else: - # flows_hucs = [i.stem for i in Path(output_flows_dir).iterdir() if i.is_dir()] - # mapping_hucs = [i.stem for i in Path(output_mapping_dir).iterdir() if i.is_dir()] - # missing_mapping_hucs = list(set(flows_hucs) - set(mapping_hucs)) - # - # # Update status for nws_lid in missing hucs and change mapped attribute to 'no' - # flows_df.loc[flows_df.eval('HUC8 in @missing_mapping_hucs & mapped == "yes"'), 'status'] = - # flows_df['status'] + ' and all categories failed to map because missing HUC information' - # flows_df.loc[flows_df.eval('HUC8 in @missing_mapping_hucs & mapped == "yes"'), 'mapped'] = 'no' + flows_gdf.loc[flows_gdf['did_it_map'] == 'no', 'mapped'] = 'no' + flows_gdf.loc[flows_gdf['did_it_map'] == 'no', 'status'] = ( + flows_gdf['status'] + flows_gdf['map_status'] + ) # Clean up GeoDataFrame and rename columns for consistency - flows_df = flows_df.drop(columns=['did_it_map', 'map_status']) - flows_df = flows_df.rename(columns={'nws_lid': 'ahps_lid'}) + flows_gdf = flows_gdf.drop(columns=['did_it_map', 'map_status']) + flows_gdf = flows_gdf.rename(columns={'nws_lid': 'ahps_lid'}) # Write out to file - flows_df.to_file(nws_sites_layer) - except Exception as e: - print(f"No LIDs, \n Exception: \n {repr(e)} \n") - - -def produce_inundation_map_with_stage_and_feature_ids( - rem_path, catchments_path, hydroid_list, hand_stage, lid_directory, category, huc, lid, branch -): - # Open rem_path and catchment_path using rasterio. - rem_src = rasterio.open(rem_path) - catchments_src = rasterio.open(catchments_path) - rem_array = rem_src.read(1) - catchments_array = catchments_src.read(1) - - # Use numpy.where operation to reclassify rem_path on the condition that the pixel values - # are <= to hand_stage and the catchments value is in the hydroid_list. - reclass_rem_array = np.where((rem_array <= hand_stage) & (rem_array != rem_src.nodata), 1, 0).astype( - 'uint8' - ) - hydroid_mask = np.isin(catchments_array, hydroid_list) - target_catchments_array = np.where( - (hydroid_mask is True) & (catchments_array != catchments_src.nodata), 1, 0 - ).astype('uint8') - masked_reclass_rem_array = np.where( - (reclass_rem_array == 1) & (target_catchments_array == 1), 1, 0 - ).astype('uint8') - - # Save resulting array to new tif with appropriate name. 
brdc1_record_extent_18060005.tif - is_all_zero = np.all((masked_reclass_rem_array == 0)) - - if not is_all_zero: - output_tif = os.path.join( - lid_directory, lid + '_' + category + '_extent_' + huc + '_' + branch + '.tif' - ) - with rasterio.Env(): - profile = rem_src.profile - profile.update(dtype=rasterio.uint8) - profile.update(nodata=10) + # TODO: Aug 29, 204: Not 100% sure why, but the gpkg errors out... likely missing a projection + flows_gdf.to_file(nws_lid_gpkg_file_path, index=False, driver='GPKG', engine="fiona") - with rasterio.open(output_tif, 'w', **profile) as dst: - dst.write(masked_reclass_rem_array, 1) + # csv flow file name + nws_lid_csv_file_path = nws_lid_gpkg_file_path.replace(".gkpg", ".csv") + # and we write a csv version at this time as well. + # and this csv is good + flows_gdf.to_csv(nws_lid_csv_file_path) -def mark_complete(site_directory): - marker_file = Path(site_directory) / 'complete.txt' - marker_file.touch() + except Exception as e: + FLOG.critical(f"{output_mapping_dir} : No LIDs, \n Exception: \n {repr(e)} \n") + FLOG.critical(traceback.format_exc()) return +# This is always called as part of Multi-processing so uses MP_LOG variable and +# creates it's own logging object. +# This does flow files and mapping in the same function by HUC def iterate_through_huc_stage_based( - workspace, + output_catfim_dir, huc, fim_dir, huc_dictionary, threshold_url, - flood_categories, + magnitudes, all_lists, past_major_interval_cap, - number_of_jobs, - number_of_interval_jobs, - attributes_dir, - nwm_flows_df, + job_number_inundate, + job_number_intervals, + nwm_flows_region_df, + parent_log_output_file, + child_log_file_prefix, + progress_stmt, ): - missing_huc_files = [] - all_messages = [] - stage_based_att_dict = {} - - print(f'Iterating through {huc}...') - # Make output directory for huc. - huc_directory = os.path.join(workspace, huc) - if not os.path.exists(huc_directory): - os.mkdir(huc_directory) - - # Define paths to necessary HAND and HAND-related files. - usgs_elev_table = os.path.join(fim_dir, huc, 'usgs_elev_table.csv') - branch_dir = os.path.join(fim_dir, huc, 'branches') - - # Loop through each lid in nws_lids list - nws_lids = huc_dictionary[huc] - for lid in nws_lids: - lid = lid.lower() # Convert lid to lower case - # -- If necessary files exist, continue -- # - if not os.path.exists(usgs_elev_table): - all_messages.append( - [ - f'{lid}:usgs_elev_table missing, likely unacceptable gage datum error--' - 'more details to come in future release' - ] + """_summary_ + This and its children will create stage based tifs and catfim data based on a huc + """ + + try: + # This is setting up logging for this function to go up to the parent + # child_log_file_prefix is likely MP_iter_hucs + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + MP_LOG.lprint("\n**********************") + MP_LOG.lprint(f'Processing {huc} ...') + MP_LOG.lprint(f'... 
{progress_stmt} ...') + MP_LOG.lprint("") + + missing_huc_files = [] + all_messages = [] + stage_based_att_dict = {} + + mapping_dir = os.path.join(output_catfim_dir, "mapping") + attributes_dir = os.path.join(output_catfim_dir, 'attributes') + # output_flows_dir = os.path.join(output_catfim_dir, "flows") + huc_messages_dir = os.path.join(mapping_dir, 'huc_messages') + + # Make output directory for the particular huc in the mapping folder + mapping_huc_directory = os.path.join(mapping_dir, huc) + if not os.path.exists(mapping_huc_directory): + os.mkdir(mapping_huc_directory) + + # Define paths to necessary HAND and HAND-related files. + usgs_elev_table = os.path.join(fim_dir, huc, 'usgs_elev_table.csv') + branch_dir = os.path.join(fim_dir, huc, 'branches') + + # Loop through each lid in nws_lids list + nws_lids = huc_dictionary[huc] + + MP_LOG.lprint(f"Lids to process for {huc} are {nws_lids}") + + for lid in nws_lids: + MP_LOG.lprint("-----------------------------------") + huc_lid_id = f"{huc} : {lid}" + MP_LOG.lprint(huc_lid_id) + + lid = lid.lower() # Convert lid to lower case + # -- If necessary files exist, continue -- # + # Yes, each lid gets a record no matter what, so we need some of these messages duplicated + # per lid record + if not os.path.exists(usgs_elev_table): + msg = ":usgs_elev_table missing, likely unacceptable gage datum error -- more details to come in future release" + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + if not os.path.exists(branch_dir): + msg = ":branch directory missing" + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue + usgs_elev_df = pd.read_csv(usgs_elev_table) + + # Make mapping lid_directory. + mapping_lid_directory = os.path.join(mapping_huc_directory, lid) + if not os.path.exists(mapping_lid_directory): + os.mkdir(mapping_lid_directory) + + # Get stages and flows for each threshold from the WRDS API. Priority given to USGS calculated flows. + stages, flows = get_thresholds( + threshold_url=threshold_url, select_by='nws_lid', selector=lid, threshold='all' ) - continue - if not os.path.exists(branch_dir): - all_messages.append([f'{lid}:branch directory missing']) - continue - usgs_elev_df = pd.read_csv(usgs_elev_table) - # Make lid_directory. - - lid_directory = os.path.join(huc_directory, lid) - if not os.path.exists(lid_directory): - os.mkdir(lid_directory) - else: - complete_marker = os.path.join(lid_directory, 'complete.txt') - if os.path.exists(complete_marker): - all_messages.append([f"{lid}: already completed in previous run."]) + + if stages is None: + msg = ':error getting thresholds from WRDS API' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) continue - # Get stages and flows for each threshold from the WRDS API. Priority given to USGS calculated flows. - stages, flows = get_thresholds( - threshold_url=threshold_url, select_by='nws_lid', selector=lid, threshold='all' - ) - if stages is None: - all_messages.append([f'{lid}:error getting thresholds from WRDS API']) - continue - # Check if stages are supplied, if not write message and exit. - if all(stages.get(category, None) is None for category in flood_categories): - all_messages.append([f'{lid}:missing threshold stages']) - continue + # Check if stages are supplied, if not write message and exit. 
+ if all(stages.get(category, None) is None for category in magnitudes): + msg = ':missing threshold stages' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + continue - try: - # Drop columns that offend acceptance criteria - usgs_elev_df['acceptable_codes'] = ( - usgs_elev_df['usgs_data_coord_accuracy_code'].isin(acceptable_coord_acc_code_list) - & usgs_elev_df['usgs_data_coord_method_code'].isin(acceptable_coord_method_code_list) - & usgs_elev_df['usgs_data_alt_method_code'].isin(acceptable_alt_meth_code_list) - & usgs_elev_df['usgs_data_site_type'].isin(acceptable_site_type_list) + acceptable_usgs_elev_df = __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id) + if acceptable_usgs_elev_df is None: + # This should only happen in a catastrophic code error. + # Exceptions inside the function, normally return usgs_elev_df or a variant of it + raise Exception("acceptable_usgs_elev_df failed to be created") + + # Get the dem_adj_elevation value from usgs_elev_table.csv. + # Prioritize the value that is not from branch 0. + lid_usgs_elev, dem_eval_messages = __adj_dem_evalation_val( + acceptable_usgs_elev_df, lid, huc_lid_id ) + all_messages += dem_eval_messages + if lid_usgs_elev is None: + continue - usgs_elev_df = usgs_elev_df.astype({'usgs_data_alt_accuracy_code': float}) - usgs_elev_df['acceptable_alt_error'] = np.where( - usgs_elev_df['usgs_data_alt_accuracy_code'] <= acceptable_alt_acc_thresh, True, False + # Initialize nested dict for lid attributes + stage_based_att_dict.update({lid: {}}) + + # Find lid metadata from master list of metadata dictionaries. + metadata = next( + (item for item in all_lists if item['identifiers']['nws_lid'] == lid.upper()), False ) + lid_altitude = metadata['usgs_data']['altitude'] - acceptable_usgs_elev_df = usgs_elev_df[ - (usgs_elev_df['acceptable_codes'] is True) & (usgs_elev_df['acceptable_alt_error'] is True) - ] - except Exception as e: - # Not sure any of the sites actually have those USGS-related - # columns in this particular file, so just assume it's fine to use + # Filter out sites that don't have "good" data + try: + ## Removed this part to relax coordinate accuracy requirements + # if not metadata['usgs_data']['coord_accuracy_code'] in acceptable_coord_acc_code_list: + # MP_LOG.warning( + # f"\t{huc_lid_id}: {metadata['usgs_data']['coord_accuracy_code']} " + # "Not in acceptable coord acc codes" + # ) + # continue + # if not metadata['usgs_data']['coord_method_code'] in acceptable_coord_method_code_list: + # MP_LOG.warning(f"\t{huc_lid_id}: Not in acceptable coord method codes") + # continue + if not metadata['usgs_data']['alt_method_code'] in acceptable_alt_meth_code_list: + MP_LOG.warning(f"{huc_lid_id}: Not in acceptable alt method codes") + continue + if not metadata['usgs_data']['site_type'] in acceptable_site_type_list: + MP_LOG.warning(f"{huc_lid_id}: Not in acceptable site type codes") + continue + if not float(metadata['usgs_data']['alt_accuracy_code']) <= acceptable_alt_acc_thresh: + MP_LOG.warning(f"{huc_lid_id}: Not in acceptable threshold range") + continue + except Exception: + MP_LOG.error(f"{huc_lid_id}: filtering out 'bad' data in the usgs_data") + MP_LOG.error(traceback.format_exc()) + continue - # print("(Various columns related to USGS probably not in this csv)") - print(f"Exception: \n {repr(e)} \n") - acceptable_usgs_elev_df = usgs_elev_df + datum_adj_ft, datum_messages = __adjust_datum_ft(flows, metadata, lid, huc_lid_id) - # Get the dem_adj_elevation value from usgs_elev_table.csv. 
- # Prioritize the value that is not from branch 0. - try: - matching_rows = acceptable_usgs_elev_df.loc[ - acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' - ] + all_messages = all_messages + datum_messages + if datum_adj_ft is None: + continue - if len(matching_rows) == 2: # It means there are two level paths, use the one that is not 0 - lid_usgs_elev = acceptable_usgs_elev_df.loc[ - (acceptable_usgs_elev_df['nws_lid'] == lid.upper()) - & (acceptable_usgs_elev_df['levpa_id'] != 0), - 'dem_adj_elevation', - ].values[0] - else: - lid_usgs_elev = acceptable_usgs_elev_df.loc[ - acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' - ].values[0] - except IndexError: # Occurs when LID is missing from table - all_messages.append( - [ - f'{lid}:likely unacceptable gage datum error or accuracy code(s); ' - 'please see acceptance criteria' - ] + ### -- Concluded Datum Offset --- ### + # Get mainstem segments of LID by intersecting LID segments with known mainstem segments. + unfiltered_segments = list(set(get_nwm_segs(metadata))) + + # Filter segments to be of like stream order. + desired_order = metadata['nwm_feature_data']['stream_order'] + segments = filter_nwm_segments_by_stream_order( + unfiltered_segments, desired_order, nwm_flows_region_df ) - continue - # Initialize nested dict for lid attributes - stage_based_att_dict.update({lid: {}}) + action_stage = stages['action'] + minor_stage = stages['minor'] + moderate_stage = stages['moderate'] + major_stage = stages['major'] - # Find lid metadata from master list of metadata dictionaries. - metadata = next((item for item in all_lists if item['identifiers']['nws_lid'] == lid.upper()), False) - lid_altitude = metadata['usgs_data']['altitude'] + stage_list = [ + i for i in [action_stage, minor_stage, moderate_stage, major_stage] if i is not None + ] - # Filter out sites that don't have "good" data - try: - if not metadata['usgs_data']['coord_accuracy_code'] in acceptable_coord_acc_code_list: - print( - f"\t{lid}: {metadata['usgs_data']['coord_accuracy_code']} " - "Not in acceptable coord acc codes" - ) - continue - if not metadata['usgs_data']['coord_method_code'] in acceptable_coord_method_code_list: - print(f"\t{lid}: Not in acceptable coord method codes") - continue - if not metadata['usgs_data']['alt_method_code'] in acceptable_alt_meth_code_list: - print(f"\t{lid}: Not in acceptable alt method codes") + if stage_list == []: + msg = ':no stage values available' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) continue - if not metadata['usgs_data']['site_type'] in acceptable_site_type_list: - print(f"\t{lid}: Not in acceptable site type codes") - continue - if not float(metadata['usgs_data']['alt_accuracy_code']) <= acceptable_alt_acc_thresh: - print(f"\t{lid}: Not in acceptable threshold range") + + # TODO: Aug 2024, Is it really ok that record is missing? 
hummm + # Earlier code lower was doing comparisons to see if the interval + # value was been each of these 4 but sometimes one or more was None + missing_stages = "" + for stage_type in ["action", "minor", "moderate", "major"]: + stage_val = stages[stage_type] + if stage_val is None: + if missing_stages != "": + missing_stages += ", " + missing_stages += stage_type + + if missing_stages != "": + msg = f':Missing Stages of {missing_stages}' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) continue - except Exception as e: - print(e) - continue - - ### --- Do Datum Offset --- ### - # determine source of interpolated threshold flows, this will be the rating curve that will be used. - rating_curve_source = flows.get('source') - if rating_curve_source is None: - all_messages.append([f'{lid}:No source for rating curve']) - continue - # Get the datum and adjust to NAVD if necessary. - nws_datum_info, usgs_datum_info = get_datum(metadata) - if rating_curve_source == 'USGS Rating Depot': - datum_data = usgs_datum_info - elif rating_curve_source == 'NRLDB': - datum_data = nws_datum_info - - # If datum not supplied, skip to new site - datum = datum_data.get('datum', None) - if datum is None: - all_messages.append([f'{lid}:datum info unavailable']) - continue - # ___________________________________________________________________________________________________# - # SPECIAL CASE: Workaround for "bmbp1" where the only valid datum is from NRLDB (USGS datum is null). - # Modifying rating curve source will influence the rating curve and - # datum retrieved for benchmark determinations. - if lid == 'bmbp1': - rating_curve_source = 'NRLDB' - # ___________________________________________________________________________________________________# - - # SPECIAL CASE: Custom workaround these sites have faulty crs from WRDS. CRS needed for NGVD29 - # conversion to NAVD88 - # USGS info indicates NAD83 for site: bgwn7, fatw3, mnvn4, nhpp1, pinn4, rgln4, rssk1, sign4, smfn7, - # stkn4, wlln7 - # Assumed to be NAD83 (no info from USGS or NWS data): dlrt2, eagi1, eppt2, jffw3, ldot2, rgdt2 - if lid in [ - 'bgwn7', - 'dlrt2', - 'eagi1', - 'eppt2', - 'fatw3', - 'jffw3', - 'ldot2', - 'mnvn4', - 'nhpp1', - 'pinn4', - 'rgdt2', - 'rgln4', - 'rssk1', - 'sign4', - 'smfn7', - 'stkn4', - 'wlln7', - ]: - datum_data.update(crs='NAD83') - # ___________________________________________________________________________________________________# - - # SPECIAL CASE: Workaround for bmbp1; CRS supplied by NRLDB is mis-assigned (NAD29) and - # is actually NAD27. - # This was verified by converting USGS coordinates (in NAD83) for bmbp1 to NAD27 and - # it matches NRLDB coordinates. - if lid == 'bmbp1': - datum_data.update(crs='NAD27') - # ___________________________________________________________________________________________________# - - # SPECIAL CASE: Custom workaround these sites have poorly defined vcs from WRDS. VCS needed to ensure - # datum reported in NAVD88. - # If NGVD29 it is converted to NAVD88. - # bgwn7, eagi1 vertical datum unknown, assume navd88 - # fatw3 USGS data indicates vcs is NAVD88 (USGS and NWS info agree on datum value). - # wlln7 USGS data indicates vcs is NGVD29 (USGS and NWS info agree on datum value). 
- if lid in ['bgwn7', 'eagi1', 'fatw3']: - datum_data.update(vcs='NAVD88') - elif lid == 'wlln7': - datum_data.update(vcs='NGVD29') - # ___________________________________________________________________________________________________# - - # Adjust datum to NAVD88 if needed - # Default datum_adj_ft to 0.0 - datum_adj_ft = 0.0 - crs = datum_data.get('crs') - if datum_data.get('vcs') in ['NGVD29', 'NGVD 1929', 'NGVD,1929', 'NGVD OF 1929', 'NGVD']: - # Get the datum adjustment to convert NGVD to NAVD. Sites not in contiguous US are previously - # removed otherwise the region needs changed. - try: - datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') - except Exception as e: - e = str(e) - if crs is None: - all_messages.append([f'{lid}:NOAA VDatum adjustment error, CRS is missing']) - if 'HTTPSConnectionPool' in e: - time.sleep(10) # Maybe the API needs a break, so wait 10 seconds - try: - datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') - except Exception: - all_messages.append([f'{lid}:NOAA VDatum adjustment error, possible API issue']) - if 'Invalid projection' in e: - all_messages.append( - [f'{lid}:NOAA VDatum adjustment error, invalid projection: crs={crs}'] - ) + + interval_list = np.arange( + min(stage_list), max(stage_list) + past_major_interval_cap, 1.0 + ) # Go an extra 10 ft beyond the max stage, arbitrary + + # Check for large discrepancies between the elevation values from WRDS and HAND. + # Otherwise this causes bad mapping. + elevation_diff = lid_usgs_elev - (lid_altitude * 0.3048) + if abs(elevation_diff) > 10: + msg = ':large discrepancy in elevation estimates from gage and HAND' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) continue - ### -- Concluded Datum Offset --- ### - # Get mainstem segments of LID by intersecting LID segments with known mainstem segments. - unfiltered_segments = list(set(get_nwm_segs(metadata))) - - # Filter segments to be of like stream order. - desired_order = metadata['nwm_feature_data']['stream_order'] - segments = filter_nwm_segments_by_stream_order(unfiltered_segments, desired_order, nwm_flows_df) - action_stage = stages['action'] - minor_stage = stages['minor'] - moderate_stage = stages['moderate'] - major_stage = stages['major'] - stage_list = [i for i in [action_stage, minor_stage, moderate_stage, major_stage] if i is not None] - # Create a list of stages, incrementing by 1 ft. - if stage_list == []: - all_messages.append([f'{lid}:no stage values available']) - continue - interval_list = np.arange( - min(stage_list), max(stage_list) + past_major_interval_cap, 1.0 - ) # Go an extra 10 ft beyond the max stage, arbitrary - - # Check for large discrepancies between the elevation values from WRDS and HAND. - # Otherwise this causes bad mapping. - elevation_diff = lid_usgs_elev - (lid_altitude * 0.3048) - if abs(elevation_diff) > 10: - all_messages.append([f'{lid}:large discrepancy in elevation estimates from gage and HAND']) - continue - - # For each flood category - for category in flood_categories: - # Pull stage value and confirm it's valid, then process - stage = stages[category] - - if stage is not None and datum_adj_ft is not None and lid_altitude is not None: - # Call function to execute mapping of the TIFs. 
- (messages, hand_stage, datum_adj_wse, datum_adj_wse_m) = produce_stage_based_catfim_tifs( - stage, - datum_adj_ft, - branch_dir, - lid_usgs_elev, - lid_altitude, - fim_dir, - segments, - lid, - huc, - lid_directory, - category, - number_of_jobs, - ) - all_messages += messages - - # Extra metadata for alternative CatFIM technique. - # TODO Revisit because branches complicate things - stage_based_att_dict[lid].update( - { - category: { - 'datum_adj_wse_ft': datum_adj_wse, - 'datum_adj_wse_m': datum_adj_wse_m, - 'hand_stage': hand_stage, - 'datum_adj_ft': datum_adj_ft, - 'lid_alt_ft': lid_altitude, - 'lid_alt_m': lid_altitude * 0.3048, - } - } - ) - # If missing HUC file data, write message - if huc in missing_huc_files: - all_messages.append([f'{lid}:missing some HUC data']) + # This function sometimes is called within a MP but sometimes not. + # So, we might have an MP inside an MP + # and we will need a new prefix for it. - # Now that the "official" category maps are made, produce the incremental maps. - with ProcessPoolExecutor(max_workers=number_of_interval_jobs) as executor: - try: - for interval_stage in interval_list: - # Determine category the stage value belongs with. - if action_stage <= interval_stage < minor_stage: - category = 'action_' + str(interval_stage).replace('.', 'p') + 'ft' - if minor_stage <= interval_stage < moderate_stage: - category = 'minor_' + str(interval_stage).replace('.', 'p') + 'ft' - if moderate_stage <= interval_stage < major_stage: - category = 'moderate_' + str(interval_stage).replace('.', 'p') + 'ft' - if interval_stage >= major_stage: - category = 'major_' + str(interval_stage).replace('.', 'p') + 'ft' - executor.submit( - produce_stage_based_catfim_tifs, - interval_stage, + # Becuase we already are in an MP, lets merge up what we have at this point + # Before creating child MP files + MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix) + + # For each flood category / magnitude + MP_LOG.lprint(f"{huc_lid_id}: About to process flood categories") + child_log_file_prefix = MP_LOG.MP_calc_prefix_name( + parent_log_output_file, "MP_produce_catfim_tifs" + ) + # print(f"child log name is {child_log_file_prefix}") + for category in magnitudes: + # MP_LOG.lprint(f"{huc_lid_id}: Magnitude is {category}") + # Pull stage value and confirm it's valid, then process + stage = stages[category] + + if stage is not None and datum_adj_ft is not None and lid_altitude is not None: + # Call function to execute mapping of the TIFs. + + # These are the 5 magnitudes being inundated at their stage value + (messages, hand_stage, datum_adj_wse, datum_adj_wse_m) = produce_stage_based_catfim_tifs( + stage, datum_adj_ft, branch_dir, lid_usgs_elev, @@ -670,116 +644,466 @@ def iterate_through_huc_stage_based( segments, lid, huc, - lid_directory, + mapping_lid_directory, category, - number_of_jobs, + job_number_inundate, + parent_log_output_file, + child_log_file_prefix, + ) + all_messages += messages + + # Extra metadata for alternative CatFIM technique. + # TODO Revisit because branches complicate things + stage_based_att_dict[lid].update( + { + category: { + 'datum_adj_wse_ft': datum_adj_wse, + 'datum_adj_wse_m': datum_adj_wse_m, + 'hand_stage': hand_stage, + 'datum_adj_ft': datum_adj_ft, + 'lid_alt_ft': lid_altitude, + 'lid_alt_m': lid_altitude * 0.3048, + } + } ) - except TypeError: # sometimes the thresholds are Nonetypes - pass - # Create a csv with same information as geopackage but with each threshold as new record. - # Probably a less verbose way. 
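As a worked illustration of the interval handling in this function: the interval list built above steps in 1 ft increments from the lowest threshold to `past_major_interval_cap` feet beyond the highest, and each interval later receives an `action_3p0ft`-style label. The sketch below is standalone and uses made-up threshold values; only the `np.arange` call and the `'.'` to `'p'` naming convention are taken from the diff.

```python
import numpy as np

# Hypothetical threshold stages in feet; the real values come from WRDS.
action_stage, minor_stage, moderate_stage, major_stage = 3.0, 5.0, 8.0, 11.0
past_major_interval_cap = 5.0  # matches the -mc default in this PR

stage_list = [action_stage, minor_stage, moderate_stage, major_stage]

# Same construction as above: 1 ft steps from the lowest stage up to
# (highest stage + cap).
interval_list = np.arange(min(stage_list), max(stage_list) + past_major_interval_cap, 1.0)

for interval_stage in interval_list:
    if action_stage <= interval_stage < minor_stage:
        prefix = "action"
    elif minor_stage <= interval_stage < moderate_stage:
        prefix = "minor"
    elif moderate_stage <= interval_stage < major_stage:
        prefix = "moderate"
    else:  # interval_stage >= major_stage
        prefix = "major"
    category = prefix + "_" + str(interval_stage).replace(".", "p") + "ft"
    print(category)  # action_3p0ft, action_4p0ft, minor_5p0ft, ... major_15p0ft
```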
- csv_df = pd.DataFrame() - for threshold in flood_categories: - try: - line_df = pd.DataFrame( - { - 'nws_lid': [lid], - 'name': metadata['nws_data']['name'], - 'WFO': metadata['nws_data']['wfo'], - 'rfc': metadata['nws_data']['rfc'], - 'huc': [huc], - 'state': metadata['nws_data']['state'], - 'county': metadata['nws_data']['county'], - 'magnitude': threshold, - 'q': flows[threshold], - 'q_uni': flows['units'], - 'q_src': flows['source'], - 'stage': stages[threshold], - 'stage_uni': stages['units'], - 's_src': stages['source'], - 'wrds_time': stages['wrds_timestamp'], - 'nrldb_time': metadata['nrldb_timestamp'], - 'nwis_time': metadata['nwis_timestamp'], - 'lat': [float(metadata['nws_preferred']['latitude'])], - 'lon': [float(metadata['nws_preferred']['longitude'])], - 'dtm_adj_ft': stage_based_att_dict[lid][threshold]['datum_adj_ft'], - 'dadj_w_ft': stage_based_att_dict[lid][threshold]['datum_adj_wse_ft'], - 'dadj_w_m': stage_based_att_dict[lid][threshold]['datum_adj_wse_m'], - 'lid_alt_ft': stage_based_att_dict[lid][threshold]['lid_alt_ft'], - 'lid_alt_m': stage_based_att_dict[lid][threshold]['lid_alt_m'], - } - ) - csv_df = pd.concat([csv_df, line_df]) - - except Exception as e: - print(e) - - # Round flow and stage columns to 2 decimal places. - csv_df = csv_df.round({'q': 2, 'stage': 2}) - # If a site folder exists (ie a flow file was written) save files containing site attributes. - output_dir = os.path.join(workspace, huc, lid) - if os.path.exists(output_dir): - # Export DataFrame to csv containing attributes - csv_df.to_csv(os.path.join(attributes_dir, f'{lid}_attributes.csv'), index=False) + # If missing HUC file data, write message + if huc in missing_huc_files: + msg = ':missing some HUC data' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + + # So, we might have an MP inside an MP + # let's merge what we have at this point, before we go into another MP + MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix, True) + + # Now we will do another set of inundations, but this one is based on + # not the stage flow but flow based on each interval + tif_child_log_file_prefix = MP_LOG.MP_calc_prefix_name(parent_log_output_file, "MP_prod_sb_tifs") + with ProcessPoolExecutor(max_workers=job_number_intervals) as executor: + try: + # MP_LOG.lprint(f"{huc_lid_id} : action_stage is {action_stage}") + # MP_LOG.lprint(f"{huc_lid_id} : minor_stage is {minor_stage}") + # MP_LOG.lprint(f"{huc_lid_id} : moderate_stage is {moderate_stage}") + # MP_LOG.lprint(f"{huc_lid_id} : major_stage is {major_stage}") + + for interval_stage in interval_list: + # MP_LOG.lprint(f"{huc_lid_id} : interval_stage is {interval_stage}") + + # Determine category the stage value belongs with. 
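The child-prefix / merge pattern that appears here (and throughout this PR) can be condensed into a short sketch: each multiprocessing child writes to its own prefixed log file, and the parent merges those files into the master log once the pool finishes. The sketch uses only the `fim_logger` calls visible in this diff (`FIM_logger()`, `MP_Log_setup`, `MP_calc_prefix_name`, `merge_log_files`, `lprint`); the worker function and its arguments are hypothetical.

```python
# Condensed sketch of the MP logging pattern used in this PR (not part of the diff).
from concurrent.futures import ProcessPoolExecutor

import utils.fim_logger as fl

FLOG = fl.FIM_logger()    # parent / master logger
MP_LOG = fl.FIM_logger()  # re-initialized inside each child process


def example_worker(item, parent_log_output_file, child_log_file_prefix):
    # Each child logs to its own prefixed file so parallel writes do not collide.
    MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix)
    MP_LOG.lprint(f"processing {item}")


def run(items, parent_log_output_file):
    # Derive a unique prefix for this pool's child log files.
    child_log_file_prefix = FLOG.MP_calc_prefix_name(parent_log_output_file, "MP_example_worker")

    with ProcessPoolExecutor(max_workers=4) as executor:
        for item in items:
            executor.submit(example_worker, item, parent_log_output_file, child_log_file_prefix)

    # Roll the child log files back up into the master log. The trailing True
    # mirrors the calls in the diff; its exact meaning is defined in fim_logger.
    FLOG.merge_log_files(parent_log_output_file, child_log_file_prefix, True)
```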
+ if action_stage <= interval_stage < minor_stage: + category = 'action_' + str(interval_stage).replace('.', 'p') + 'ft' + if minor_stage <= interval_stage < moderate_stage: + category = 'minor_' + str(interval_stage).replace('.', 'p') + 'ft' + if moderate_stage <= interval_stage < major_stage: + category = 'moderate_' + str(interval_stage).replace('.', 'p') + 'ft' + if interval_stage >= major_stage: + category = 'major_' + str(interval_stage).replace('.', 'p') + 'ft' + executor.submit( + produce_stage_based_catfim_tifs, + interval_stage, + datum_adj_ft, + branch_dir, + lid_usgs_elev, + lid_altitude, + fim_dir, + segments, + lid, + huc, + mapping_lid_directory, + category, + job_number_inundate, + parent_log_output_file, + tif_child_log_file_prefix, + ) + except TypeError: # sometimes the thresholds are Nonetypes + MP_LOG.error("ERROR: type error in ProcessPool, likely in the interval tests") + MP_LOG.error(traceback.format_exc()) + continue + + except Exception: + MP_LOG.critical("ERROR: ProcessPool has an error") + MP_LOG.critical(traceback.format_exc()) + # merge MP Logs (Yes) + MP_LOG.merge_log_files(parent_log_output_file, tif_child_log_file_prefix, True) + sys.exit(1) + + # merge MP Logs (merging MP into an MP (proc_pool in a proc_pool)) + MP_LOG.merge_log_files(parent_log_output_file, tif_child_log_file_prefix, True) + + # Create a csv with same information as geopackage but with each threshold as new record. + # Probably a less verbose way. + csv_df = pd.DataFrame() + for threshold in magnitudes: + try: + line_df = pd.DataFrame( + { + 'nws_lid': [lid], + 'name': metadata['nws_data']['name'], + 'WFO': metadata['nws_data']['wfo'], + 'rfc': metadata['nws_data']['rfc'], + 'huc': [huc], + 'state': metadata['nws_data']['state'], + 'county': metadata['nws_data']['county'], + 'magnitude': threshold, + 'q': flows[threshold], + 'q_uni': flows['units'], + 'q_src': flows['source'], + 'stage': stages[threshold], + 'stage_uni': stages['units'], + 's_src': stages['source'], + 'wrds_time': stages['wrds_timestamp'], + 'nrldb_time': metadata['nrldb_timestamp'], + 'nwis_time': metadata['nwis_timestamp'], + 'lat': [float(metadata['nws_preferred']['latitude'])], + 'lon': [float(metadata['nws_preferred']['longitude'])], + 'dtm_adj_ft': stage_based_att_dict[lid][threshold]['datum_adj_ft'], + 'dadj_w_ft': stage_based_att_dict[lid][threshold]['datum_adj_wse_ft'], + 'dadj_w_m': stage_based_att_dict[lid][threshold]['datum_adj_wse_m'], + 'lid_alt_ft': stage_based_att_dict[lid][threshold]['lid_alt_ft'], + 'lid_alt_m': stage_based_att_dict[lid][threshold]['lid_alt_m'], + } + ) + csv_df = pd.concat([csv_df, line_df]) + + except Exception: + MP_LOG.error("ERROR: threshold has an error") + MP_LOG.error(traceback.format_exc()) + return + # sys.exit(1) + + # Round flow and stage columns to 2 decimal places. + csv_df = csv_df.round({'q': 2, 'stage': 2}) + + # If a site folder exists (ie a flow file was written) save files containing site attributes. + if os.path.exists(mapping_lid_directory): + # Export DataFrame to csv containing attributes + attributes_filepath = os.path.join(attributes_dir, f'{lid}_attributes.csv') + + csv_df.to_csv(attributes_filepath, index=False) + else: + msg = ':missing all calculated flows' + all_messages.append(lid + msg) + MP_LOG.error(huc_lid_id + msg) + + # If it made it to this point (i.e. 
no continues), there were no major preventers of mapping + all_messages.append(lid + ':OK') + MP_LOG.success(f'{huc_lid_id}: Complete') + # mark_complete(mapping_lid_directory) + + # Write all_messages by HUC to be scraped later. + if len(all_messages) > 0: + + # TODO: Aug 2024: This is now identical to the way flow handles messages + # but the system should probably be changed to somethign more elegant but good enough + # for now. At least is is MP safe. + huc_messages_txt_file = os.path.join(huc_messages_dir, str(huc) + '_messages.txt') + with open(huc_messages_txt_file, 'w') as f: + for item in all_messages: + item = item.strip() + # f.write("%s\n" % item) + f.write(f"{item}\n") + else: + # something has likely gone very poorly. + MP_LOG.error(f"No messages found for {huc}") + sys.exit(1) + + except Exception: + MP_LOG.error(f"{huc} : {lid} Error iterating through huc stage based") + MP_LOG.error(traceback.format_exc()) + + return + + +def __adjust_datum_ft(flows, metadata, lid, huc_lid_id): + + # TODO: Aug 2024: This whole parts needs revisiting. Lots of lid data has changed and this + # is all likely very old. + + # Jul 2024: For now, we will duplicate messages via all_messsages and via the logging system. + all_messages = [] + + datum_adj_ft = None + ### --- Do Datum Offset --- ### + # determine source of interpolated threshold flows, this will be the rating curve that will be used. + rating_curve_source = flows.get('source') + if rating_curve_source is None: + msg = f'{huc_lid_id}:No source for rating curve' + all_messages.append(msg) + MP_LOG.warning(msg) + return None, all_messages + + # Get the datum and adjust to NAVD if necessary. + nws_datum_info, usgs_datum_info = get_datum(metadata) + if rating_curve_source == 'USGS Rating Depot': + datum_data = usgs_datum_info + elif rating_curve_source == 'NRLDB': + datum_data = nws_datum_info + + # If datum not supplied, skip to new site + datum = datum_data.get('datum', None) + if datum is None: + msg = f'{huc_lid_id}:datum info unavailable' + all_messages.append(msg) + MP_LOG.warning(msg) + return None, all_messages + + # ___________________________________________________________________________________________________# + # SPECIAL CASE: Workaround for "bmbp1" where the only valid datum is from NRLDB (USGS datum is null). + # Modifying rating curve source will influence the rating curve and + # datum retrieved for benchmark determinations. + if lid == 'bmbp1': + rating_curve_source = 'NRLDB' + # ___________________________________________________________________________________________________# + + # SPECIAL CASE: Custom workaround these sites have faulty crs from WRDS. CRS needed for NGVD29 + # conversion to NAVD88 + # USGS info indicates NAD83 for site: bgwn7, fatw3, mnvn4, nhpp1, pinn4, rgln4, rssk1, sign4, smfn7, + # stkn4, wlln7 + # Assumed to be NAD83 (no info from USGS or NWS data): dlrt2, eagi1, eppt2, jffw3, ldot2, rgdt2 + if lid in [ + 'bgwn7', + 'dlrt2', + 'eagi1', + 'eppt2', + 'fatw3', + 'jffw3', + 'ldot2', + 'mnvn4', + 'nhpp1', + 'pinn4', + 'rgdt2', + 'rgln4', + 'rssk1', + 'sign4', + 'smfn7', + 'stkn4', + 'wlln7', + ]: + datum_data.update(crs='NAD83') + # ___________________________________________________________________________________________________# + + # SPECIAL CASE: Workaround for bmbp1; CRS supplied by NRLDB is mis-assigned (NAD29) and + # is actually NAD27. + # This was verified by converting USGS coordinates (in NAD83) for bmbp1 to NAD27 and + # it matches NRLDB coordinates. 
+ if lid == 'bmbp1': + datum_data.update(crs='NAD27') + # ___________________________________________________________________________________________________# + + # SPECIAL CASE: Custom workaround these sites have poorly defined vcs from WRDS. VCS needed to ensure + # datum reported in NAVD88. + # If NGVD29 it is converted to NAVD88. + # bgwn7, eagi1 vertical datum unknown, assume navd88 + # fatw3 USGS data indicates vcs is NAVD88 (USGS and NWS info agree on datum value). + # wlln7 USGS data indicates vcs is NGVD29 (USGS and NWS info agree on datum value). + if lid in ['bgwn7', 'eagi1', 'fatw3']: + datum_data.update(vcs='NAVD88') + elif lid == 'wlln7': + datum_data.update(vcs='NGVD29') + # ___________________________________________________________________________________________________# + + # Adjust datum to NAVD88 if needed + # Default datum_adj_ft to 0.0 + datum_adj_ft = 0.0 + crs = datum_data.get('crs') + if datum_data.get('vcs') in ['NGVD29', 'NGVD 1929', 'NGVD,1929', 'NGVD OF 1929', 'NGVD']: + # Get the datum adjustment to convert NGVD to NAVD. Sites not in contiguous US are previously + # removed otherwise the region needs changed. + try: + datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') + except Exception as ex: + MP_LOG.error(f"ERROR: {huc_lid_id}: ngvd_to_navd_ft") + MP_LOG.error(traceback.format_exc()) + ex = str(ex) + if crs is None: + msg = f'{huc_lid_id}:NOAA VDatum adjustment error, CRS is missing' + all_messages.append(msg) + MP_LOG.error(msg) + if 'HTTPSConnectionPool' in ex: + time.sleep(10) # Maybe the API needs a break, so wait 10 seconds + try: + datum_adj_ft = ngvd_to_navd_ft(datum_info=datum_data, region='contiguous') + except Exception: + msg = f'{huc_lid_id}:NOAA VDatum adjustment error, possible API issue' + all_messages.append(msg) + MP_LOG.error(msg) + if 'Invalid projection' in ex: + msg = f'{huc_lid_id}:NOAA VDatum adjustment error, invalid projection: crs={crs}' + all_messages.append(msg) + MP_LOG.error(msg) + return None, all_messages + + return datum_adj_ft, all_messages + + +def __create_acceptable_usgs_elev_df(usgs_elev_df, huc_lid_id): + acceptable_usgs_elev_df = None + try: + # Drop columns that offend acceptance criteria + usgs_elev_df['acceptable_codes'] = ( + # usgs_elev_df['usgs_data_coord_accuracy_code'].isin(acceptable_coord_acc_code_list) + # & usgs_elev_df['usgs_data_coord_method_code'].isin(acceptable_coord_method_code_list) + usgs_elev_df['usgs_data_alt_method_code'].isin(acceptable_alt_meth_code_list) + & usgs_elev_df['usgs_data_site_type'].isin(acceptable_site_type_list) + ) + + usgs_elev_df = usgs_elev_df.astype({'usgs_data_alt_accuracy_code': float}) + usgs_elev_df['acceptable_alt_error'] = np.where( + usgs_elev_df['usgs_data_alt_accuracy_code'] <= acceptable_alt_acc_thresh, True, False + ) + + acceptable_usgs_elev_df = usgs_elev_df[ + (usgs_elev_df['acceptable_codes'] == True) & (usgs_elev_df['acceptable_alt_error'] == True) + ] + + # # TEMP DEBUG Record row difference and write it to a CSV or something + # label = 'Old code' ## TEMP DEBUG + # num_potential_rows = usgs_elev_df.shape[0] + # num_acceptable_rows = acceptable_usgs_elev_df.shape[0] + # out_message = f'{label}: kept {num_acceptable_rows} rows out of {num_potential_rows} available rows.' 
+ + except Exception: + # Not sure any of the sites actually have those USGS-related + # columns in this particular file, so just assume it's fine to use + + # print("(Various columns related to USGS probably not in this csv)") + # print(f"Exception: \n {repr(e)} \n") + MP_LOG.error(f"{huc_lid_id}: An error has occurred while working with the usgs_elev table") + MP_LOG.error(traceback.format_exc()) + acceptable_usgs_elev_df = usgs_elev_df + + return acceptable_usgs_elev_df + + +def __adj_dem_evalation_val(acceptable_usgs_elev_df, lid, huc_lid_id): + + lid_usgs_elev = None + all_messages = [] + try: + matching_rows = acceptable_usgs_elev_df.loc[ + acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' + ] + + if len(matching_rows) == 2: # It means there are two level paths, use the one that is not 0 + lid_usgs_elev = acceptable_usgs_elev_df.loc[ + (acceptable_usgs_elev_df['nws_lid'] == lid.upper()) + & (acceptable_usgs_elev_df['levpa_id'] != 0), + 'dem_adj_elevation', + ].values[0] else: - all_messages.append([f'{lid}:missing all calculated flows']) + lid_usgs_elev = acceptable_usgs_elev_df.loc[ + acceptable_usgs_elev_df['nws_lid'] == lid.upper(), 'dem_adj_elevation' + ].values[0] - # If it made it to this point (i.e. no continues), there were no major preventers of mapping - all_messages.append([f'{lid}:OK']) - mark_complete(output_dir) - # Write all_messages by HUC to be scraped later. - messages_dir = os.path.join(workspace, 'messages') - if not os.path.exists(messages_dir): - os.mkdir(messages_dir) - huc_messages_csv = os.path.join(messages_dir, huc + '_messages.csv') - with open(huc_messages_csv, 'w') as output_csv: - writer = csv.writer(output_csv) - writer.writerows(all_messages) + except IndexError: # Occurs when LID is missing from table (yes. warning) + MP_LOG.warning(f"{huc_lid_id}: adjusting dem_adj_elevation") + MP_LOG.warning(traceback.format_exc()) + msg = ':likely unacceptable gage datum error or accuracy code(s); please see acceptance criteria' + all_messages.append(lid + msg) + MP_LOG.warning(huc_lid_id + msg) + return lid_usgs_elev, all_messages + +# This creates a HUC iterator with each HUC creating its flow files and tifs def generate_stage_based_categorical_fim( - workspace, - fim_version, - fim_dir, + output_catfim_dir, + fim_run_dir, nwm_us_search, nwm_ds_search, - number_of_jobs, lid_to_run, - attributes_dir, - number_of_interval_jobs, - past_major_interval_cap, + env_file, + job_number_inundate, job_number_huc, + lst_hucs, + job_number_intervals, + past_major_interval_cap, + nwm_metafile, ): - flood_categories = ['action', 'minor', 'moderate', 'major', 'record'] - - (huc_dictionary, out_gdf, metadata_url, threshold_url, all_lists, nwm_flows_df) = generate_catfim_flows( - workspace, nwm_us_search, nwm_ds_search, stage_based=True, fim_dir=fim_dir, lid_to_run=lid_to_run + magnitudes = ['action', 'minor', 'moderate', 'major', 'record'] + + output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') + attributes_dir = os.path.join(output_catfim_dir, 'attributes') + + # Create HUC message directory to store messages that will be read and joined after multiprocessing + huc_messages_dir = os.path.join(output_mapping_dir, 'huc_messages') + os.makedirs(huc_messages_dir, exist_ok=True) + + FLOG.lprint("Starting generate_flows (Stage Based)") + + # If it is stage based, generate flows returns all of these objects. 
+ # If flow based, generate flows returns only + # (huc_dictionary, out_gdf, ___, threshold_url, all_lists, nwm_flows_df, nwm_flows_alaska_df) = generate_flows( # With Alaska + + # Generate flows is only using one of the incoming job number params + # so let's multiply -jh (huc) and -jn (inundate) + job_flows = job_number_huc * job_number_inundate + (huc_dictionary, out_gdf, ___, threshold_url, all_lists, all_nwm_flows_df) = generate_flows( # No Alaska + output_catfim_dir, + nwm_us_search, + nwm_ds_search, + lid_to_run, + env_file, + job_flows, + True, + lst_hucs, + nwm_metafile, + str(FLOG.LOG_FILE_PATH), ) + FLOG.lprint("End generate_flows (Stage Based)") + + child_log_file_prefix = FLOG.MP_calc_prefix_name(FLOG.LOG_FILE_PATH, "MP_iter_hucs") + + FLOG.lprint(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") + FLOG.lprint("Start processing HUCs for Stage-Based CatFIM") + num_hucs = len(lst_hucs) + huc_index = 0 + FLOG.lprint(f"Number of hucs to process is {num_hucs}") with ProcessPoolExecutor(max_workers=job_number_huc) as executor: for huc in huc_dictionary: - executor.submit( - iterate_through_huc_stage_based, - workspace, - huc, - fim_dir, - huc_dictionary, - threshold_url, - flood_categories, - all_lists, - past_major_interval_cap, - number_of_jobs, - number_of_interval_jobs, - attributes_dir, - nwm_flows_df, - ) + if huc in lst_hucs: + # FLOG.lprint(f'Generating stage based catfim for : {huc}') + + # Code variation for DROPPING Alaska HUCs + nwm_flows_region_df = all_nwm_flows_df + + # # Code variation for keeping alaska HUCs + # nwm_flows_region_df = nwm_flows_alaska_df if str(huc[:2]) == '19' else nwm_flows_df + + progress_stmt = f"index {huc_index + 1} of {num_hucs}" + executor.submit( + iterate_through_huc_stage_based, + output_catfim_dir, + huc, + fim_run_dir, + huc_dictionary, + threshold_url, + magnitudes, + all_lists, + past_major_interval_cap, + job_number_inundate, + job_number_intervals, + nwm_flows_region_df, + str(FLOG.LOG_FILE_PATH), + child_log_file_prefix, + progress_stmt, + ) + huc_index += 1 + # Need to merge MP logs here, merged into the "master log file" + + FLOG.merge_log_files(FLOG.LOG_FILE_PATH, child_log_file_prefix, True) + + FLOG.lprint('Wrapping up processing HUCs for Stage-Based CatFIM...') + FLOG.lprint(">>>>>>>>>>>>>>>>>>>>>>>>>>>>") + + csv_files = [x for x in os.listdir(attributes_dir) if x.endswith('.csv')] - print('Wrapping up Stage-Based CatFIM...') - csv_files = os.listdir(attributes_dir) all_csv_df = pd.DataFrame() refined_csv_files_list = [] for csv_file in csv_files: + full_csv_path = os.path.join(attributes_dir, csv_file) # HUC has to be read in as string to preserve leading zeros. try: @@ -787,15 +1111,21 @@ def generate_stage_based_categorical_fim( all_csv_df = pd.concat([all_csv_df, temp_df], ignore_index=True) refined_csv_files_list.append(csv_file) except Exception: # Happens if a file is empty (i.e. no mapping) + FLOG.error(f"ERROR: loading csv {full_csv_path}") + FLOG.error(traceback.format_exc()) pass + # Write to file - all_csv_df.to_csv(os.path.join(workspace, 'nws_lid_attributes.csv'), index=False) + all_csv_df.to_csv(os.path.join(attributes_dir, 'nws_lid_attributes.csv'), index=False) # This section populates a geopackage of all potential sites and details # whether it was mapped or not (mapped field) and if not, why (status field). # Preprocess the out_gdf GeoDataFrame. Reproject and reformat fields. - viz_out_gdf = out_gdf.to_crs(VIZ_PROJECTION) + + # TODO: Accomodate AK projection? Yes.. 
and Alaska and CONUS should all end up as the same projection output + # epsg:5070, we really want 3857 out for all outputs + viz_out_gdf = out_gdf.to_crs(VIZ_PROJECTION) # TODO: Accomodate AK projection? viz_out_gdf.rename( columns={ 'identifiers_nwm_feature_id': 'nwm_seg', @@ -819,39 +1149,48 @@ def generate_stage_based_categorical_fim( viz_out_gdf = viz_out_gdf.merge(lids_df, how='left', on='nws_lid') viz_out_gdf['mapped'] = viz_out_gdf['mapped'].fillna('no') - # Create list from all messages in messages dir. - messages_dir = os.path.join(workspace, 'messages') - all_messages = [] - all_message_csvs = os.listdir(messages_dir) - for message_csv in all_message_csvs: - full_message_csv_path = os.path.join(messages_dir, message_csv) - with open(full_message_csv_path, newline='') as message_file: - reader = csv.reader(message_file) - for row in reader: - all_messages.append(row) + # Read all messages for all HUCs + # This is basically identical to a chunk in flow based. At a min, consolidate + # or better yet, find a more elegant, yet still MP safe, system than .txt files + # but it works.. so maybe someday. + huc_message_list = [] + huc_messages_dir_list = os.listdir(huc_messages_dir) + for huc_message_file in huc_messages_dir_list: + full_path_file = os.path.join(huc_messages_dir, huc_message_file) + with open(full_path_file, 'r') as f: + if full_path_file.endswith('.txt'): + lines = f.readlines() + for line in lines: + line = line.strip() + huc_message_list.append(line) # Filter out columns and write out to file - nws_sites_layer = os.path.join(workspace, 'nws_lid_sites.gpkg') + # flow based doesn't make it here only stage + nws_lid_gpkg_file_path = os.path.join(output_mapping_dir, 'stage_based_catfim_sites.gpkg') - # Only write to sites geopackage if it didn't exist yet - # (and this line shouldn't have been reached if we had an interrupted - # run previously and are picking back up with a restart) - if not os.path.exists(nws_sites_layer): - # Write messages to DataFrame, split into columns, aggregate messages. - messages_df = pd.DataFrame(all_messages, columns=['message']) + # Write messages to DataFrame, split into columns, aggregate messages. + if len(huc_message_list) > 0: + + FLOG.lprint(f"nws_sites_layer ({nws_lid_gpkg_file_path}) : adding messages") + messages_df = pd.DataFrame(huc_message_list, columns=['message']) messages_df = ( messages_df['message'] .str.split(':', n=1, expand=True) .rename(columns={0: 'nws_lid', 1: 'status'}) ) - status_df = messages_df.groupby(['nws_lid'])['status'].apply(', '.join).reset_index() + + # We want one viz_out_gdf record per ahps and if there are more than one, contact the messages + # status_df = messages_df.groupby(['nws_lid'])['status'].apply(', '.join).reset_index() + status_df = messages_df.groupby(['nws_lid'])['status'].agg(lambda x: ',\n'.join(x)).reset_index() # Join messages to populate status field to candidate sites. Assign # status for null fields. 
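To make the message-aggregation step above concrete, here is a tiny standalone sketch with made-up messages. It mirrors the split-on-first-colon and groupby/agg logic from the diff; the LIDs and statuses are hypothetical.

```python
import pandas as pd

# Hypothetical per-HUC messages in the '{lid}:{status}' format used by this PR.
huc_message_list = [
    "abcd1:OK",
    "efgh2:missing threshold stages",
    "efgh2:missing some HUC data",
]

messages_df = pd.DataFrame(huc_message_list, columns=["message"])

# Split only on the first colon so statuses that contain colons stay intact.
messages_df = (
    messages_df["message"]
    .str.split(":", n=1, expand=True)
    .rename(columns={0: "nws_lid", 1: "status"})
)

# One record per LID; multiple statuses for the same LID are concatenated,
# matching the agg(lambda ...) call above.
status_df = messages_df.groupby(["nws_lid"])["status"].agg(lambda x: ",\n".join(x)).reset_index()

# 'abcd1' keeps its single 'OK' status; 'efgh2' ends up with both of its
# messages joined into one status field.
print(status_df.to_dict(orient="records"))
```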
viz_out_gdf = viz_out_gdf.merge(status_df, how='left', on='nws_lid') - # viz_out_gdf['status'] = viz_out_gdf['status'].fillna('OK') + # viz_out_gdf.reset_index(inplace=True) + + viz_out_gdf['status'] = viz_out_gdf['status'].fillna('undetermined') # Add acceptance criteria to viz_out_gdf before writing viz_out_gdf['acceptable_coord_acc_code_list'] = str(acceptable_coord_acc_code_list) @@ -860,148 +1199,14 @@ def generate_stage_based_categorical_fim( viz_out_gdf['acceptable_alt_meth_code_list'] = str(acceptable_alt_meth_code_list) viz_out_gdf['acceptable_site_type_list'] = str(acceptable_site_type_list) - viz_out_gdf.to_file(nws_sites_layer, driver='GPKG') - - return nws_sites_layer - - -def produce_stage_based_catfim_tifs( - stage, - datum_adj_ft, - branch_dir, - lid_usgs_elev, - lid_altitude, - fim_dir, - segments, - lid, - huc, - lid_directory, - category, - number_of_jobs, -): - messages = [] - - # Determine datum-offset water surface elevation (from above). - datum_adj_wse = stage + datum_adj_ft + lid_altitude - datum_adj_wse_m = datum_adj_wse * 0.3048 # Convert ft to m - - # Subtract HAND gage elevation from HAND WSE to get HAND stage. - hand_stage = datum_adj_wse_m - lid_usgs_elev - - # Produce extent tif hand_stage. Multiprocess across branches. - branches = os.listdir(branch_dir) - with ProcessPoolExecutor(max_workers=number_of_jobs) as executor: - for branch in branches: - # Define paths to necessary files to produce inundation grids. - full_branch_path = os.path.join(branch_dir, branch) - rem_path = os.path.join(fim_dir, huc, full_branch_path, 'rem_zeroed_masked_' + branch + '.tif') - catchments_path = os.path.join( - fim_dir, - huc, - full_branch_path, - 'gw_catchments_reaches_filtered_addedAttributes_' + branch + '.tif', - ) - hydrotable_path = os.path.join(fim_dir, huc, full_branch_path, 'hydroTable_' + branch + '.csv') - - if not os.path.exists(rem_path): - messages.append([f"{lid}:rem doesn't exist"]) - continue - if not os.path.exists(catchments_path): - messages.append([f"{lid}:catchments files don't exist"]) - continue - if not os.path.exists(hydrotable_path): - messages.append([f"{lid}:hydrotable doesn't exist"]) - continue - - # Use hydroTable to determine hydroid_list from site_ms_segments. - hydrotable_df = pd.read_csv(hydrotable_path) - hydroid_list = [] - - # Determine hydroids at which to perform inundation - for feature_id in segments: - try: - subset_hydrotable_df = hydrotable_df[hydrotable_df['feature_id'] == int(feature_id)] - hydroid_list += list(subset_hydrotable_df.HydroID.unique()) - except IndexError: - pass - - # Some branches don't have matching hydroids - # if len(hydroid_list) == 0: - # messages.append(f"{lid}:no matching hydroids") - # continue - - # If no segments, write message and exit out - if not segments: - messages.append([f'{lid}:missing nwm segments']) - continue - - # Create inundation maps with branch and stage data - try: - print("Generating stage-based FIM for " + huc + " and branch " + branch) - executor.submit( - produce_inundation_map_with_stage_and_feature_ids, - rem_path, - catchments_path, - hydroid_list, - hand_stage, - lid_directory, - category, - huc, - lid, - branch, - ) - except Exception: - messages.append([f'{lid}:inundation failed at {category}']) - - # -- MOSAIC -- # - # Merge all rasters in lid_directory that have the same magnitude/category. 
- path_list = [] - lid_dir_list = os.listdir(lid_directory) - print("Merging " + category) - for f in lid_dir_list: - if category in f: - path_list.append(os.path.join(lid_directory, f)) - path_list.sort() # To force branch 0 first in list, sort - - if len(path_list) > 0: - zero_branch_grid = path_list[0] - zero_branch_src = rasterio.open(zero_branch_grid) - zero_branch_array = zero_branch_src.read(1) - summed_array = zero_branch_array # Initialize it as the branch zero array - - # Loop through remaining items in list and sum them with summed_array - for remaining_raster in path_list[1:]: - remaining_raster_src = rasterio.open(remaining_raster) - remaining_raster_array_original = remaining_raster_src.read(1) - - # Reproject non-branch-zero grids so I can sum them with the branch zero grid - remaining_raster_array = np.empty(zero_branch_array.shape, dtype=np.int8) - reproject( - remaining_raster_array_original, - destination=remaining_raster_array, - src_transform=remaining_raster_src.transform, - src_crs=remaining_raster_src.crs, - src_nodata=remaining_raster_src.nodata, - dst_transform=zero_branch_src.transform, - dst_crs=zero_branch_src.crs, - dst_nodata=-1, - dst_resolution=zero_branch_src.res, - resampling=Resampling.nearest, - ) - # Sum rasters - summed_array = summed_array + remaining_raster_array - - del zero_branch_array # Clean up + viz_out_gdf.to_file(nws_lid_gpkg_file_path, driver='GPKG', index=True, engine='fiona') - # Define path to merged file, in same format as expected by post_process_cat_fim_for_viz function - output_tif = os.path.join(lid_directory, lid + '_' + category + '_extent.tif') - profile = zero_branch_src.profile - summed_array = summed_array.astype('uint8') - with rasterio.open(output_tif, 'w', **profile) as dst: - dst.write(summed_array, 1) - del summed_array + csv_file_path = nws_lid_gpkg_file_path.replace(".gpkg", ".csv") + viz_out_gdf.to_csv(csv_file_path) + else: + FLOG.lprint(f"nws_sites_layer ({nws_lid_gpkg_file_path}) : has no messages") - return messages, hand_stage, datum_adj_wse, datum_adj_wse_m + return nws_lid_gpkg_file_path if __name__ == '__main__': @@ -1013,13 +1218,19 @@ def produce_stage_based_catfim_tifs( help='Path to directory containing HAND outputs, e.g. /data/previous_fim/fim_4_0_9_2', required=True, ) + parser.add_argument( + '-e', + '--env_file', + help='Docker mount path to the catfim environment file. ie) data/config/catfim.env', + required=True, + ) parser.add_argument( '-jh', '--job_number_huc', - help='Number of processes to use for HUC scale operations.' + help='OPTIONAL: Number of processes to use for HUC scale operations.' ' HUC and inundation job numbers should multiply to no more than one less than the CPU count of the' ' machine. CatFIM sites generally only have 2-3 branches overlapping a site, so this number can be ' - 'kept low (2-4)', + 'kept low (2-4). Defaults to 1.', required=False, default=1, type=int, @@ -1027,17 +1238,30 @@ def produce_stage_based_catfim_tifs( parser.add_argument( '-jn', '--job_number_inundate', - help='Number of processes to use for inundating' + help='OPTIONAL: Number of processes to use for inundating' ' HUC and inundation job numbers should multiply to no more than one less than the CPU count' - ' of the machine.', + ' of the machine. 
Defaults to 1.', required=False, default=1, type=int, ) + + parser.add_argument( + '-ji', + '--job_number_intervals', + help='OPTIONAL: Number of processes to use for inundating multiple intervals in stage-based' + ' inundation and interval job numbers should multiply to no more than one less than the CPU count ' + 'of the machine. Defaults to 1.', + required=False, + default=1, + type=int, + ) + parser.add_argument( '-a', - '--stage_based', - help='Run stage-based CatFIM instead of flow-based?' ' NOTE: flow-based CatFIM is the default.', + '--is_stage_based', + help='Run stage-based CatFIM instead of flow-based? Add this -a param to make it stage based,' + ' leave it off for flow based', required=False, default=False, action='store_true', @@ -1045,44 +1269,64 @@ def produce_stage_based_catfim_tifs( parser.add_argument( '-t', '--output_folder', - help='Target: Where the output folder will be', + help='OPTIONAL: Target location, Where the output folder will be. Defaults to /data/catfim/', required=False, default='/data/catfim/', ) - parser.add_argument('-o', '--overwrite', help='Overwrite files', required=False, action="store_true") + parser.add_argument( + '-o', '--overwrite', help='OPTIONAL: Overwrite files', required=False, action="store_true" + ) parser.add_argument( '-s', '--search', - help='Upstream and downstream search in miles. How far up and downstream do you want to go?', + help='OPTIONAL: Upstream and downstream search in miles. How far up and downstream do you want to go? Defaults to 5.', required=False, default='5', ) + parser.add_argument( '-l', '--lid_to_run', - help='NWS LID, lowercase, to produce CatFIM for. Currently only accepts one. Default is all sites', + help='OPTIONAL: NWS LID, lowercase, to produce CatFIM for. Currently only accepts one. Defaults to all sites', required=False, default='all', ) - parser.add_argument( - '-ji', - '--job_number_intervals', - help='Number of processes to use for inundating multiple intervals in stage-based' - ' inundation and interval job numbers should multiply to no more than one less than the CPU count ' - 'of the machine.', - required=False, - default=1, - type=int, - ) + + # lst_hucs temp disabled. All hucs in fim outputs in a directory will used + # parser.add_argument( + # '-lh', + # '--lst_hucs', + # help='OPTIONAL: Space-delimited list of HUCs to produce CatFIM for. Defaults to all HUCs', + # required=False, + # default='all', + # ) + parser.add_argument( '-mc', '--past_major_interval_cap', - help='Stage-Based Only. How many feet past major do you want to go for the interval FIMs?' - ' of the machine.', + help='OPTIONAL: Stage-Based Only. How many feet past major do you want to go for the interval FIMs?' + ' of the machine. Defaults to 5.', required=False, default=5.0, type=float, ) + # NOTE: This params is for quick debugging only and should not be used in a production mode + parser.add_argument( + '-me', + '--nwm_metafile', + help='OPTIONAL: If you have a pre-existing nwm metadata pickle file, you can path to it here.' 
+ ' e.g.: /data/catfim/nwm_metafile.pkl', + required=False, + default="", + ) + args = vars(parser.parse_args()) - process_generate_categorical_fim(**args) + + try: + + # call main program + process_generate_categorical_fim(**args) + + except Exception: + FLOG.critical(traceback.format_exc()) diff --git a/tools/generate_categorical_fim_flows.py b/tools/generate_categorical_fim_flows.py index a89459e0..7acfb463 100755 --- a/tools/generate_categorical_fim_flows.py +++ b/tools/generate_categorical_fim_flows.py @@ -1,11 +1,17 @@ #!/usr/bin/env python3 +# import csv + import argparse +import copy import os +import pickle +import random import sys import time +import traceback from concurrent.futures import ProcessPoolExecutor, as_completed, wait -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path import geopandas as gpd @@ -21,164 +27,257 @@ get_thresholds, ) +import utils.fim_logger as fl from utils.shared_variables import VIZ_PROJECTION +# TODO: Aug 2024: This script was upgraded significantly with lots of misc TODO's embedded. +# Lots of inline documenation needs updating as well + + +# will become global once initiallized +FLOG = fl.FIM_logger() +MP_LOG = fl.FIM_logger() + gpd.options.io_engine = "pyogrio" -def get_env_paths(): - load_dotenv() +def get_env_paths(env_file): + + if os.path.exists(env_file) is False: + raise Exception(f"The environment file of {env_file} does not seem to exist") + + load_dotenv(env_file) # import variables from .env file API_BASE_URL = os.getenv("API_BASE_URL") WBD_LAYER = os.getenv("WBD_LAYER") return API_BASE_URL, WBD_LAYER -def process_generate_flows( - huc, huc_dictionary, threshold_url, all_lists, workspace, attributes_dir, huc_messages_dir, nwm_flows_df +# This one is for lid magnitudes only and is part of an MP pool +def generate_flows_for_huc( + huc, + huc_dictionary, + threshold_url, + all_meta_lists, + output_flows_dir, + attributes_dir, + huc_messages_dir, + nwm_flows_df, + parent_log_output_file, + child_log_file_prefix, ): - # Process each huc unit, first define message variable and flood categories. - all_messages = [] - flood_categories = ['action', 'minor', 'moderate', 'major', 'record'] - - print(f'Iterating through {huc}') - # Get list of nws_lids - nws_lids = huc_dictionary[huc] - # Loop through each lid in list to create flow file - for lid in nws_lids: - # Convert lid to lower case - lid = lid.lower() - # Get stages and flows for each threshold from the WRDS API. Priority given to USGS calculated flows. - print("getting thresholds") - stages, flows = get_thresholds( - threshold_url=threshold_url, select_by='nws_lid', selector=lid, threshold='all' - ) - if stages is None or flows is None: - print("Likely WRDS error") - continue - # Check if stages are supplied, if not write message and exit. - if all(stages.get(category, None) is None for category in flood_categories): - message = f'{lid}:missing threshold stages' - all_messages.append(message) - continue - # Check if calculated flows are supplied, if not write message and exit. - if all(flows.get(category, None) is None for category in flood_categories): - message = f'{lid}:missing calculated flows' - all_messages.append(message) - continue - # find lid metadata from master list of metadata dictionaries (line 66). - metadata = next((item for item in all_lists if item['identifiers']['nws_lid'] == lid.upper()), False) - - # Get mainstem segments of LID by intersecting LID segments with known mainstem segments. 
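Since the environment file is now passed in explicitly (the new `-e/--env_file` argument) rather than assumed to be a `.env` next to the script, a minimal sketch of the contract may help. It repeats the two keys read by `get_env_paths` and the example mount path from the argument's help text; the key values shown are placeholders.

```python
import os

from dotenv import load_dotenv

# Illustrative only: the env-file contract expected by get_env_paths().
# A file such as data/config/catfim.env (the example path from the -e help text)
# is expected to define at least:
#
#   API_BASE_URL=<WRDS API base URL>
#   WBD_LAYER=<path to the WBD HUC8 geopackage>
#
env_file = "data/config/catfim.env"  # hypothetical Docker mount path

if not os.path.exists(env_file):
    raise Exception(f"The environment file of {env_file} does not seem to exist")

load_dotenv(env_file)
API_BASE_URL = os.getenv("API_BASE_URL")
WBD_LAYER = os.getenv("WBD_LAYER")
print(API_BASE_URL, WBD_LAYER)
```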
- unfiltered_segments = list(set(get_nwm_segs(metadata))) - - desired_order = metadata['nwm_feature_data']['stream_order'] - # Filter segments to be of like stream order. - print("filtering segments") - start = time.time() - segments = filter_nwm_segments_by_stream_order(unfiltered_segments, desired_order, nwm_flows_df) - end = time.time() - elapsed_time = round(((end - start) / 60), 6) - print(f'Finished filtering segments in {elapsed_time} minutes') - # if no segments, write message and exit out - if not segments: - message = f'{lid}:missing nwm segments' - all_messages.append(message) - continue - # For each flood category - for category in flood_categories: - # G et the flow - flow = flows[category] - # If there is a valid flow value, write a flow file. - if flow: - # round flow to nearest hundredth - flow = round(flow, 2) - # Create the guts of the flow file. - flow_info = flow_data(segments, flow) - # Define destination path and create folders - output_file = ( - workspace / huc / lid / category / (f'ahps_{lid}_huc_{huc}_flows_{category}.csv') - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - # Write flow file to file - flow_info.to_csv(output_file, index=False) - else: - message = f'{lid}:{category} is missing calculated flow' + + try: + # Note: child_log_file_prefix is "MP_process_gen_flows", meaning all logs created by this function start + # with the phrase "MP_process_gen_flows". This will roll up to the master catfim log. + # This is setting up logging for this function to go up to the parent + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + + start_time = datetime.now(timezone.utc) + dt_string = start_time.strftime("%m/%d/%Y %H:%M:%S") + + # A bit of start staggering to help not overload the MP (20 sec) + time_delay = random.randrange(0, 20) + # MP_LOG.lprint(f" ... {huc} start time is {dt_string} and delay is {time_delay}") + MP_LOG.lprint(f" ... {huc} flow generation start time is {dt_string}") + + time.sleep(time_delay) + + # Process each huc unit, first define message variable and flood categories. + all_messages = [] + flood_categories = ['action', 'minor', 'moderate', 'major', 'record'] + + nws_lids = huc_dictionary[huc] + + if len(nws_lids) == 0: + MP_LOG.lprint(f"huc {huc} has no applicable nws_lids") + return + + # Loop through each lid in list to create flow file + for lid in nws_lids: + # Convert lid to lower case + lid = lid.lower() + + # TODO: Jun 17, 2024 - This gets recalled for every huc but only uses the nws_list. + # Move this somewhere outside the huc list so it doesn't need to be called over and over again + + # Careful, for "all_message.append" the syntax into it must be f'{lid}: (whever messages) + # this is gets parsed and logic used against it. + + MP_LOG.trace(f'Getting thresholds for {lid}') + stages, flows = get_thresholds( + threshold_url=threshold_url, select_by='nws_lid', selector=lid, threshold='all' + ) + + if len(stages) == 0 or len(flows) == 0: + message = f'{lid}:no stages or flows exist, likely WRDS error' all_messages.append(message) - # Get various attributes of the site. 
- lat = float(metadata['nws_preferred']['latitude']) - lon = float(metadata['nws_preferred']['longitude']) - wfo = metadata['nws_data']['wfo'] - rfc = metadata['nws_data']['rfc'] - state = metadata['nws_data']['state'] - county = metadata['nws_data']['county'] - name = metadata['nws_data']['name'] - flow_source = flows['source'] - stage_source = stages['source'] - wrds_timestamp = stages['wrds_timestamp'] - nrldb_timestamp = metadata['nrldb_timestamp'] - nwis_timestamp = metadata['nwis_timestamp'] - - # Create a csv with same information as shapefile but with each threshold as new record. - csv_df = pd.DataFrame() - for threshold in flood_categories: - line_df = pd.DataFrame( - { - 'nws_lid': [lid], - 'name': name, - 'WFO': wfo, - 'rfc': rfc, - 'huc': [huc], - 'state': state, - 'county': county, - 'magnitude': threshold, - 'q': flows[threshold], - 'q_uni': flows['units'], - 'q_src': flow_source, - 'stage': stages[threshold], - 'stage_uni': stages['units'], - 's_src': stage_source, - 'wrds_time': wrds_timestamp, - 'nrldb_time': nrldb_timestamp, - 'nwis_time': nwis_timestamp, - 'lat': [lat], - 'lon': [lon], - } + MP_LOG.warning(f"{huc} - {message}") + continue + + # Check if stages are supplied, if not write message and exit. + if all(stages.get(category, None) is None for category in flood_categories): + message = f'{lid}:missing threshold stages' + all_messages.append(message) + MP_LOG.warning(f"{huc} - {message}") + continue + + # Check if calculated flows are supplied, if not write message and exit. + if all(flows.get(category, None) is None for category in flood_categories): + message = f'{lid}:missing calculated flows' + all_messages.append(message) + MP_LOG.warning(f"{huc} - {message}") + continue + + # Find lid metadata from master list of metadata dictionaries (line 66). + metadata = next( + (item for item in all_meta_lists if item['identifiers']['nws_lid'] == lid.upper()), False ) - csv_df = pd.concat([csv_df, line_df]) - # Round flow and stage columns to 2 decimal places. - csv_df = csv_df.round({'q': 2, 'stage': 2}) - - # If a site folder exists (ie a flow file was written) save files containing site attributes. - output_dir = workspace / huc / lid - if output_dir.exists(): - # Export DataFrame to csv containing attributes - csv_df.to_csv(os.path.join(attributes_dir, f'{lid}_attributes.csv'), index=False) - message = f'{lid}:flows available' - all_messages.append(message) - else: - message = f'{lid}:missing all calculated flows' - all_messages.append(message) - # Write all_messages to huc-specific file. - print("Writing message file for huc") - huc_messages_txt_file = os.path.join(huc_messages_dir, str(huc) + '_messages.txt') - with open(huc_messages_txt_file, 'w') as f: - for item in all_messages: - f.write("%s\n" % item) + # Get mainstem segments of LID by intersecting LID segments with known mainstem segments. + unfiltered_segments = list(set(get_nwm_segs(metadata))) + desired_order = metadata['nwm_feature_data']['stream_order'] + # Filter segments to be of like stream order. 
+ segments = filter_nwm_segments_by_stream_order(unfiltered_segments, desired_order, nwm_flows_df) -def generate_catfim_flows( - workspace, + # If there are no segments, write message and exit out + if not segments or len(segments) == 0: + message = f'{lid}:missing nwm segments' + all_messages.append(message) + MP_LOG.warning(f"{huc} - {message}") + continue + + # For each flood category + for category in flood_categories: + # Get the flow + flow = flows[category] + + if flow is not None and flow != 0: + + # If there is a valid flow value, write a flow file. + # if flow: + # round flow to nearest hundredth + flow = round(flow, 2) + + # Create the guts of the flow file. + flow_info = flow_data(segments, flow) + + # Define destination path and create folders + csv_output_folder = os.path.join(output_flows_dir, huc, lid, category) + os.makedirs(csv_output_folder, exist_ok=True) + output_file = os.path.join( + csv_output_folder, f'ahps_{lid}_huc_{huc}_flows_{category}.csv' + ) + + # Write flow file to file + flow_info.to_csv(output_file, index=False) + + else: + message = f'{lid}:{category} is missing calculated flow' + all_messages.append(message) + MP_LOG.warning(f"{huc} - {message}") + + # Get various attributes of the site. + lat = float(metadata['nws_preferred']['latitude']) + lon = float(metadata['nws_preferred']['longitude']) + wfo = metadata['nws_data']['wfo'] + rfc = metadata['nws_data']['rfc'] + state = metadata['nws_data']['state'] + county = metadata['nws_data']['county'] + name = metadata['nws_data']['name'] + flow_source = flows['source'] + stage_source = stages['source'] + wrds_timestamp = stages['wrds_timestamp'] + nrldb_timestamp = metadata['nrldb_timestamp'] + nwis_timestamp = metadata['nwis_timestamp'] + + # Create a csv with same information as shapefile but with each threshold as new record. + csv_df = pd.DataFrame() + for threshold in flood_categories: + line_df = pd.DataFrame( + { + 'nws_lid': [lid], + 'name': name, + 'WFO': wfo, + 'rfc': rfc, + 'huc': [huc], + 'state': state, + 'county': county, + 'magnitude': threshold, + 'q': flows[threshold], + 'q_uni': flows['units'], + 'q_src': flow_source, + 'stage': stages[threshold], + 'stage_uni': stages['units'], + 's_src': stage_source, + 'wrds_time': wrds_timestamp, + 'nrldb_time': nrldb_timestamp, + 'nwis_time': nwis_timestamp, + 'lat': [lat], + 'lon': [lon], + } + ) + csv_df = pd.concat([csv_df, line_df]) + + # Round flow and stage columns to 2 decimal places. + csv_df = csv_df.round({'q': 2, 'stage': 2}) + + # If a site folder exists (ie a flow file was written) save files containing site attributes. + huc_lid_flow_dir = os.path.join(output_flows_dir, huc, lid) + + if os.path.exists(huc_lid_flow_dir): + # Export DataFrame to csv containing attributes + csv_df.to_csv(os.path.join(attributes_dir, f'{lid}_attributes.csv'), index=False) + message = f'{lid}:flows available' + all_messages.append(message) + else: + message = f'{lid}:missing all calculated flows' + all_messages.append(message) + MP_LOG.warning(f"Missing all calculated flows for {huc} - {lid}") + + # Write all_messages to huc-specific file. 
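For readers tracing where the per-category flow CSVs land, here is a small sketch of the output path convention used just above, with hypothetical run/HUC/LID/category values; only the path construction is taken from the diff.

```python
import os

# Hypothetical values; the real ones come from the HUC dictionary and WRDS thresholds.
output_flows_dir = "/data/catfim/example_run/flows"
huc, lid, category = "12090301", "abcd1", "minor"

csv_output_folder = os.path.join(output_flows_dir, huc, lid, category)
output_file = os.path.join(csv_output_folder, f"ahps_{lid}_huc_{huc}_flows_{category}.csv")

print(output_file)
# /data/catfim/example_run/flows/12090301/abcd1/minor/ahps_abcd1_huc_12090301_flows_minor.csv
```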
+ # MP_LOG.lprint(f'Writing message file for {huc}') + huc_messages_txt_file = os.path.join(huc_messages_dir, str(huc) + '_messages.txt') + with open(huc_messages_txt_file, 'w') as f: + for item in all_messages: + item = item.strip() + # f.write("%s\n" % item) + f.write(f"{item}\n") + # MP_LOG.lprint(f'--- generate_flow_for_huc done for {huc}') + + end_time = datetime.now(timezone.utc) + dt_string = end_time.strftime("%m/%d/%Y %H:%M:%S") + time_duration = end_time - start_time + MP_LOG.lprint(f" ... {huc} end time is {dt_string} : Duration: {str(time_duration).split('.')[0]}") + print("") + + except Exception as ex: + MP_LOG.error(f"An error occured while generating flows for huc {huc}") + MP_LOG.error(f"Details: {ex}") + MP_LOG.error(traceback.format_exc()) + + print("") + return + + +# This is called from within this script and is not MP, so it can use FLOG directly +# lid_to_run is temp disabled +def generate_flows( + output_catfim_dir, nwm_us_search, nwm_ds_search, - stage_based, - fim_dir, lid_to_run, - attributes_dir="", - job_number_huc=1, + env_file, + job_number_huc, + is_stage_based, + lst_hucs, + nwm_metafile, + log_output_file, ): + + # TODO; Most docstrings like this are now very outdated and need updating ''' This will create static flow files for all nws_lids and save to the workspace directory with the following format: @@ -192,8 +291,8 @@ def generate_catfim_flows( flow file to use for inundation mapping. Parameters ---------- - workspace : STR - Location where output flow files will exist. + output_catfim_dir : STR + root catfim dir for the particular run. ie) fim_4_3_3_4_stage_based nwm_us_search : STR Upstream distance (in miles) for walking up NWM network. nwm_ds_search : STR @@ -203,133 +302,152 @@ def generate_catfim_flows( Returns ------- - None. + nws_lid_gpkg_file_path. - Name and path of the nws_lid file ''' - all_start = datetime.now() - API_BASE_URL, WBD_LAYER = get_env_paths() - # Define workspace and wbd_path as a pathlib Path. Convert search distances to integer. 
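A small note on the duration strings that now appear in the logs: the code above formats an elapsed time by converting the `timedelta` to a string and dropping everything after the decimal point, which trims the microseconds. A standalone sketch:

```python
import time
from datetime import datetime, timezone

start_time = datetime.now(timezone.utc)
time.sleep(1.5)  # stand-in for real work
end_time = datetime.now(timezone.utc)

time_duration = end_time - start_time
# str(timedelta) looks like '0:00:01.500123'; splitting on '.' keeps '0:00:01'.
print(str(time_duration).split('.')[0])
```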
- workspace = Path(workspace) + FLOG.setup(log_output_file) # reusing the parent logs + + # FLOG.trace("args coming into generate flows") + # FLOG.trace(locals()) # see all args coming in to the function + + attributes_dir = os.path.join(output_catfim_dir, 'attributes') + mapping_dir = os.path.join(output_catfim_dir, "mapping") # create var but don't make folder yet + + all_start = datetime.now(timezone.utc) + API_BASE_URL, WBD_LAYER = get_env_paths(env_file) nwm_us_search = int(nwm_us_search) nwm_ds_search = int(nwm_ds_search) metadata_url = f'{API_BASE_URL}/metadata' threshold_url = f'{API_BASE_URL}/nws_threshold' ################################################################### - # Create workspace - workspace.mkdir(parents=True, exist_ok=True) - # Create HUC message directory to store messages that will be read and joined after multiprocessing - huc_messages_dir = os.path.join(workspace, 'huc_messages') - if not os.path.exists(huc_messages_dir): - os.mkdir(huc_messages_dir) + huc_messages_dir = os.path.join(mapping_dir, 'huc_messages') + os.makedirs(huc_messages_dir, exist_ok=True) + + FLOG.lprint("Loading nwm flow metadata") + start_dt = datetime.now(timezone.utc) - # Open NWM flows geopackage + # Open NWM flows geopackages nwm_flows_gpkg = r'/data/inputs/nwm_hydrofabric/nwm_flows.gpkg' nwm_flows_df = gpd.read_file(nwm_flows_gpkg) - print(f'Retrieving metadata for site(s): {lid_to_run}...') - start_dt = datetime.now() + # nwm_flows_alaska_gpkg = r'/data/inputs/nwm_hydrofabric/nwm_flows_alaska_nwmV3_ID.gpkg' # Uncomment to include Alaska + # nwm_flows_alaska_df = gpd.read_file(nwm_flows_alaska_gpkg) # Uncomment to include Alaska - # Get metadata for 'CONUS' - print(metadata_url) - if lid_to_run != 'all': - all_lists, conus_dataframe = get_metadata( - metadata_url, - select_by='nws_lid', - selector=[lid_to_run], - must_include='nws_data.rfc_forecast_point', - upstream_trace_distance=nwm_us_search, - downstream_trace_distance=nwm_ds_search, - ) - else: - # Get CONUS metadata - conus_list, conus_dataframe = get_metadata( - metadata_url, - select_by='nws_lid', - selector=['all'], - must_include='nws_data.rfc_forecast_point', - upstream_trace_distance=nwm_us_search, - downstream_trace_distance=nwm_ds_search, - ) - # Get metadata for Islands - islands_list, islands_dataframe = get_metadata( - metadata_url, - select_by='state', - selector=['HI', 'PR'], - must_include=None, - upstream_trace_distance=nwm_us_search, - downstream_trace_distance=nwm_ds_search, - ) - # Append the dataframes and lists - all_lists = conus_list + islands_list - print(len(all_lists)) + # nwm_metafile might be an empty string + # maybe ensure all projections are changed to one standard output of 3857 (see shared_variables) as the come out - end_dt = datetime.now() + # TODO: Aug 2024: + # Filter the meta list to just HUCs in the fim run output or huc if sent in as a param + all_meta_lists = __load_nwm_metadata( + output_catfim_dir, metadata_url, nwm_us_search, nwm_ds_search, lid_to_run, nwm_metafile + ) + + end_dt = datetime.now(timezone.utc) time_duration = end_dt - start_dt - print(f"Retrieving metadata Duration: {str(time_duration).split('.')[0]}") - print() + FLOG.lprint(f"Retrieving metadata - Duration: {str(time_duration).split('.')[0]}") - print('Determining HUC using WBD layer...') - start_dt = datetime.now() + print("") - # Assign HUCs to all sites using a spatial join of the FIM 3 HUC layer. + # Assign HUCs to all sites using a spatial join of the FIM 4 HUC layer. 
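The `nwm_metafile` argument passed into `__load_nwm_metadata` above is described elsewhere in this PR as a pre-existing metadata pickle intended for quick debugging. The implementation of `__load_nwm_metadata` is not shown in this section, so the following is only a guess at the general caching shape that option suggests, with a hypothetical fetch callback; it is not the actual function.

```python
import os
import pickle


def load_metadata_with_cache(nwm_metafile, fetch_from_api):
    # Illustrative guess only: reuse a pickled metadata list if one was supplied,
    # otherwise fall back to querying the API (e.g. a wrapper around get_metadata).
    if nwm_metafile != "" and os.path.exists(nwm_metafile):
        with open(nwm_metafile, "rb") as f:
            return pickle.load(f)
    return fetch_from_api()
```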
# Get a dictionary of hucs (key) and sites (values) as well as a GeoDataFrame # of all sites used later in script. - huc_dictionary, out_gdf = aggregate_wbd_hucs( - metadata_list=all_lists, wbd_huc8_path=WBD_LAYER, retain_attributes=True - ) + + FLOG.lprint("Start aggregate_wbd_hucs") + start_dt = datetime.now(timezone.utc) + + huc_dictionary, out_gdf = aggregate_wbd_hucs(all_meta_lists, WBD_LAYER, True, lst_hucs) + # Drop list fields if invalid out_gdf = out_gdf.drop(['downstream_nwm_features'], axis=1, errors='ignore') out_gdf = out_gdf.drop(['upstream_nwm_features'], axis=1, errors='ignore') out_gdf = out_gdf.astype({'metadata_sources': str}) - end_dt = datetime.now() + end_dt = datetime.now(timezone.utc) time_duration = end_dt - start_dt - print(f"Determining HUC using WBD layer Duration: {str(time_duration).split('.')[0]}") - print() + FLOG.lprint(f"End aggregate_wbd_hucs - Duration: {str(time_duration).split('.')[0]}") + + FLOG.lprint("Start Flow Generation") - if stage_based: - return huc_dictionary, out_gdf, metadata_url, threshold_url, all_lists, nwm_flows_df + # It this is stage-based, it returns all of these objects here, but if it continues + # (aka. Flow based), then it returns only nws_lid_layer (created later in this function) + if is_stage_based: # If it's stage-based, the function stops running here + return ( + huc_dictionary, + out_gdf, + metadata_url, + threshold_url, + all_meta_lists, + nwm_flows_df, + ) # No Alaska + # return (huc_dictionary, out_gdf, metadata_url, threshold_url, all_meta_lists, nwm_flows_df, nwm_flows_alaska_df) # Alaska - print("Generating flows for hucs using " + str(job_number_huc) + " jobs...") - start_dt = datetime.now() + # only flow based needs the "flow" dir + output_flows_dir = os.path.join(output_catfim_dir, "flows") + if not os.path.exists(output_flows_dir): + os.mkdir(output_flows_dir) + start_dt = datetime.now(timezone.utc) + + # pulls out the parent log file and replaces it with the child prefix + # catfim if coming from generate_categorical_fim.py + + child_log_file_prefix = FLOG.MP_calc_prefix_name(log_output_file, "MP_process_gen_flows") with ProcessPoolExecutor(max_workers=job_number_huc) as executor: for huc in huc_dictionary: + + nwm_flows_region_df = nwm_flows_df # To exclude Alaska + # nwm_flows_region_df = nwm_flows_alaska_df if huc[:2] == '19' else nwm_flows_df # To include Alaska + + # Deep copy that speed up Multi-Proc a little as all_meta_lists + # is a huge object. Need to figure out how to filter that down somehow + # later. 
Can not just filter by huc per loop, tried it and there are other factors + copy_all_meta_lists = copy.copy(all_meta_lists) executor.submit( - process_generate_flows, + generate_flows_for_huc, huc, huc_dictionary, threshold_url, - all_lists, - workspace, + copy_all_meta_lists, + output_flows_dir, attributes_dir, huc_messages_dir, - nwm_flows_df, + nwm_flows_region_df, + log_output_file, + child_log_file_prefix, ) + # end ProcessPoolExecutor - end_dt = datetime.now() + # rolls up logs from child MP processes into this parent_log_output_file + FLOG.merge_log_files(log_output_file, child_log_file_prefix, True) + + end_dt = datetime.now(timezone.utc) time_duration = end_dt - start_dt - print(f"Generating flows for hucs Duration: {str(time_duration).split('.')[0]}") + FLOG.lprint(f"End flow generation - Duration: {str(time_duration).split('.')[0]}") print() - print('Wrapping up flows generation...') + FLOG.lprint('Start merging and finalizing flows generation data') # Recursively find all *_attributes csv files and append - csv_files = os.listdir(attributes_dir) + csv_files = [x for x in os.listdir(attributes_dir) if x.endswith('_attributes.csv')] + + if len(csv_files) == 0: + MP_LOG.critical(f"No new flow files exist in the {attributes_dir} folder (errors in creating them?)") + sys.exit(1) + all_csv_df = pd.DataFrame() - for csv in csv_files: - full_csv_path = os.path.join(attributes_dir, csv) + for csv_file in csv_files: + full_csv_path = os.path.join(attributes_dir, csv_file) # Huc has to be read in as string to preserve leading zeros. temp_df = pd.read_csv(full_csv_path, dtype={'huc': str}) all_csv_df = pd.concat([all_csv_df, temp_df], ignore_index=True) # Write to file - all_csv_df.to_csv(os.path.join(workspace, 'nws_lid_attributes.csv'), index=False) + all_csv_df.to_csv(os.path.join(attributes_dir, 'nws_lid_attributes.csv'), index=False) # This section populates a shapefile of all potential sites and details # whether it was mapped or not (mapped field) and if not, why (status field). - # Preprocess the out_gdf GeoDataFrame. Reproject and reformat fields. + viz_out_gdf = out_gdf.to_crs(VIZ_PROJECTION) viz_out_gdf.rename( columns={ @@ -354,7 +472,8 @@ def generate_catfim_flows( viz_out_gdf = viz_out_gdf.merge(lids_df, how='left', on='nws_lid') viz_out_gdf['mapped'] = viz_out_gdf['mapped'].fillna('no') - # Read all messages for all HUCs TODO + # Read all messages for all HUCs + # this is basically identical to a stage based set. Seach for huc_message_list and see my notes huc_message_list = [] huc_messages_dir_list = os.listdir(huc_messages_dir) for huc_message_file in huc_messages_dir_list: @@ -363,63 +482,198 @@ def generate_catfim_flows( if full_path_file.endswith('.txt'): lines = f.readlines() for line in lines: + line = line.strip() huc_message_list.append(line) # Write messages to DataFrame, split into columns, aggregate messages. - messages_df = pd.DataFrame(huc_message_list, columns=['message']) - messages_df = ( - messages_df['message'].str.split(':', n=1, expand=True).rename(columns={0: 'nws_lid', 1: 'status'}) - ) - status_df = messages_df.groupby(['nws_lid'])['status'].apply(', '.join).reset_index() + if len(huc_message_list) > 0: + + messages_df = pd.DataFrame(huc_message_list, columns=['message']) + messages_df = ( + messages_df['message'] + .str.split(':', n=1, expand=True) + .rename(columns={0: 'nws_lid', 1: 'status'}) + ) + + # There could be duplicate message for one ahps (ie. 
missing nwm segments), so drop dups + messages_df.drop_duplicates(subset=["nws_lid", "status"], keep="first", inplace=True) + + # We want one viz_out_gdf record per ahps and if there are more than one, contact the messages - # Join messages to populate status field to candidate sites. Assign - # status for null fields. - viz_out_gdf = viz_out_gdf.merge(status_df, how='left', on='nws_lid') - viz_out_gdf['status'] = viz_out_gdf['status'].fillna('all calculated flows available') + # status_df = messages_df.groupby(['nws_lid'])['status'].apply(', '.join).reset_index() + # df1 = df.groupby(['ID1','ID2'])['Status'].agg(lambda x: ','.join(x.dropna())).reset_index() + status_df = messages_df.groupby(['nws_lid'])['status'].agg(lambda x: ',\n'.join(x)).reset_index() + + # some messages status values start with a space as the first character. Remove it + # status_df["status"] = status_df["status"].apply(lambda x: x.strip()) + + # Join messages to populate status field to candidate sites. Assign + # status for null fields. + viz_out_gdf = viz_out_gdf.merge(status_df, how='left', on='nws_lid') + + viz_out_gdf['status'] = viz_out_gdf['status'].fillna('all calculated flows available') # Filter out columns and write out to file # viz_out_gdf = viz_out_gdf.filter( # ['nws_lid', 'usgs_gage', 'nwm_seg', 'HUC8', 'mapped', 'status', 'geometry'] # ) - nws_lid_layer = os.path.join(workspace, 'nws_lid_sites.gpkg').replace('flows', 'mapping') - viz_out_gdf.to_file(nws_lid_layer, driver='GPKG') + # stage based doesn't get here + # crs is 3857 - web mercator at this point + nws_lid_csv_file_path = os.path.join(mapping_dir, 'flow_based_catfim_sites.csv') + viz_out_gdf.to_csv(nws_lid_csv_file_path) + + nws_lid_gpkg_file_path = os.path.join(mapping_dir, 'flow_based_catfim_sites.gpkg') + viz_out_gdf.to_file(nws_lid_gpkg_file_path, driver='GPKG', index=False, engine='fiona') # time operation - all_end = datetime.now() + all_end = datetime.now(timezone.utc) all_time_duration = all_end - all_start - print(f"Duration: {str(all_time_duration).split('.')[0]}") + FLOG.lprint(f"End Wrapping up flows generation Duration: {str(all_time_duration).split('.')[0]}") print() - return nws_lid_layer + return nws_lid_gpkg_file_path + + +# local script calls __load_nwm_metadata so FLOG is already setup +def __load_nwm_metadata( + output_catfim_dir, metadata_url, nwm_us_search, nwm_ds_search, lid_to_run, nwm_metafile +): + FLOG.trace(metadata_url) + + all_meta_lists = [] + # Check to see if meta file already exists + # This feature means we can copy the pickle file to another enviro (AWS?) as it won't need to call + # WRDS unless we need a smaller or modified version. This one likely has all nws_lid data. 
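The status roll-up above (split each message on the first colon, drop duplicate LID/status pairs, then join the remaining statuses per LID) can be exercised on its own; the LIDs and messages below are made up:

```python
import pandas as pd

huc_message_list = [
    "abcd1: missing nwm segments",
    "abcd1: missing nwm segments",        # duplicate, dropped
    "abcd1: datum adjustment unavailable",
    "wxyz2: missing threshold data",
]

messages_df = pd.DataFrame(huc_message_list, columns=['message'])
messages_df = (
    messages_df['message'].str.split(':', n=1, expand=True).rename(columns={0: 'nws_lid', 1: 'status'})
)
messages_df['status'] = messages_df['status'].str.strip()  # trim the leading space after the colon
messages_df.drop_duplicates(subset=["nws_lid", "status"], keep="first", inplace=True)

# One record per LID, multiple statuses joined with ",\n"
status_df = messages_df.groupby(['nws_lid'])['status'].agg(lambda x: ',\n'.join(x)).reset_index()
print(status_df)
```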
+ + if os.path.isfile(nwm_metafile) is True: + FLOG.lprint(f"Meta file already downloaded and exists at {nwm_metafile}") + + with open(nwm_metafile, "rb") as p_handle: + all_meta_lists = pickle.load(p_handle) + + else: + meta_file = os.path.join(output_catfim_dir, "nwm_metafile.pkl") + + FLOG.lprint(f"Meta file will be downloaded and saved at {meta_file}") + + # lid_to_run coudl be a single lid or the word "all" + + if lid_to_run != "all": + all_meta_lists, ___ = get_metadata( + metadata_url, + select_by='nws_lid', + selector=[lid_to_run], + must_include='nws_data.rfc_forecast_point', + upstream_trace_distance=nwm_us_search, + downstream_trace_distance=nwm_ds_search, + ) + else: + conus_list, ___ = get_metadata( + metadata_url, + select_by='nws_lid', + selector=['all'], + must_include='nws_data.rfc_forecast_point', + upstream_trace_distance=nwm_us_search, + downstream_trace_distance=nwm_ds_search, + ) + # Get metadata for Islands and Alaska + islands_list, ___ = get_metadata( + metadata_url, + select_by='state', + selector=['HI', 'PR', 'AK'], + must_include=None, + upstream_trace_distance=nwm_us_search, + downstream_trace_distance=nwm_ds_search, + ) + # Append the lists + all_meta_lists = conus_list + islands_list + + with open(meta_file, "wb") as p_handle: + pickle.dump(all_meta_lists, p_handle, protocol=pickle.HIGHEST_PROTOCOL) + + return all_meta_lists if __name__ == '__main__': # Parse arguments parser = argparse.ArgumentParser(description='Create forecast files for all nws_lid sites') - parser.add_argument('-w', '--workspace', help='Workspace where all data will be stored.', required=True) parser.add_argument( - '-u', '--nwm_us_search', help='Walk upstream on NWM network this many miles', required=True + '-w', '--output_catfim_dir', help='Workspace where all data will be stored.', required=True ) + parser.add_argument( - '-d', '--nwm_ds_search', help='Walk downstream on NWM network this many miles', required=True + '-log', + '--log_output_file', + help='REQUIRED: Path to where the output log file will be.' + r'ie) /data/catfim/rob_test/logs/catfim_2024_07_07-22_26_18.log', + required=True, + type=str, ) + + parser.add_argument( + '-e', + '--env_file', + help='Docker mount path to the catfim environment file. ie) data/config/catfim.env', + required=True, + ) + + parser.add_argument( + '-hucs', + '--lst_hucs', + help='list of hucs that you want to process. ie) -hucs 12090301 01100006 12040101', + required=True, + type=str, + nargs='+', + ) + + parser.add_argument( + '-u', + '--nwm_us_search', + help='Walk upstream on NWM network this many miles', + required=False, + default=5, + ) + + parser.add_argument( + '-d', + '--nwm_ds_search', + help='Walk downstream on NWM network this many miles', + required=False, + default=5, + ) + + parser.add_argument( + '-jh', + '--job_number_huc', + help='OPTIONAL: Number of processes to use for HUC scale operations.' + ' HUC and inundation job numbers should multiply to no more than one less than the CPU count of the' + ' machine. CatFIM sites generally only have 2-3 branches overlapping a site, so this number can be ' + 'kept low (2-4). Defaults to 1.', + required=False, + default=1, + type=int, + ) + parser.add_argument( '-a', - '--stage_based', - help='Run stage-based CatFIM instead of flow-based? NOTE: flow-based CatFIM is the default.', + '--is_stage_based', + help='Is this a stage based or flow based run? 
Add the -a to mean is_stage_based is True ', required=False, default=False, action='store_true', ) + parser.add_argument( - '-f', - '--fim-dir', - help='Path to FIM outputs directory. Only use this option if you are running in alt-catfim mode.', + '-n', + '--nwm_metafile', + help='OPTIONAL: Path to the pre-made pickle file that already holds the nwm metadata', required=False, + type=str, default="", ) + args = vars(parser.parse_args()) # Run get_env_paths and static_flow_lids - generate_catfim_flows(**args) + generate_flows(**args) diff --git a/tools/generate_categorical_fim_mapping.py b/tools/generate_categorical_fim_mapping.py index 9ff6cab1..a09a5c65 100755 --- a/tools/generate_categorical_fim_mapping.py +++ b/tools/generate_categorical_fim_mapping.py @@ -1,68 +1,385 @@ #!/usr/bin/env python3 import argparse + +# import glob import os + +# import shutil import sys +import time import traceback -from concurrent.futures import ProcessPoolExecutor, as_completed, wait +from concurrent.futures import ProcessPoolExecutor import geopandas as gpd +import numpy as np import pandas as pd import rasterio from inundate_gms import Inundate_gms from mosaic_inundation import Mosaic_inundation from rasterio.features import shapes +from rasterio.warp import Resampling, calculate_default_transform, reproject from shapely.geometry.multipolygon import MultiPolygon from shapely.geometry.polygon import Polygon +from tqdm import tqdm +import utils.fim_logger as fl from utils.shared_functions import getDriver -from utils.shared_variables import PREP_PROJECTION, VIZ_PROJECTION +from utils.shared_variables import ALASKA_CRS, PREP_PROJECTION, VIZ_PROJECTION + +# TODO: Aug 2024: This script was upgraded significantly with lots of misc TODO's embedded. +# Lots of inline documenation needs updating as well + + +# will become global once initiallized +FLOG = fl.FIM_logger() +MP_LOG = fl.FIM_logger() gpd.options.io_engine = "pyogrio" -def generate_categorical_fim( - fim_run_dir, source_flow_dir, output_catfim_dir, job_number_huc, job_number_inundate, depthtif, log_file +# Technically, this is once called as a non MP, but also called in an MP pool +# we will use an MP object either way +def produce_stage_based_catfim_tifs( + stage, + datum_adj_ft, + branch_dir, + lid_usgs_elev, + lid_altitude, + fim_dir, + segments, + lid, + huc, + lid_directory, + category, + number_of_jobs, + parent_log_output_file, + child_log_file_prefix, +): + + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + + messages = [] + + MP_LOG.lprint("-----------------") + huc_lid_cat_id = f"{huc} : {lid} : {category}" + MP_LOG.lprint(f"{huc_lid_cat_id}: Starting to create tifs") + + # Determine datum-offset water surface elevation (from above). + datum_adj_wse = stage + datum_adj_ft + lid_altitude + datum_adj_wse_m = datum_adj_wse * 0.3048 # Convert ft to m + + # Subtract HAND gage elevation from HAND WSE to get HAND stage. + hand_stage = datum_adj_wse_m - lid_usgs_elev + + # TODO: see what happens if this is returned with tj + + # If no segments, write message and exit out + if not segments or len(segments) == 0: + msg = ': missing nwm segments' + messages.append(lid + msg) + MP_LOG.warning(huc_lid_cat_id + msg) + return messages, hand_stage, datum_adj_wse, datum_adj_wse_m + + # Produce extent tif hand_stage. Multiprocess across branches. 
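The datum arithmetic above is small enough to sanity-check in isolation; the numbers in this sketch are purely illustrative:

```python
def compute_hand_stage(stage_ft, datum_adj_ft, lid_altitude_ft, lid_usgs_elev_m):
    # Datum-offset water surface elevation in feet, then convert to meters.
    datum_adj_wse_ft = stage_ft + datum_adj_ft + lid_altitude_ft
    datum_adj_wse_m = datum_adj_wse_ft * 0.3048
    # HAND stage = WSE minus the HAND-derived gage elevation (both in meters).
    return datum_adj_wse_m - lid_usgs_elev_m

# Illustrative values only
print(compute_hand_stage(stage_ft=12.0, datum_adj_ft=0.5, lid_altitude_ft=803.2, lid_usgs_elev_m=246.1))
```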
+ # branches = os.listdir(branch_dir) + MP_LOG.lprint(f"{huc_lid_cat_id} branch_dir is {branch_dir}") + + branches = [x for x in os.listdir(branch_dir) if os.path.isdir(os.path.join(branch_dir, x))] + branches.sort() + + # We need to merge what we have up to this point. + # MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix) + # child_log_file_prefix = MP_LOG.MP_calc_prefix_name(parent_log_output_file, "MP_prod_huc_mag_stage", huc) + child_log_file_prefix = MP_LOG.MP_calc_prefix_name(parent_log_output_file, "MP_prod_huc_mag_stage") + with ProcessPoolExecutor(max_workers=number_of_jobs) as executor: + for branch in branches: + msg_id_w_branch = f"{huc} - {branch} - {lid} - {category}" + MP_LOG.trace(f"{msg_id_w_branch} : inundating branches") + # Define paths to necessary files to produce inundation grids. + full_branch_path = os.path.join(branch_dir, branch) + rem_path = os.path.join(fim_dir, huc, full_branch_path, 'rem_zeroed_masked_' + branch + '.tif') + catchments_path = os.path.join( + fim_dir, + huc, + full_branch_path, + 'gw_catchments_reaches_filtered_addedAttributes_' + branch + '.tif', + ) + hydrotable_path = os.path.join(fim_dir, huc, full_branch_path, 'hydroTable_' + branch + '.csv') + + if not os.path.exists(rem_path): + msg = ": rem doesn't exist" + messages.append(lid + msg) + MP_LOG.warning(msg_id_w_branch + msg) + continue + if not os.path.exists(catchments_path): + msg = ": catchments files don't exist" + messages.append(lid + msg) + MP_LOG.warning(msg_id_w_branch + msg) + continue + if not os.path.exists(hydrotable_path): + msg = ": hydrotable doesn't exist" + messages.append(lid + msg) + MP_LOG.warning(msg_id_w_branch + msg) + continue + + # Use hydroTable to determine hydroid_list from site_ms_segments. + hydrotable_df = pd.read_csv( + hydrotable_path, low_memory=False, dtype={'HUC': str, 'LakeID': float, 'subdiv_applied': int} + ) + hydroid_list = [] + + # Determine hydroids at which to perform inundation + for feature_id in segments: + # print(f"... feature id is {feature_id}") + try: + subset_hydrotable_df = hydrotable_df[hydrotable_df['feature_id'] == int(feature_id)] + hydroid_list += list(subset_hydrotable_df.HydroID.unique()) + except IndexError: + MP_LOG.trace( + f"Index Error for {huc} -- {branch} -- {category}. FeatureId is {feature_id} : Continuing on." + ) + pass + + # Create inundation maps with branch and stage data + try: + # print("Generating stage-based FIM for " + huc + " and branch " + branch) + # + # MP_LOG.lprint(f"{huc_lid_cat_id} : Generating stage-based FIM") + + executor.submit( + produce_tif_per_huc_per_mag_for_stage, + rem_path, + catchments_path, + hydroid_list, + hand_stage, + lid_directory, + category, + huc, + lid, + branch, + parent_log_output_file, + child_log_file_prefix, + ) + + except Exception: + msg = f': inundation failed at {category}' + messages.append(lid + msg) + MP_LOG.warning(msg_id_w_branch + msg) + MP_LOG.error(traceback.format_exc()) + + MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix, True) + + # -- MOSAIC -- # + # Merge all rasters in lid_directory that have the same magnitude/category. 
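The hydroTable lookup above reduces to a short, standalone helper (column names and dtypes copied from the hunk; the function name is illustrative):

```python
import pandas as pd

def hydroids_for_segments(hydrotable_path, segments):
    # Map the site's NWM feature_ids to the branch HydroIDs that will be inundated.
    hydrotable_df = pd.read_csv(
        hydrotable_path, low_memory=False, dtype={'HUC': str, 'LakeID': float, 'subdiv_applied': int}
    )
    hydroid_list = []
    for feature_id in segments:
        subset = hydrotable_df[hydrotable_df['feature_id'] == int(feature_id)]
        hydroid_list += list(subset.HydroID.unique())
    return hydroid_list
```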
+ path_list = [] + + MP_LOG.trace(f"Merging files from {lid_directory}") + lid_dir_list = os.listdir(lid_directory) + + MP_LOG.lprint(f"{huc}: Merging {category}") + # MP_LOG.trace("lid_dir_list is ") + # MP_LOG.trace(lid_dir_list) + # MP_LOG.lprint("") + + for f in lid_dir_list: + if category in f: + path_list.append(os.path.join(lid_directory, f)) + + # MP_LOG.error("???") + # MP_LOG.trace(f"path_list is (pre sort) is {path_list}") + # path_list.sort() # To force branch 0 first in list, sort it isn't branchs and we don't care the order for mosaiking + # MP_LOG.trace(f"path_list is (post sort) is {path_list}") + + path_list.sort() # To force branch 0 first in list, sort + + # MP_LOG.trace(f"len of path_list is {len(path_list)}") + + if len(path_list) > 0: + zero_branch_grid = path_list[0] + zero_branch_src = rasterio.open(zero_branch_grid) + zero_branch_array = zero_branch_src.read(1) + summed_array = zero_branch_array # Initialize it as the branch zero array + + # Loop through remaining items in list and sum them with summed_array + for remaining_raster in path_list[1:]: + remaining_raster_src = rasterio.open(remaining_raster) + remaining_raster_array_original = remaining_raster_src.read(1) + + # Reproject non-branch-zero grids so I can sum them with the branch zero grid + remaining_raster_array = np.empty(zero_branch_array.shape, dtype=np.int8) + reproject( + remaining_raster_array_original, + destination=remaining_raster_array, + src_transform=remaining_raster_src.transform, + src_crs=remaining_raster_src.crs, # TODO: Accomodate AK projection? + src_nodata=remaining_raster_src.nodata, + dst_transform=zero_branch_src.transform, + dst_crs=zero_branch_src.crs, # TODO: Accomodate AK projection? + dst_nodata=-1, + dst_resolution=zero_branch_src.res, + resampling=Resampling.nearest, + ) + # Sum rasters + summed_array = summed_array + remaining_raster_array + + del zero_branch_array # Clean up + + # Define path to merged file, in same format as expected by post_process_cat_fim_for_viz function + output_tif = os.path.join(lid_directory, lid + '_' + category + '_extent.tif') + profile = zero_branch_src.profile + summed_array = summed_array.astype('uint8') + with rasterio.open(output_tif, 'w', **profile) as dst: + dst.write(summed_array, 1) + MP_LOG.lprint(f"output_tif is {output_tif}") + del summed_array + + return messages, hand_stage, datum_adj_wse, datum_adj_wse_m + + +# This is part of an MP call and needs MP_LOG +# This does not actually inundate, it just uses the stage and the catchment to create a tif +def produce_tif_per_huc_per_mag_for_stage( + rem_path, + catchments_path, + hydroid_list, + hand_stage, + lid_directory, + category, + huc, + lid, + branch, + parent_log_output_file, + child_log_file_prefix, +): + """ + # Open rem_path and catchment_path using rasterio. + """ + + try: + # This is setting up logging for this function to go up to the parent + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + + # MP_LOG.lprint("+++++++++++++++++++++++") + # MP_LOG.lprint(f"At the start of producing a tif for {huc}") + # MP_LOG.trace(locals()) + # MP_LOG.trace("+++++++++++++++++++++++") + + rem_src = rasterio.open(rem_path) + catchments_src = rasterio.open(catchments_path) + rem_array = rem_src.read(1) + catchments_array = catchments_src.read(1) + + # TEMP: look at a catchment and rem from the same branch. + # Then look at a stage based 4.4.0.0 for this huc and see if we can figure out the + # intended results. 
Are we trying for a image that makes all values below the hand stage + # value to be a value (kinda like a 1 and 0 ?) + + # Use numpy.where operation to reclassify rem_path on the condition that the pixel values + # are <= to hand_stage and the catchments value is in the hydroid_list. + reclass_rem_array = np.where((rem_array <= hand_stage) & (rem_array != rem_src.nodata), 1, 0).astype( + 'uint8' + ) + hydroid_mask = np.isin(catchments_array, hydroid_list) + target_catchments_array = np.where( + ((hydroid_mask == True) & (catchments_array != catchments_src.nodata)), 1, 0 + ).astype('uint8') + masked_reclass_rem_array = np.where( + ((reclass_rem_array == 1) & (target_catchments_array == 1)), 1, 0 + ).astype('uint8') + + # Save resulting array to new tif with appropriate name. ie) brdc1_record_extent_18060005.tif + # to our mapping/huc/lid site + is_all_zero = np.all((masked_reclass_rem_array == 0)) + + # MP_LOG.lprint(f"{huc}: masked_reclass_rem_array, is_all_zero is {is_all_zero} for {rem_path}") + + # if not is_all_zero: + # if is_all_zero is False: # this logic didn't let ANY files get saved + # 'is False' means that the object does not exist and not that it really equals the value of False + if is_all_zero == False: # corrected logic + output_tif = os.path.join( + lid_directory, lid + '_' + category + '_extent_' + huc + '_' + branch + '.tif' + ) + # MP_LOG.lprint(f" +++ Output_Tif is {output_tif}") + with rasterio.Env(): + profile = rem_src.profile + profile.update(dtype=rasterio.uint8) + profile.update(nodata=10) + + with rasterio.open(output_tif, 'w', **profile) as dst: + dst.write(masked_reclass_rem_array, 1) + + except Exception: + MP_LOG.error(f"{huc} : {lid} Error producing inundation maps with stage") + MP_LOG.error(traceback.format_exc()) + + return + + +# This is not part of an MP process, but needs to have FLOG carried over so this file can see it +def run_catfim_inundation( + fim_run_dir, output_flows_dir, output_mapping_dir, job_number_huc, job_number_inundate, log_output_file ): - source_flow_dir_list = os.listdir(source_flow_dir) - output_flow_dir_list = os.listdir(fim_run_dir) + # Adding a pointer in this file coming from generate_categorial_fim so they can share the same log file + FLOG.setup(log_output_file) + print() + FLOG.lprint(">>> Start Inundating and Mosaicking") + + source_flow_huc_dir_list = [ + x + for x in os.listdir(output_flows_dir) + if os.path.isdir(os.path.join(output_flows_dir, x)) and x[0] in ['0', '1', '2'] + ] + fim_source_huc_dir_list = [ + x + for x in os.listdir(fim_run_dir) + if os.path.isdir(os.path.join(fim_run_dir, x)) and x[0] in ['0', '1', '2'] + ] # Log missing hucs - missing_hucs = list(set(source_flow_dir_list) - set(output_flow_dir_list)) + missing_hucs = list(set(source_flow_huc_dir_list) - set(fim_source_huc_dir_list)) missing_hucs = [huc for huc in missing_hucs if "." not in huc] - if len(missing_hucs) > 0: - f = open(log_file, 'a+') - f.write(f"Missing hucs from output directory: {', '.join(missing_hucs)}\n") - f.close() - # Loop through matching huc directories in the source_flow directory - matching_hucs = list(set(output_flow_dir_list) & set(source_flow_dir_list)) + matching_hucs = list(set(fim_source_huc_dir_list) & set(source_flow_huc_dir_list)) + matching_hucs.sort() + child_log_file_prefix = FLOG.MP_calc_prefix_name(log_output_file, "MP_run_ind") with ProcessPoolExecutor(max_workers=job_number_huc) as executor: for huc in matching_hucs: if "." 
in huc: continue # Get list of AHPS site directories - ahps_site_dir = os.path.join(source_flow_dir, huc) - ahps_site_dir_list = os.listdir(ahps_site_dir) + huc_flows_dir = os.path.join(output_flows_dir, huc) + + # ahps_site_dir_list = os.listdir(ahps_site_dir) + ahps_site_dir_list = [ + x for x in os.listdir(huc_flows_dir) if os.path.isdir(os.path.join(huc_flows_dir, x)) + ] - # Map path to huc directory inside out output_catfim_dir - cat_fim_huc_dir = os.path.join(output_catfim_dir, huc) - if not os.path.exists(cat_fim_huc_dir): - os.mkdir(cat_fim_huc_dir) + # Map path to huc directory inside the mapping directory + huc_mapping_dir = os.path.join(output_mapping_dir, huc) + if not os.path.exists(huc_mapping_dir): + os.mkdir(huc_mapping_dir) # Loop through AHPS sites for ahps_site in ahps_site_dir_list: # map parent directory for AHPS source data dir and list AHPS thresholds (act, min, mod, maj) - ahps_site_parent = os.path.join(ahps_site_dir, ahps_site) - thresholds_dir_list = os.listdir(ahps_site_parent) + ahps_site_parent = os.path.join(huc_flows_dir, ahps_site) - # Map parent directory for all inundation output filesoutput files. - cat_fim_huc_ahps_dir = os.path.join(cat_fim_huc_dir, ahps_site) - if not os.path.exists(cat_fim_huc_ahps_dir): - os.mkdir(cat_fim_huc_ahps_dir) + # thresholds_dir_list = os.listdir(ahps_site_parent) + thresholds_dir_list = [ + x + for x in os.listdir(ahps_site_parent) + if os.path.isdir(os.path.join(ahps_site_parent, x)) + ] + + # Map parent directory for all inundation output files output files. + huc_site_mapping_dir = os.path.join(huc_mapping_dir, ahps_site) + if not os.path.exists(huc_site_mapping_dir): + os.mkdir(huc_site_mapping_dir) # Loop through thresholds/magnitudes and define inundation output files paths for magnitude in thresholds_dir_list: @@ -73,57 +390,91 @@ def generate_categorical_fim( magnitude, 'ahps_' + ahps_site + '_huc_' + huc + '_flows_' + magnitude + '.csv', ) + # print(f"magnitude_flows_csv is {magnitude_flows_csv}") if os.path.exists(magnitude_flows_csv): - output_extent_grid = os.path.join( - cat_fim_huc_ahps_dir, ahps_site + '_' + magnitude + '_extent.tif' - ) + tif_name = ahps_site + '_' + magnitude + '_extent.tif' + output_extent_tif = os.path.join(huc_site_mapping_dir, tif_name) + FLOG.trace(f"Begin inundation for {tif_name}") try: executor.submit( run_inundation, magnitude_flows_csv, huc, - output_extent_grid, + huc_site_mapping_dir, + output_extent_tif, ahps_site, magnitude, - log_file, fim_run_dir, job_number_inundate, + log_output_file, + child_log_file_prefix, ) + except Exception: - traceback.print_exc() + FLOG.critical( + "An critical error occured while attempting inundation" + f" for {huc} -- {ahps_site} -- {magnitude}" + ) + FLOG.critical(traceback.format_exc()) + FLOG.merge_log_files(log_output_file, child_log_file_prefix) sys.exit(1) + # end of ProcessPoolExecutor + + # rolls up logs from child MP processes into this parent_log_output_file + + # hold on merging it up for now, to keep the overall log size down a little + FLOG.merge_log_files(log_output_file, child_log_file_prefix, True) + + print() + FLOG.lprint(">>> End Inundating and Mosaicking") + return + + +# This is part of an MP Pool def run_inundation( magnitude_flows_csv, huc, - output_extent_grid, + output_huc_site_mapping_dir, + output_extent_tif, ahps_site, magnitude, - log_file, fim_run_dir, job_number_inundate, + parent_log_output_file, + child_log_file_prefix, ): + # Note: child_log_file_prefix is "MP_run_ind", meaning all logs created by this 
function start + # with the phrase "MP_run_ind" + # They will be rolled up into the parent_log_output_file + # This is setting up logging for this function to go up to the parent\ + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + # MP_LOG.trace(locals()) + huc_dir = os.path.join(fim_run_dir, huc) + # Why all high number for job_number_inundate? Inundate_gms has to create inundation for each + # branch and merge them. try: - print("Running Inundate_gms for " + huc) + MP_LOG.lprint(f"... Running Inundate_gms and mosiacking for {huc} : {ahps_site} : {magnitude}") map_file = Inundate_gms( hydrofabric_dir=fim_run_dir, forecast=magnitude_flows_csv, num_workers=job_number_inundate, hucs=huc, - inundation_raster=output_extent_grid, + inundation_raster=output_extent_tif, inundation_polygon=None, depths_raster=None, verbose=False, log_file=None, output_fileNames=None, ) - print("Mosaicking for " + huc) + + MP_LOG.trace(f"Mosaicking for {huc} : {ahps_site} : {magnitude}") Mosaic_inundation( map_file, mosaic_attribute='inundation_rasters', - mosaic_output=output_extent_grid, + mosaic_output=output_extent_tif, mask=os.path.join(huc_dir, 'wbd.gpkg'), unit_attribute_name='huc8', nodata=-9999, @@ -132,59 +483,152 @@ def run_inundation( subset=None, verbose=False, ) - + MP_LOG.trace(f"Mosaicking complete for {huc} : {ahps_site} : {magnitude}") except Exception: # Log errors and their tracebacks - f = open(log_file, 'a+') - f.write(f"{output_extent_grid} - inundation error: {traceback.format_exc()}\n") - f.close() + MP_LOG.error(f"Exception: running inundation for {huc}") + MP_LOG.error(traceback.format_exc()) + return - # Inundation.py appends the huc code to the supplied output_extent_grid. + # Inundation.py appends the huc code to the supplied output_extent_grid for stage-based. # Modify output_extent_grid to match inundation.py saved filename. # Search for this file, if it didn't create, send message to log file. - base_file_path, extension = os.path.splitext(output_extent_grid) - saved_extent_grid_filename = "{}_{}{}".format(base_file_path, huc, extension) - if not os.path.exists(saved_extent_grid_filename): - with open(log_file, 'a+') as f: - f.write('FAILURE_huc_{}:{}:{} map failed to create\n'.format(huc, ahps_site, magnitude)) + # base_file_path, extension = os.path.splitext(output_extent_tif) + # saved_extent_grid_filename = "{}_{}{}".format(base_file_path, huc, extension) + + # MP_LOG.trace(f"saved_extent_grid_filename is {saved_extent_grid_filename}") -def post_process_huc_level( - job_number_tif, ahps_dir_list, huc_dir, attributes_dir, gpkg_dir, fim_version, huc + if not os.path.exists(output_extent_tif): + MP_LOG.error(f"FAILURE_huc_{huc} - {ahps_site} - {magnitude} map failed to create") + return + + # For space reasons, we need to delete all of the intermediary files such as: + # Stage: grmn3_action_extent_0.tif, grmn3_action_extent_1933000003.tif. The give aways are a number before + # the .tif + # Flows: allm1_action_12p0ft_extent_01010002_0.tif, allm1_action_12p0ft_extent_01010002_7170000001.tif + # your give away is to just delete any file that has the HUC number in teh file name + # The intermediatary are all inundated branch tifs. 
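The intermediary-file cleanup discussed above is left commented out in this PR; if it were enabled, a sketch along the lines of the commented glob call (keeping only the final mosaicked files that end exactly in `_extent.tif`) could look like this:

```python
import glob
import os

def remove_branch_tifs(output_huc_site_mapping_dir):
    # Intermediary per-branch rasters carry extra tokens after "_extent"
    # (a HUC and/or branch id); the mosaicked result ends in "_extent.tif" and is kept.
    branch_tifs = glob.glob(os.path.join(output_huc_site_mapping_dir, '*_extent_*.tif'))
    for tif_file in branch_tifs:
        os.remove(tif_file)
```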
+ + # The ones we want to keep stop at _extent.tif + # branch_tifs = glob.glob(os.path.join(output_huc_site_mapping_dir, '*_extent_*.tif')) + # for tif_file in branch_tifs: + # os.remove(tif_file) + + return + + +# This is part of an MP Pool +# TODO: Aug 2024: job_number_inundate is not used well at all and is partially +# with more cleanup to do later. Partially removed now. +def post_process_huc( + output_catfim_dir, + ahps_dir_list, + huc_dir, + gpkg_dir, + fim_version, + huc, + parent_log_output_file, + child_log_file_prefix, + progress_stmt, ): - # Loop through ahps sites - for ahps_lid in ahps_dir_list: - tifs_to_reformat_list = [] - ahps_lid_dir = os.path.join(huc_dir, ahps_lid) - - # Append desired filenames to list. - tif_list = os.listdir(ahps_lid_dir) - for tif in tif_list: - if 'extent.tif' in tif: - tifs_to_reformat_list.append(os.path.join(ahps_lid_dir, tif)) - - # Stage-Based CatFIM uses attributes from individual CSVs instead of the master CSV. - nws_lid_attributes_filename = os.path.join(attributes_dir, ahps_lid + '_attributes.csv') - - print(f"Reformatting TIFs {ahps_lid} for {huc_dir}") - with ProcessPoolExecutor(max_workers=job_number_tif) as executor: + + # Note: child_log_file_prefix is "MP_post_process_{huc}", meaning all logs created by this function start + # with the phrase "MP_post_process_{huc}". This one rollups up to the master catfim log + # This is setting up logging for this function to go up to the parent + try: + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + MP_LOG.lprint(f'Post Processing {huc} ...') + MP_LOG.lprint(f'... {progress_stmt} ...') + + # Loop through ahps sites + attributes_dir = os.path.join(output_catfim_dir, 'attributes') + + for ahps_lid in ahps_dir_list: + tifs_to_reformat_list = [] + mapping_huc_lid_dir = os.path.join(huc_dir, ahps_lid) + MP_LOG.trace(f"mapping_huc_lid_dir is {mapping_huc_lid_dir}") + + # Append desired filenames to list. (notice.. no value after the word extent) + # tif_list = [x for x in os.listdir(mapping_huc_lid_dir) if x.endswith("extent.tif")] # doesn't match the old filenames + + # new logic actually finds the extent tifs + # It will find only the mag rollups and not its branches + + # if stage based, the file names looks like this: masm1_major_20p0ft_extent.tif + # but there also is masm1_major_extent.tif, so we want both + # if flow based, the file name looks like this: masm1_action_extent.tif + + tif_list = [x for x in os.listdir(mapping_huc_lid_dir) if ('extent.tif') in x] + + if len(tif_list) == 0: + MP_LOG.warning(f">> no tifs found for {huc} {ahps_lid} at {mapping_huc_lid_dir}") + continue + + for tif in tif_list: + if 'extent.tif' in tif: + # as we processing tifs for just one ahps at a time, we can further files that have that + # ahps_lid in it. + if ahps_lid in tif: + tifs_to_reformat_list.append(os.path.join(mapping_huc_lid_dir, tif)) + + if len(tifs_to_reformat_list) == 0: + MP_LOG.warning(f">> no tifs found for {huc} {ahps_lid} at {mapping_huc_lid_dir}") + continue + + # Stage-Based CatFIM uses attributes from individual CSVs instead of the master CSV. + nws_lid_attributes_filename = os.path.join(attributes_dir, ahps_lid + '_attributes.csv') + + # There may not necessarily be an attributes.csv for this lid, depending on how flow processing went + # lots of lids fall out in the attributes or flow steps. 
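Nearly every function added in these files follows the same parent/child logging hand-off. A schematic sketch of that pattern, using only the `fim_logger` calls that appear elsewhere in this PR (the worker body and step name are placeholders):

```python
from concurrent.futures import ProcessPoolExecutor

import utils.fim_logger as fl

FLOG = fl.FIM_logger()    # parent (non-MP) logger
MP_LOG = fl.FIM_logger()  # logger used inside MP workers

def mp_worker(huc, parent_log_output_file, child_log_file_prefix):
    # Each worker writes to its own prefixed log file so processes never collide.
    MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix)
    MP_LOG.lprint(f"processing {huc}")

def run(hucs, log_output_file, job_number_huc=2):
    FLOG.setup(log_output_file)
    child_log_file_prefix = FLOG.MP_calc_prefix_name(log_output_file, "MP_example_step")
    with ProcessPoolExecutor(max_workers=job_number_huc) as executor:
        for huc in hucs:
            executor.submit(mp_worker, huc, log_output_file, child_log_file_prefix)
    # Roll the child log files back up into the parent log once the pool finishes.
    FLOG.merge_log_files(log_output_file, child_log_file_prefix, True)
```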
+ if os.path.exists(nws_lid_attributes_filename) is False: + MP_LOG.warning(f"{ahps_lid} has no attributes file which may perfectly fine.") + continue + + # We are going to do an MP in MP. + # child_log_file_prefix = MP_LOG.MP_calc_prefix_name( + # parent_log_output_file, "MP_reformat_tifs", huc + # ) + # Weird case, we ahve to delete any of these files that might already exist (MP in MP) + # Get parent log dir + # log_dir = os.path.dirname(parent_log_output_file) + # old_refomat_log_files = glob.glob(os.path.join(log_dir, 'MP_reformat_tifs_*')) + # for log_file in old_refomat_log_files: + # os.remove(log_file) + + # with ProcessPoolExecutor(max_workers=job_number_inundate) as executor: + # TODO: + # Aug 2024, Using MP (job number inundate has very little value, drop it) + # Clean up that ji MP + # with ProcessPoolExecutor(max_workers=job_number_inundate) as executor: for tif_to_process in tifs_to_reformat_list: - if not os.path.exists(tif_to_process): - continue + # If not os.path.exists(tif_to_process): + # continue + + # If stage based, the file names looks like this: masm1_major_20p0ft_extent.tif + # but there also is masm1_major_extent.tif, so we want both + # If flow based, the file name looks like this: masm1_action_extent.tif + MP_LOG.trace(f".. Tif to Process = {tif_to_process}") try: - magnitude = os.path.split(tif_to_process)[1].split('_')[1] - try: - interval_stage = float( - (os.path.split(tif_to_process)[1].split('_')[2]) - .replace('p', '.') - .replace("ft", "") - ) - if interval_stage == 'extent': + + tif_file_name = os.path.basename(tif_to_process) + file_name_parts = tif_file_name.split("_") + magnitude = file_name_parts[1] + + if "ft" in tif_file_name: # stage based, ie grnm1_action_11p0ft_extent.tif + try: + interval_stage = float(file_name_parts[2].replace('p', '.').replace("ft", "")) + except ValueError: interval_stage = None - except ValueError: + MP_LOG.error( + f"Value Error for {huc} - {ahps_lid} - magnitude {magnitude}" + " at {mapping_huc_lid_dir}" + ) + MP_LOG.error(traceback.format_exc()) + else: # flow based. 
ie) cfkt2_action_extent.tif interval_stage = None - executor.submit( - reformat_inundation_maps, + + reformat_inundation_maps( ahps_lid, tif_to_process, gpkg_dir, @@ -193,89 +637,195 @@ def post_process_huc_level( magnitude, nws_lid_attributes_filename, interval_stage, + parent_log_output_file, + child_log_file_prefix, + ) + except Exception: + MP_LOG.error( + f"An ind reformat map error occured for {huc} - {ahps_lid} - magnitude {magnitude}" ) - except Exception as ex: - print(f"*** {ex}") - traceback.print_exc() + MP_LOG.error(traceback.format_exc()) + # rolls up logs from child MP processes into this parent_log_output_file + # MP_LOG.merge_log_files(parent_log_output_file, child_log_file_prefix, True) + except Exception: + MP_LOG.error(f"An error has occurred in post processing for {huc}") + MP_LOG.error(traceback.format_exc()) + + return + + +# This is not part of an MP process, but does need FLOG carried into it so it can use FLOG directly def post_process_cat_fim_for_viz( - job_number_huc, job_number_tif, output_catfim_dir, attributes_dir, log_file="", fim_version="" + catfim_method, output_catfim_dir, job_huc_ahps, fim_version, log_output_file ): - print("In post processing...") - # Create workspace - gpkg_dir = os.path.join(output_catfim_dir, 'gpkg') - if not os.path.exists(gpkg_dir): - os.mkdir(gpkg_dir) - - # Find the FIM version - merged_layer = os.path.join(output_catfim_dir, 'catfim_library.gpkg') - if not os.path.exists(merged_layer): # prevents appending to existing output - huc_ahps_dir_list = os.listdir(output_catfim_dir) - skip_list = ['errors', 'logs', 'gpkg', 'missing_files.txt', 'messages', merged_layer] - - # Loop through all categories - print("Building list of TIFs to reformat...") - with ProcessPoolExecutor(max_workers=job_number_huc) as huc_exector: - for huc in huc_ahps_dir_list: - if huc in skip_list: - continue - huc_dir = os.path.join(output_catfim_dir, huc) - try: - ahps_dir_list = os.listdir(huc_dir) - except NotADirectoryError: - continue - # If there's no mapping for a HUC, delete the HUC directory. 
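The magnitude and interval-stage parsing above depends only on the tif naming convention; a standalone version, using the example file names quoted in the comments:

```python
def parse_extent_tif_name(tif_file_name):
    # Stage-based names look like "masm1_major_20p0ft_extent.tif";
    # flow-based names look like "cfkt2_action_extent.tif".
    file_name_parts = tif_file_name.split("_")
    magnitude = file_name_parts[1]
    interval_stage = None
    if "ft" in tif_file_name:
        try:
            interval_stage = float(file_name_parts[2].replace('p', '.').replace("ft", ""))
        except ValueError:
            interval_stage = None
    return magnitude, interval_stage

print(parse_extent_tif_name("masm1_major_20p0ft_extent.tif"))  # ('major', 20.0)
print(parse_extent_tif_name("cfkt2_action_extent.tif"))        # ('action', None)
```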
- if ahps_dir_list == []: - os.rmdir(huc_dir) - continue - - huc_exector.submit( - post_process_huc_level, - job_number_tif, - ahps_dir_list, - huc_dir, - attributes_dir, - gpkg_dir, - fim_version, - huc, - ) - # Merge all layers - print(f"Merging {len(os.listdir(gpkg_dir))} layers...") - for layer in os.listdir(gpkg_dir): - # Open dissolved extent layers - diss_extent_filename = os.path.join(gpkg_dir, layer) - diss_extent = gpd.read_file(diss_extent_filename) - diss_extent['viz'] = 'yes' + # Adding a pointer in this file coming from generate_categorial_fim so they can share the same log file + FLOG.setup(log_output_file) - # Write/append aggregate diss_extent - print(f"Merging layer: {layer}") - if os.path.isfile(merged_layer): - diss_extent.to_file(merged_layer, driver=getDriver(merged_layer), index=False, mode='a') - else: - diss_extent.to_file(merged_layer, driver=getDriver(merged_layer), index=False) - del diss_extent + FLOG.lprint("\n>>>>>>>>>>>>>>>>>>>>>>>>>>>>") + FLOG.lprint("Start post processing TIFs (TIF extents into poly into gpkg)...") + output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') + gpkg_dir = os.path.join(output_mapping_dir, 'gpkg') + os.mkdir(gpkg_dir) - # shutil.rmtree(gpkg_dir) # TODO + huc_ahps_dir_list = [ + x + for x in os.listdir(output_mapping_dir) + if os.path.isdir(os.path.join(output_mapping_dir, x)) and x[0] in ['0', '1', '2'] + ] - else: - print(f"{merged_layer} already exists.") + skip_list = ['errors', 'logs', 'gpkg', 'missing_files.txt', 'messages'] + + num_hucs = len(huc_ahps_dir_list) + huc_index = 0 + FLOG.lprint(f"Number of hucs to post process is {num_hucs}") + + child_log_file_prefix = MP_LOG.MP_calc_prefix_name(log_output_file, "MP_post_process") + with ProcessPoolExecutor(max_workers=job_huc_ahps) as huc_exector: + for huc in huc_ahps_dir_list: + FLOG.lprint(f"TIF post processing for {huc}") + if huc in skip_list: + continue + + huc_dir = os.path.join(output_mapping_dir, huc) + progress_stmt = f"index {huc_index + 1} of {num_hucs}" + huc_index += 1 + + try: + ahps_dir_list = [x for x in os.listdir(huc_dir) if os.path.isdir(os.path.join(huc_dir, x))] + # ahps_dir_list = os.listdir(huc_dir) + except NotADirectoryError: + FLOG.warning(f"{huc_dir} directory missing. Continuing on") + continue + + # If there's no mapping for a HUC, delete the HUC directory. + if len(ahps_dir_list) == 0: + os.rmdir(huc_dir) + FLOG.warning(f"no mapping for {huc}") + continue + + huc_exector.submit( + post_process_huc, + output_catfim_dir, + ahps_dir_list, + huc_dir, + gpkg_dir, + fim_version, + huc, + log_output_file, + child_log_file_prefix, + progress_stmt, + ) + + # end of ProcessPoolExecutor + + # rolls up logs from child MP processes into this parent_log_output_file + FLOG.merge_log_files(FLOG.LOG_FILE_PATH, child_log_file_prefix, True) + + # Merge all layers + gpkg_files = [x for x in os.listdir(gpkg_dir) if x.endswith('.gpkg')] + FLOG.lprint(f"Merging {len(gpkg_files)} from layers in {gpkg_dir}") + + # TODO: put a tqdm in here for visual only. 
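Each gpkg layer collected here is produced by `reformat_inundation_maps()` further down in this diff; as a reference, the core raster-to-dissolved-polygon step it performs can be sketched as follows (the helper name and signature are illustrative, and the real function also attaches ahps/magnitude attributes and reprojects to `VIZ_PROJECTION`):

```python
import geopandas as gpd
import rasterio
from rasterio.features import shapes

def extent_tif_to_dissolved_poly(tif_to_process, crs):
    # Convert an inundation extent raster (1 = wet) into a single dissolved polygon layer.
    with rasterio.open(tif_to_process) as src:
        image = src.read(1)
        mask = image > 0
        results = (
            {'properties': {'extent': 1}, 'geometry': geom}
            for geom, value in shapes(image, mask=mask, transform=src.transform)
        )
        extent_poly = gpd.GeoDataFrame.from_features(list(results), crs=crs)
    return extent_poly.dissolve(by='extent')
```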
+ + gpkg_files.sort() + + merged_layers_gdf = None + ctr = 0 + for layer in tqdm( + gpkg_files, + total=len(gpkg_files), + desc="Merging gpkg layers", + bar_format="{desc}:({n_fmt}/{total_fmt})|{bar}| {percentage:.1f}% ", + ncols=80, + ): + # for ctr, layer in enumerate(gpkg_files): + # FLOG.lprint(f"Merging gpkg ({ctr+1} of {len(gpkg_files)} - {}") + FLOG.trace(f"Merging gpkg ({ctr+1} of {len(gpkg_files)} : {layer}") + # Concatenate each /gpkg/{aphs}_{magnitude}_extent_{huc}_dissolved.gkpg + diss_extent_filename = os.path.join(gpkg_dir, layer) + diss_extent_gdf = gpd.read_file(diss_extent_filename, engine='fiona') + diss_extent_gdf['viz'] = 'yes' + + if ctr == 0: + merged_layers_gdf = diss_extent_gdf + else: + merged_layers_gdf = pd.concat([merged_layers_gdf, diss_extent_gdf]) + + del diss_extent_gdf + ctr += 1 + + if merged_layers_gdf is None or len(merged_layers_gdf) == 0: + raise Exception(f"No gpkgs found in {gpkg_dir}") + + # TODO: July 9, 2024: Consider deleting all of the interium .gkpg files in the gkpg folder. + # It will get very big quick. But not yet. + # shutil.rmtree(gpkg_dir) + + # Now dissolve based on ahps and magnitude (we no longer saved non dissolved versrons) + # Aug 2024: We guessed on what might need to be dissolved from 4.4.0.0. In 4.4.0.0 there + # are "_dissolved" versions of catfim files but no notes on why or how, but this script + # did not do it. We are going to guess on what the dissolving rules are. + if catfim_method == "flow_based": + FLOG.lprint("Dissolving flow based catfim_libary by ahps and magnitudes") + merged_layers_gdf = merged_layers_gdf.dissolve(by=['ahps_lid', 'magnitude'], as_index=False) + + merged_layers_gdf.reset_index(inplace=True) + + output_file_name = f"{catfim_method}_catfim_library" + + # TODO: Aug 2024: gpkg are not opening in qgis now? project, wkt, non defined geometry columns? + # gkpg_file_path = os.path.join(output_mapping_dir, f'{output_file_name}.gpkg') + # FLOG.lprint(f"Saving catfim library gpkg version to {gkpg_file_path}") + # merged_layers_gdf.to_file(gkpg_file_path, driver='GPKG', index=True, engine="fiona", crs=PREP_PROJECTION) + + csv_file_path = os.path.join(output_mapping_dir, f'{output_file_name}.csv') + FLOG.lprint(f"Saving catfim library csv version to {csv_file_path}") + merged_layers_gdf.to_csv(csv_file_path) + + FLOG.lprint("End post processing TIFs...") + + return + + +# This is part of an MP pool def reformat_inundation_maps( ahps_lid, - extent_grid, + tif_to_process, gpkg_dir, fim_version, huc, magnitude, nws_lid_attributes_filename, - interval_stage=None, + interval_stage, + parent_log_output_file, + child_log_file_prefix, ): + """_summary_ + Turns inundated tifs into dissolved polys gpkg with more attributes + + """ + # interval stage might come in as null and that is ok + + # Note: child_log_file_prefix is "MP_reformat_tifs_{huc}", meaning all logs created by this + # function start with the phrase "MP_reformat_tifs_{huc}". 
This will rollup to the master + # catfim logs + + # This is setting up logging for this function to go up to the parent + MP_LOG.MP_Log_setup(parent_log_output_file, child_log_file_prefix) + try: - # Convert raster to to shapes - with rasterio.open(extent_grid) as src: + MP_LOG.trace( + f"{huc} : {ahps_lid} : {magnitude} -- Start reformat_inundation_maps" " (tif extent to gpkg poly)" + ) + MP_LOG.trace(F"Tif to process is {tif_to_process}") + + # Convert raster to shapes + with rasterio.open(tif_to_process) as src: image = src.read(1) mask = image > 0 @@ -286,7 +836,11 @@ def reformat_inundation_maps( ) # Convert list of shapes to polygon - extent_poly = gpd.GeoDataFrame.from_features(list(results), crs=PREP_PROJECTION) + # lots of polys + extent_poly = gpd.GeoDataFrame.from_features(list(results), crs=PREP_PROJECTION) # Previous code + # extent_poly = gpd.GeoDataFrame.from_features(list(results)) # Updated to accomodate AK projection + # extent_poly = extent_poly.set_crs(src.crs) # Update to accomodate AK projection + # Dissolve polygons extent_poly_diss = extent_poly.dissolve(by='extent') @@ -297,6 +851,7 @@ def reformat_inundation_maps( extent_poly_diss['version'] = fim_version extent_poly_diss['huc'] = huc extent_poly_diss['interval_stage'] = interval_stage + # Project to Web Mercator extent_poly_diss = extent_poly_diss.to_crs(VIZ_PROJECTION) @@ -311,9 +866,10 @@ def reformat_inundation_maps( right_on=['nws_lid', 'magnitude', 'huc'], ) extent_poly_diss = extent_poly_diss.drop(columns='nws_lid') + # Save dissolved multipolygon - handle = os.path.split(extent_grid)[1].replace('.tif', '') - diss_extent_filename = os.path.join(gpkg_dir, f"{handle}_{huc}_dissolved.gpkg") + handle = os.path.split(tif_to_process)[1].replace('.tif', '') + diss_extent_filename = os.path.join(gpkg_dir, f"{huc}_{handle}_dissolved.gpkg") extent_poly_diss["geometry"] = [ MultiPolygon([feature]) if type(feature) is Polygon else feature for feature in extent_poly_diss["geometry"] @@ -323,70 +879,99 @@ def reformat_inundation_maps( extent_poly_diss.to_file( diss_extent_filename, driver=getDriver(diss_extent_filename), index=False ) + # MP_LOG.trace( + # f"{huc} : {ahps_lid} : {magnitude} - Reformatted inundation map saved" + # f" as {diss_extent_filename}" + # ) + else: + MP_LOG.error(f"{huc} : {ahps_lid} : {magnitude} tif to gpkg, geodataframe is empty") + + except ValueError as ve: + msg = f"{huc} : {ahps_lid} : {magnitude} - Reformatted inundation map" + if "Assigning CRS to a GeoDataFrame without a geometry column is not supported" in ve: + MP_LOG.warning(f"{msg} - Warning: details: {ve}") + else: + MP_LOG.error(f"{msg} - Exception") + MP_LOG.error(traceback.format_exc()) except Exception: - pass - # Log and clean out the gdb so it's not merged in later + MP_LOG.error(f"{huc} : {ahps_lid} : {magnitude} - Reformatted inundation map - Exception") + MP_LOG.error(traceback.format_exc()) + return -# try: -# print(e) -## f = open(log_file, 'a+') -## f.write(str(diss_extent_filename) + " - dissolve error: " + str(e)) -## f.close() -# except: -# pass +# This is not part of an MP progress and simply needs the +# pointer of FLOG carried over here so it can use it directly. + +# TODO: Aug, 2024. 
We need re-evaluate job numbers, see usage of job numbers below +# Used for Flow only def manage_catfim_mapping( fim_run_dir, - source_flow_dir, + output_flows_dir, output_catfim_dir, - attributes_dir, + catfim_method, job_number_huc, job_number_inundate, - overwrite, depthtif, + log_output_file, + step_number=1, ): - # Create output directory - if not os.path.exists(output_catfim_dir): - os.mkdir(output_catfim_dir) - - # Create log directory - log_dir = os.path.join(output_catfim_dir, 'logs') - if not os.path.exists(log_dir): - os.mkdir(log_dir) - - # Create error log path - log_file = os.path.join(log_dir, 'errors.log') - - job_number_tif = job_number_inundate - - print("Generating Categorical FIM") - generate_categorical_fim( - fim_run_dir, - source_flow_dir, - output_catfim_dir, - job_number_huc, - job_number_inundate, - depthtif, - log_file, - ) - print("Aggregating Categorical FIM") + # Adding a pointer in this file coming from generate_categorial_fim so they can share the same log file + FLOG.setup(log_output_file) + + FLOG.lprint('Begin mapping') + start = time.time() + + output_mapping_dir = os.path.join(output_catfim_dir, 'mapping') + if not os.path.exists(output_mapping_dir): + os.mkdir(output_mapping_dir) + + if step_number <= 1: + run_catfim_inundation( + fim_run_dir, + output_flows_dir, + output_mapping_dir, + job_number_huc, + job_number_inundate, + FLOG.LOG_FILE_PATH, + ) + else: + FLOG.lprint("Skip running Inundation as Step > 1") + + # FLOG.lprint("Aggregating Categorical FIM") # Get fim_version. - fim_version = ( - os.path.basename(os.path.normpath(fim_run_dir)) - .replace('fim_', '') - .replace('_ms_c', '') - .replace('_', '.') - ) + fim_version = os.path.basename(os.path.normpath(fim_run_dir)).replace('fim_', '').replace('_', '.') + + # Step 2 + # TODO: Aug 2024, so we need to clean it up + # This step does not need a job_number_inundate as it can't really use it. + # It processes primarily hucs and ahps in multiproc + # for now, we will manually multiple the huc * 5 (max number of ahps types) + ahps_jobs = job_number_huc * 5 post_process_cat_fim_for_viz( - job_number_huc, job_number_tif, output_catfim_dir, attributes_dir, log_file, fim_version + catfim_method, output_catfim_dir, ahps_jobs, fim_version, str(FLOG.LOG_FILE_PATH) ) + end = time.time() + elapsed_time = (end - start) / 60 + FLOG.lprint(f"Finished mapping in {str(elapsed_time).split('.')[0]} minutes") + + return + if __name__ == '__main__': + + """ + Sample Usage: + python3 /foss_fim/tools/generate_categorical_fim_mapping.py -r "/outputs/rob_test_catfim_huc" + -s "/data/catfim/rob_test/test_5_flow_based/flows" -o "/data/catfim/rob_test/test_5_flow_based" + -jh 1 -jn 40 + + """ + # Parse arguments parser = argparse.ArgumentParser(description='Categorical inundation mapping for FOSS FIM.') parser.add_argument( @@ -407,19 +992,38 @@ def manage_catfim_mapping( default="", ) parser.add_argument( - '-j', - '--number-of-jobs', - help='Number of processes to use. Default is 1.', + '-jh', + '--job-number-huc', + help='Number of processes to use for huc processing. Default is 1.', required=False, default="1", type=int, ) parser.add_argument( - '-depthtif', - '--write-depth-tiff', + '-jn', + '--job-number-inundate', + help='OPTIONAL: Number of processes to use for inundating' + ' HUC and inundation job numbers should multiply to no more than one less than the CPU count' + ' of the machine. 
Defaults to 1.', + required=False, + default=1, + type=int, + ) + # parser.add_argument( + # '-depthtif', + # '--write-depth-tiff', + # help='Using this option will write depth TIFFs.', + # required=False, + # action='store_true', + # ) + + parser.add_argument( + '-step', + '--step_number', help='Using this option will write depth TIFFs.', required=False, - action='store_true', + default=1, + type=int, ) args = vars(parser.parse_args()) @@ -427,7 +1031,14 @@ def manage_catfim_mapping( fim_run_dir = args['fim_run_dir'] source_flow_dir = args['source_flow_dir'] output_catfim_dir = args['output_catfim_dir'] - number_of_jobs = int(args['number_of_jobs']) - depthtif = args['write_depth_tiff'] + job_number_huc = int(args['job_number_huc']) + job_number_inundate = int(args['job_number_inundate']) + # depthtif = args['write_depth_tiff'] + step_num = args['step_number'] - manage_catfim_mapping(fim_run_dir, source_flow_dir, output_catfim_dir, number_of_jobs, depthtif) + log_dir = os.path.join(output_catfim_dir, "logs") + log_output_file = FLOG.calc_log_name_and_path(log_dir, "gen_cat_mapping") + + manage_catfim_mapping( + source_flow_dir, output_catfim_dir, job_number_huc, job_number_inundate, log_output_file, step_num + ) diff --git a/tools/generate_nws_lid.py b/tools/generate_nws_lid.py old mode 100644 new mode 100755 diff --git a/tools/mosaic_inundation.py b/tools/mosaic_inundation.py index 5bded653..bc20265c 100755 --- a/tools/mosaic_inundation.py +++ b/tools/mosaic_inundation.py @@ -55,7 +55,7 @@ def Mosaic_inundation( inundation_maps_df = inundation_maps_df.set_index(unit_attribute_name, drop=True) - # decide upon whether to display + # decide upon whether to display the progress bar if verbose & len(aggregation_units) == 1: tqdm_disable = False elif verbose: diff --git a/tools/tools_shared_functions.py b/tools/tools_shared_functions.py index 66c0a969..2c9da736 100755 --- a/tools/tools_shared_functions.py +++ b/tools/tools_shared_functions.py @@ -1,8 +1,11 @@ #!/usr/bin/env python3 +import datetime as dt import json +import logging import os import pathlib +import traceback from pathlib import Path import geopandas as gpd @@ -13,6 +16,7 @@ import rasterio.shutil import requests import rioxarray as rxr +import urllib3 import xarray as xr from dotenv import load_dotenv from geocube.api.core import make_geocube @@ -63,9 +67,16 @@ def filter_nwm_segments_by_stream_order(unfiltered_segments, desired_order, nwm_ filtered_segments = [] for feature_id in unfiltered_segments: - stream_order = nwm_flows_df.loc[nwm_flows_df['ID'] == int(feature_id), 'order_'].values[0] + + try: + stream_order = nwm_flows_df.loc[nwm_flows_df['ID'] == int(feature_id), 'order_'].values[0] + except Exception as e: + print(f'WARNING: Exception occurred during filter_nwm_segments_by_stream_order():{e}') + if stream_order == desired_order: filtered_segments.append(feature_id) + # else: + # print(f'Stream order for {feature_id} did not match desired stream order...') return filtered_segments @@ -477,9 +488,7 @@ def get_stats_table_from_binary_rasters( # Write legend text file legend_txt = os.path.join(os.path.split(agreement_raster)[0], 'read_me.txt') - from datetime import datetime - - now = datetime.now() + now = dt.datetime.now() current_time = now.strftime("%m/%d/%Y %H:%M:%S") with open(legend_txt, 'w') as f: @@ -594,7 +603,7 @@ def get_metadata( metadata_url : STR metadata base URL. select_by : STR - Location search option. + Location search option. 
Options include: 'state', TODO: add options selector : LIST Value to match location data against. Supplied as a LIST. must_include : STR, optional @@ -660,7 +669,7 @@ def get_metadata( metadata.update(crosswalk_info) metadata_dataframe = pd.json_normalize(metadata_list) # Replace all periods with underscores in column names - metadata_dataframe.columns = metadata_dataframe.columns.str.replace('.', '_') + metadata_dataframe.columns = metadata_dataframe.columns.astype(str).str.replace('.', '_') else: # if request was not succesful, print error message. print(f'Code: {response.status_code}\nMessage: {response.reason}\nURL: {response.url}') @@ -673,7 +682,7 @@ def get_metadata( ######################################################################## # Function to assign HUC code using the WBD spatial layer using a spatial join ######################################################################## -def aggregate_wbd_hucs(metadata_list, wbd_huc8_path, retain_attributes=False): +def aggregate_wbd_hucs(metadata_list, wbd_huc8_path, retain_attributes=False, huc_list=list()): ''' Assigns the proper FIM HUC 08 code to each site in the input DataFrame. Converts input DataFrame to a GeoDataFrame using lat/lon attributes @@ -703,9 +712,16 @@ def aggregate_wbd_hucs(metadata_list, wbd_huc8_path, retain_attributes=False): ''' # Import huc8 layer as geodataframe and retain necessary columns print("Reading WBD...") - huc8 = gpd.read_file(wbd_huc8_path, layer='WBDHU8') + huc8_all = gpd.read_file(wbd_huc8_path, layer='WBDHU8') print("WBD read.") - huc8 = huc8[['HUC8', 'name', 'states', 'geometry']] + huc8 = huc8_all[['HUC8', 'name', 'states', 'geometry']] + + if len(huc_list) > 0: + # filter by hucs we are using + huc8 = huc8[huc8['HUC8'].isin(huc_list)] + + huc8.sort_values(by='HUC8', ascending=True, inplace=True) + # Define EPSG codes for possible latlon datum names (default of NAD83 if unassigned) crs_lookup = {'NAD27': 'EPSG:4267', 'NAD83': 'EPSG:4269', 'WGS84': 'EPSG:4326'} # Create empty geodataframe and define CRS for potential horizontal datums @@ -940,6 +956,9 @@ def get_thresholds(threshold_url, select_by, selector, threshold='all'): # Call the API session = requests.Session() + + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + retry = Retry(connect=3, backoff_factor=0.5) adapter = HTTPAdapter(max_retries=retry) session.mount('http://', adapter)
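The session setup added to `get_thresholds()` is a standard requests retry pattern; a self-contained sketch of the same idea (the helper name and example URL are placeholders, not part of the repo):

```python
import requests
import urllib3
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_wrds_session():
    # Retry transient connection failures and silence the warning that
    # unverified HTTPS requests would otherwise emit on every call.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

# Hypothetical usage; the real threshold_url comes from the CatFIM env file.
# response = build_wrds_session().get("https://example.com/nws_threshold", verify=False)
```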