From 9663969f46f08dbd311e50e421f28b50f57bfbb5 Mon Sep 17 00:00:00 2001
From: Rishi Verma <riverma@apache.org>
Date: Tue, 16 Jul 2024 14:52:34 -0700
Subject: [PATCH 1/4] Supporting UAT and OPS endpoints for validation

---
 report/dswx-s1-validator/README.md            | 4 ++--
 report/dswx-s1-validator/dswx_s1_validator.py | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/report/dswx-s1-validator/README.md b/report/dswx-s1-validator/README.md
index 7b1af143..2876dc6f 100644
--- a/report/dswx-s1-validator/README.md
+++ b/report/dswx-s1-validator/README.md
@@ -183,7 +183,7 @@ This guide provides a quick way to get started with the script.
 * Validate whether DSWx-S1 processing has kept up with input RTC processing (success condition)
 
   ```
-  $ python dswx_s1_validator.py --start "2024-05-12T08:00:00" --end "2024-05-12T08:59:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
+  $ python dswx_s1_validator.py --endpoint UAT --start "2024-05-12T08:00:00" --end "2024-05-12T08:59:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
   Total granules: 114
   Querying CMR for time range 2024-05-12T08:00:00 to 2024-05-12T08:59:00.
 
@@ -206,7 +206,7 @@ This guide provides a quick way to get started with the script.
 * Validate whether DSWx-S1 processing has kept up with input RTC processing (failure condition)
 
   ```
-  $ python dswx_s1_validator.py --start "2024-05-12T04:10:00" --end "2024-05-12T05:10:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
+  $ python dswx_s1_validator.py --endpoint UAT --start "2024-05-12T04:10:00" --end "2024-05-12T05:10:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
 
   Total granules: 894
   Querying CMR for time range 2024-05-12T04:10:00 to 2024-05-12T05:10:00.
diff --git a/report/dswx-s1-validator/dswx_s1_validator.py b/report/dswx-s1-validator/dswx_s1_validator.py
index f81a9304..23007de2 100644
--- a/report/dswx-s1-validator/dswx_s1_validator.py
+++ b/report/dswx-s1-validator/dswx_s1_validator.py
@@ -318,7 +318,7 @@ def get_burst_ids_from_query(start, end, timestamp, endpoint):
     
     return burst_ids, burst_dates
 
-def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoint='UAT'):
+def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoint):
     """
     Validates that the MGRS tiles from the CMR query match the provided unique MGRS tiles list.
 
@@ -492,7 +492,7 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
     else:
         print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
 
-    if args.validate:
+    if args.validate and len(df) > 0:
         burst_dates_series = df['Burst Dates'].explode()
         smallest_date = burst_dates_series.min()
         greatest_date = burst_dates_series.max()
@@ -502,4 +502,4 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
         mgrs_tiles_series = df['MGRS Tiles'].str.split(', ').explode()
         unique_mgrs_tiles = mgrs_tiles_series.unique()
 
-        validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles)
\ No newline at end of file
+        validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, args.endpoint)
\ No newline at end of file

From 8683cae9fbdb34e68e4551b4255ca315fd367f63 Mon Sep 17 00:00:00 2001
From: Rishi Verma <riverma@apache.org>
Date: Wed, 17 Jul 2024 09:33:27 -0700
Subject: [PATCH 2/4] Refactored code to validate expected RTC bursts against
 DSWx-S1 CMR metadata RTC burst list

---
 report/dswx-s1-validator/dswx_s1_validator.py | 142 ++++++++++++------
 1 file changed, 94 insertions(+), 48 deletions(-)

diff --git a/report/dswx-s1-validator/dswx_s1_validator.py b/report/dswx-s1-validator/dswx_s1_validator.py
index 23007de2..d1feadc0 100644
--- a/report/dswx-s1-validator/dswx_s1_validator.py
+++ b/report/dswx-s1-validator/dswx_s1_validator.py
@@ -318,24 +318,47 @@ def get_burst_ids_from_query(start, end, timestamp, endpoint):
     
     return burst_ids, burst_dates
 
-def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoint):
-    """
-    Validates that the MGRS tiles from the CMR query match the provided unique MGRS tiles list.
+def extract_rtc_granule_from_file_path(path):
+    """Return the OPERA RTC-S1 granule ID embedded in `path`, or None when `path` does not contain one."""
+    # Matches 'OPERA_L2_RTC-S1_<id>_<digits>T<digits>Z_<digits>T<digits>Z_S1A_30_v<major>.<minor>'; text before or after it (e.g. directories, file suffix) is ignored
+    pattern = r"(OPERA_L2_RTC-S1_[\w-]+_\d+T\d+Z_\d+T\d+Z_S1A_30_v\d+\.\d+)"
+    
+    # Search the supplied path (the function's only argument) for the granule ID
+    match = re.search(pattern, path)
+    
+    # Return the matched substring or None if no match is found
+    if match:
+        return match.group(1)
+    else:
+        return None
 
-    :param smallest_date: The earliest date in the range (ISO 8601 format).
-    :param greatest_date: The latest date in the range (ISO 8601 format).
-    :param unique_mgrs_tiles: List of unique MGRS tiles to validate against.
-    :param endpoint: CMR environment ('UAT' or 'OPS').
-    :return: Boolean indicating if the validation was successful.
+def validate_mgrs_tiles(smallest_date, greatest_date, endpoint, df):
+    """
+    Validates that the granules from the CMR query are accurately reflected in the DataFrame provided.
+    It extracts granule information based on the input dates and checks which granules are missing from the DataFrame.
+    The function then updates the DataFrame to include a count of unprocessed bursts based on the missing granules.
+
+    :param smallest_date: datetime.datetime
+        The earliest date in the range (ISO 8601 format).
+    :param greatest_date: datetime.datetime
+        The latest date in the range (ISO 8601 format).
+    :param endpoint: str
+        CMR environment ('UAT' or 'OPS') to specify the operational setting for the data query.
+    :param df: pandas.DataFrame
+        A DataFrame containing columns with granule identifiers which will be checked against the CMR query results.
+        
+    :return: pandas.DataFrame or bool
+        A modified DataFrame with additional columns 'Unprocessed Bursts' and 'Unprocessed Bursts Count' showing
+        granules not found in the CMR results and their count respectively. Returns False if the CMR query fails.
+    
+    Raises:
+        No exception is propagated for request failures: requests.exceptions.RequestException is caught and logged as an error, and False is returned.
     """
 
     # Convert Timestamps to strings in ISO 8601 format
     smallest_date_iso = smallest_date.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3]
     greatest_date_iso = greatest_date.strftime('%Y-%m-%dT%H:%M:%S.%f')[:-3]
 
-    # Add 'T' prefix to MGRS tiles if not already present
-    formatted_mgrs_tiles = [f"T{tile}" if not tile.startswith('T') else tile for tile in unique_mgrs_tiles]
-
     # Generate the base URL and parameters for the CMR query
     base_url, params = generate_url_params(
         start=smallest_date_iso,
@@ -359,38 +382,43 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
         granules = response.json()
 
         # Extract MGRS tiles from the response
-        retrieved_tiles = []
+        available_rtc_bursts = []
+        pattern = r"(OPERA_L2_RTC-S1_[\w-]+_\d+T\d+Z_\d+T\d+Z_S1A_30_v\d+\.\d+)"
         for item in granules['items']:
-            for attribute in item['umm']['AdditionalAttributes']:
-                if 'AdditionalAttributes' in item['umm'] and attribute['Name'] == 'MGRS_TILE_ID':
-                    retrieved_tiles.append(attribute['Values'][0])
+            for path in item['umm']['InputGranules']:
+                # Extract the granule burst ID from the full path
+                match = re.search(pattern, path)
+                if match:
+                    available_rtc_bursts.append(match.group(1))
 
-        # Validate against provided unique MGRS tiles
-        retrieved_tiles_set = set(retrieved_tiles)
-        unique_mgrs_tiles_set = set(formatted_mgrs_tiles)
+        unique_available_rtc_bursts = set(available_rtc_bursts)
 
-        print()
-        if retrieved_tiles_set == unique_mgrs_tiles_set:
-            print(f"✅ Validation successful: All DSWx-S1 tiles available at CMR for corresponding matched input RTC bursts within sensing time range.")
-            return True
-        else:
-            missing_tiles = unique_mgrs_tiles_set - retrieved_tiles_set
-            extra_tiles = retrieved_tiles_set - unique_mgrs_tiles_set
-            print(f"❌ Validation failed: Mismatch in DSWx-S1 tiles available at CMR for corresponding matched input RTC bursts within sensing time range.")
-            print()
-            print(f"Expected({len(unique_mgrs_tiles_set)}): {unique_mgrs_tiles_set}")
-            print()
-            print(f"Received({len(retrieved_tiles_set)}): {retrieved_tiles_set}")
-            print()
-            if missing_tiles:
-                print(f"Missing tiles({len(missing_tiles)}): {missing_tiles}")
-            if extra_tiles:
-                print(f"Extra tiles({len(extra_tiles)}): {extra_tiles}")
-            return False
+        # Function to identify missing bursts
+        def filter_and_find_missing(row):
+            rtc_bursts_in_df_row = set(row['Matching Granules'].split(', '))
+            unprocessed_rtc_bursts = rtc_bursts_in_df_row - unique_available_rtc_bursts
+            if unprocessed_rtc_bursts:
+                return ', '.join(unprocessed_rtc_bursts)
+            return None  # or pd.NA 
+
+        # Function to count missing bursts
+        def count_missing(row):
+            count = len(row['Unprocessed Bursts'].split(', '))
+            return count
+
+        # Apply the function and create a new column 'Unprocessed Bursts'
+        df['Unprocessed Bursts'] = df.apply(filter_and_find_missing, axis=1)
+        df = df.dropna(subset=['Unprocessed Bursts'])
+
+        # Using loc to safely modify the DataFrame without triggering SettingWithCopyWarning
+        df.loc[:, 'Unprocessed Bursts Count'] = df.apply(count_missing, axis=1)
+
+        return df
 
     except requests.exceptions.RequestException as e:
         logging.error(f"Failed to fetch data from CMR: {e}")
-        return False
+        
+    return False
 
 if __name__ == '__main__':
     # Create an argument parser
@@ -403,7 +431,8 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
     parser.add_argument("--threshold", required=False, help="Completion threshold minimum to filter results by (percentage format - leave out the % sign)")
     parser.add_argument("--matching_burst_count", required=False, help="Matching burst count to filter results by. Typically four or more is advised. Using this with the --threshold flag makes this flag inactive (only one of '--threshold' or '--matching_burst_count' may be used)")
     parser.add_argument("--verbose", action='store_true', help="Verbose and detailed output")
-    parser.add_argument("--endpoint", required=False, choices=['UAT', 'OPS'], default='OPS', help='CMR endpoint venue')
+    parser.add_argument("--endpoint_rtc", required=False, choices=['UAT', 'OPS'], default='OPS', help='CMR endpoint venue for RTC granules')
+    parser.add_argument("--endpoint_dswx_s1", required=False, choices=['UAT', 'OPS'], default='OPS', help='CMR endpoint venue for DSWx-S1 granules')
     parser.add_argument("--validate", action='store_true', help="Validate if DSWx-S1 products have been delivered for given time range (use --timestamp TEMPORAL mode only)")
 
     # Parse the command-line arguments
@@ -416,7 +445,7 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
     if args.file:
         burst_ids, burst_dates = get_burst_ids_from_file(filename=args.file)
     else:
-        burst_ids, burst_dates = get_burst_ids_from_query(args.start, args.end, args.timestamp, args.endpoint)
+        burst_ids, burst_dates = get_burst_ids_from_query(args.start, args.end, args.timestamp, args.endpoint_rtc)
 
     # Connect to the MGRS Tile Set SQLITE database
     conn = sqlite3.connect(args.db)
@@ -465,7 +494,9 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
             'Total Burst Count': len(bursts_list),
             'MGRS Tiles': ', '.join(mgrs_tiles_list),
             'MGRS Tiles Count': len(mgrs_tiles_list),
-            'Burst Dates': [pd.to_datetime(date, format='%Y%m%dT%H%M%SZ') for date in matching_burst_dates.values()]
+            'Burst Dates': [pd.to_datetime(date, format='%Y%m%dT%H%M%SZ') for date in matching_burst_dates.values()],
+            'Unprocessed Bursts': '',
+            'Unprocessed Bursts Count': 0
         })
 
         logging.debug(f"len(matching_burst_dates) = {matching_burst_dates}")
@@ -486,11 +517,6 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
 
     # Pretty print results - adjust tablefmt accordingly (https://github.com/astanin/python-tabulate#table-format)
     print()
-    print('MGRS Set IDs covered:', len(df))
-    if (args.verbose):
-        print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Burst Dates']], headers='keys', tablefmt='plain', showindex=False))
-    else:
-        print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
 
     if args.validate and len(df) > 0:
         burst_dates_series = df['Burst Dates'].explode()
@@ -499,7 +525,27 @@ def validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, endpoin
 
         print()
         print(f"Expected DSWx-S1 product sensing time range: {smallest_date} to {greatest_date}")
-        mgrs_tiles_series = df['MGRS Tiles'].str.split(', ').explode()
-        unique_mgrs_tiles = mgrs_tiles_series.unique()
 
-        validate_mgrs_tiles(smallest_date, greatest_date, unique_mgrs_tiles, args.endpoint)
\ No newline at end of file
+        validated_df = validate_mgrs_tiles(smallest_date, greatest_date, args.endpoint_dswx_s1, df)
+
+        print()
+        if len(validated_df) == 0:
+            print(f"✅ Validation successful: All DSWx-S1 products available at CMR for corresponding matched input RTC bursts within sensing time range.")
+            if (args.verbose):
+                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed Bursts', 'Unprocessed Bursts Count']], headers='keys', tablefmt='plain', showindex=False))
+            else:
+                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Total Burst Count', 'Matching Burst Count', 'Unprocessed Bursts Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
+        else:
+            print(f"❌ Validation failed: Mismatch in DSWx-S1 products available at CMR for corresponding matched input RTC bursts within sensing time range.")
+            print()
+            print('Incomplete MGRS Set IDs:', len(validated_df))
+            if (args.verbose):
+                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed Bursts', 'Unprocessed Bursts Count']], headers='keys', tablefmt='plain', showindex=False))
+            else:
+                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Total Burst Count', 'Matching Burst Count', 'Unprocessed Bursts Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
+    else:
+        print('MGRS Set IDs covered:', len(df))
+        if (args.verbose):
+            print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count']], headers='keys', tablefmt='plain', showindex=False))
+        else:
+            print(tabulate(df[['MGRS Set ID', 'Coverage Percentage', 'Total Burst Count', 'Matching Burst Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))

From 9a9b16fda673a125a3012c40a9482423c28672bc Mon Sep 17 00:00:00 2001
From: Rishi Verma <riverma@apache.org>
Date: Wed, 17 Jul 2024 09:55:37 -0700
Subject: [PATCH 3/4] Renamed columns for clarity, optimized dataframe copying

---
 report/dswx-s1-validator/README.md            | 11 +++--
 report/dswx-s1-validator/dswx_s1_validator.py | 48 +++++++++----------
 2 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/report/dswx-s1-validator/README.md b/report/dswx-s1-validator/README.md
index 2876dc6f..85dd45aa 100644
--- a/report/dswx-s1-validator/README.md
+++ b/report/dswx-s1-validator/README.md
@@ -58,9 +58,10 @@ This guide provides a quick way to get started with the script.
 2. Optionally, use the `--file` argument to specify a file with granule IDs.
 3. Optionally, use the `--threshold` argument to a threshold percentage to filter MGRS Tile Set coverages by or use the `--matching_burst_count` to specify the minimum number of bursts to expect a match for for filtering. If both are provided, `--threshold` is used and `--matching_burst_count` is ignored. 
 4. Optionally, use the `--timestamp` argument to specify the type of timestamp to query CMR with. Example values: `TEMPORAL|PRODUCTION|REVISION|CREATED`. Default value is `TEMPORAL`. See [CMR documentation](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html) for details. 
-5. Optionally, use the `--endpoint` argument to specify the CMR endpoint venue. Accepted values are `OPS|UAT`, with `OPS` set as the default value.
-6. Optionally, use the `--verbose` argument to get detailed information like a list of matching bursts and granule IDs
-7. Optionally, use the `--validate` argument to check if expected DSWx-S1 products (tiles) exist for relevant RTC input bursts 
+5. Optionally, use the `--endpoint_rtc` argument to specify the CMR endpoint venue for RTC granules. Accepted values are `OPS|UAT`, with `OPS` set as the default value.
+6. Optionally, use the `--endpoint_dswx_s1` argument to specify the CMR endpoint venue for DSWx-S1 granules. Accepted values are `OPS|UAT`, with `OPS` set as the default value.
+7. Optionally, use the `--verbose` argument to get detailed information like a list of matching bursts and granule IDs
+8. Optionally, use the `--validate` argument to check if expected DSWx-S1 products (tiles) exist for relevant RTC input bursts 
  
 ### Usage Examples
 
@@ -183,7 +184,7 @@ This guide provides a quick way to get started with the script.
 * Validate whether DSWx-S1 processing has kept up with input RTC processing (success condition)
 
   ```
-  $ python dswx_s1_validator.py --endpoint UAT --start "2024-05-12T08:00:00" --end "2024-05-12T08:59:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
+  $ python dswx_s1_validator.py --endpoint_dswx_s1 UAT --start "2024-05-12T08:00:00" --end "2024-05-12T08:59:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
   Total granules: 114
   Querying CMR for time range 2024-05-12T08:00:00 to 2024-05-12T08:59:00.
 
@@ -206,7 +207,7 @@ This guide provides a quick way to get started with the script.
 * Validate whether DSWx-S1 processing has kept up with input RTC processing (failure condition)
 
   ```
-  $ python dswx_s1_validator.py --endpoint UAT --start "2024-05-12T04:10:00" --end "2024-05-12T05:10:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
+  $ python dswx_s1_validator.py --endpoint_dswx_s1 UAT --start "2024-05-12T04:10:00" --end "2024-05-12T05:10:00" --db MGRS_tile_collection_v0.3.sqlite --threshold 99 --validate
 
   Total granules: 894
   Querying CMR for time range 2024-05-12T04:10:00 to 2024-05-12T05:10:00.
diff --git a/report/dswx-s1-validator/dswx_s1_validator.py b/report/dswx-s1-validator/dswx_s1_validator.py
index d1feadc0..3ddd4fb4 100644
--- a/report/dswx-s1-validator/dswx_s1_validator.py
+++ b/report/dswx-s1-validator/dswx_s1_validator.py
@@ -348,7 +348,7 @@ def validate_mgrs_tiles(smallest_date, greatest_date, endpoint, df):
         A DataFrame containing columns with granule identifiers which will be checked against the CMR query results.
         
     :return: pandas.DataFrame or bool
-        A modified DataFrame with additional columns 'Unprocessed Bursts' and 'Unprocessed Bursts Count' showing
+        A modified DataFrame with additional columns 'Unprocessed RTC Native IDs' and 'Unprocessed RTC Native IDs Count' showing
         granules not found in the CMR results and their count respectively. Returns False if the CMR query fails.
     
     Raises:
@@ -395,7 +395,7 @@ def validate_mgrs_tiles(smallest_date, greatest_date, endpoint, df):
 
         # Function to identify missing bursts
         def filter_and_find_missing(row):
-            rtc_bursts_in_df_row = set(row['Matching Granules'].split(', '))
+            rtc_bursts_in_df_row = set(row['Covered RTC Native IDs'].split(', '))
             unprocessed_rtc_bursts = rtc_bursts_in_df_row - unique_available_rtc_bursts
             if unprocessed_rtc_bursts:
                 return ', '.join(unprocessed_rtc_bursts)
@@ -403,15 +403,15 @@ def filter_and_find_missing(row):
 
         # Function to count missing bursts
         def count_missing(row):
-            count = len(row['Unprocessed Bursts'].split(', '))
+            count = len(row['Unprocessed RTC Native IDs'].split(', '))
             return count
 
-        # Apply the function and create a new column 'Unprocessed Bursts'
-        df['Unprocessed Bursts'] = df.apply(filter_and_find_missing, axis=1)
-        df = df.dropna(subset=['Unprocessed Bursts'])
+        # Apply the function and create a new column 'Unprocessed RTC Native IDs'
+        df['Unprocessed RTC Native IDs'] = df.apply(filter_and_find_missing, axis=1)
+        df = df.dropna(subset=['Unprocessed RTC Native IDs'])
 
         # Using loc to safely modify the DataFrame without triggering SettingWithCopyWarning
-        df.loc[:, 'Unprocessed Bursts Count'] = df.apply(count_missing, axis=1)
+        df.loc[:, 'Unprocessed RTC Native IDs Count'] = df.apply(count_missing, axis=1)
 
         return df
 
@@ -457,7 +457,7 @@ def count_missing(row):
     mgrs_data = cursor.fetchall()
 
     # Initialize DataFrame to store results
-    df = pd.DataFrame(columns=['MGRS Set ID', 'Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Total Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Burst Dates'])
+    df = pd.DataFrame(columns=['MGRS Set ID', 'Coverage Percentage', 'Covered RTC Native IDs', 'Covered RTC Burst IDs', 'Total RTC Burst IDs', 'Covered RTC Burst ID Count', 'Total RTC Burst IDs Count', 'MGRS Tiles', 'MGRS Tiles Count', 'RTC Burst ID Dates'])
 
     # Initialize a list to store data for DataFrame
     data_for_df = []
@@ -487,16 +487,16 @@ def count_missing(row):
         data_for_df.append({
             'MGRS Set ID': mgrs_set_id,
             'Coverage Percentage': coverage_percentage,
-            'Matching Granules': ', '.join(list(matching_burst_ids.values())),
-            'Matching Bursts': ', '.join(list(matching_burst_ids.keys())),
-            'Total Bursts': ', '.join(bursts_list),
-            'Matching Burst Count': len(matching_burst_ids),
-            'Total Burst Count': len(bursts_list),
+            'Covered RTC Native IDs': ', '.join(list(matching_burst_ids.values())),
+            'Covered RTC Burst IDs': ', '.join(list(matching_burst_ids.keys())),
+            'Total RTC Burst IDs': ', '.join(bursts_list),
+            'Covered RTC Burst ID Count': len(matching_burst_ids),
+            'Total RTC Burst IDs Count': len(bursts_list),
             'MGRS Tiles': ', '.join(mgrs_tiles_list),
             'MGRS Tiles Count': len(mgrs_tiles_list),
-            'Burst Dates': [pd.to_datetime(date, format='%Y%m%dT%H%M%SZ') for date in matching_burst_dates.values()],
-            'Unprocessed Bursts': '',
-            'Unprocessed Bursts Count': 0
+            'RTC Burst ID Dates': [pd.to_datetime(date, format='%Y%m%dT%H%M%SZ') for date in matching_burst_dates.values()],
+            'Unprocessed RTC Native IDs': '',
+            'Unprocessed RTC Native IDs Count': 0
         })
 
         logging.debug(f"len(matching_burst_dates) = {matching_burst_dates}")
@@ -513,13 +513,13 @@ def count_missing(row):
         df = df[df['Coverage Percentage'] >= threshold]
     elif args.matching_burst_count:
         matching_burst_count = int(args.matching_burst_count)
-        df = df[df['Matching Burst Count'] >= matching_burst_count]
+        df = df[df['Covered RTC Burst ID Count'] >= matching_burst_count]
 
     # Pretty print results - adjust tablefmt accordingly (https://github.com/astanin/python-tabulate#table-format)
     print()
 
     if args.validate and len(df) > 0:
-        burst_dates_series = df['Burst Dates'].explode()
+        burst_dates_series = df['RTC Burst ID Dates'].explode()
         smallest_date = burst_dates_series.min()
         greatest_date = burst_dates_series.max()
 
@@ -532,20 +532,20 @@ def count_missing(row):
         if len(validated_df) == 0:
             print(f"✅ Validation successful: All DSWx-S1 products available at CMR for corresponding matched input RTC bursts within sensing time range.")
             if (args.verbose):
-                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed Bursts', 'Unprocessed Bursts Count']], headers='keys', tablefmt='plain', showindex=False))
+                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Covered RTC Native IDs', 'Covered RTC Burst IDs', 'Covered RTC Burst ID Count', 'Total RTC Burst IDs Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed RTC Native IDs', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
             else:
-                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Total Burst Count', 'Matching Burst Count', 'Unprocessed Bursts Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
+                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
         else:
             print(f"❌ Validation failed: Mismatch in DSWx-S1 products available at CMR for corresponding matched input RTC bursts within sensing time range.")
             print()
             print('Incomplete MGRS Set IDs:', len(validated_df))
             if (args.verbose):
-                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed Bursts', 'Unprocessed Bursts Count']], headers='keys', tablefmt='plain', showindex=False))
+                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Covered RTC Native IDs', 'Covered RTC Burst IDs', 'Covered RTC Burst ID Count', 'Total RTC Burst IDs Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed RTC Native IDs', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
             else:
-                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Total Burst Count', 'Matching Burst Count', 'Unprocessed Bursts Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
+                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
     else:
         print('MGRS Set IDs covered:', len(df))
         if (args.verbose):
-            print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Matching Granules', 'Matching Bursts', 'Matching Burst Count', 'Total Burst Count', 'MGRS Tiles', 'MGRS Tiles Count']], headers='keys', tablefmt='plain', showindex=False))
+            print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Covered RTC Native IDs', 'Covered RTC Burst IDs', 'Covered RTC Burst ID Count', 'Total RTC Burst IDs Count', 'MGRS Tiles', 'MGRS Tiles Count']], headers='keys', tablefmt='plain', showindex=False))
         else:
-            print(tabulate(df[['MGRS Set ID', 'Coverage Percentage', 'Total Burst Count', 'Matching Burst Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
+            print(tabulate(df[['MGRS Set ID', 'Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))

From fe3f14a519e2a91e518b24f340b64cc2cb9f216b Mon Sep 17 00:00:00 2001
From: Rishi Verma <riverma@apache.org>
Date: Wed, 17 Jul 2024 11:13:11 -0700
Subject: [PATCH 4/4] Reorganized output table to be more clear

---
 report/dswx-s1-validator/dswx_s1_validator.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/report/dswx-s1-validator/dswx_s1_validator.py b/report/dswx-s1-validator/dswx_s1_validator.py
index 3ddd4fb4..92faca9a 100644
--- a/report/dswx-s1-validator/dswx_s1_validator.py
+++ b/report/dswx-s1-validator/dswx_s1_validator.py
@@ -531,8 +531,9 @@ def count_missing(row):
         print()
         if len(validated_df) == 0:
             print(f"✅ Validation successful: All DSWx-S1 products available at CMR for corresponding matched input RTC bursts within sensing time range.")
+            print()
             if (args.verbose):
-                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Covered RTC Native IDs', 'Covered RTC Burst IDs', 'Covered RTC Burst ID Count', 'Total RTC Burst IDs Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed RTC Native IDs', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
+                print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'Unprocessed RTC Native IDs Count', 'Covered RTC Native IDs', 'Unprocessed RTC Native IDs', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
             else:
                 print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
         else:
@@ -540,12 +541,12 @@ def count_missing(row):
             print()
             print('Incomplete MGRS Set IDs:', len(validated_df))
             if (args.verbose):
-                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Covered RTC Native IDs', 'Covered RTC Burst IDs', 'Covered RTC Burst ID Count', 'Total RTC Burst IDs Count', 'MGRS Tiles', 'MGRS Tiles Count', 'Unprocessed RTC Native IDs', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
+                print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'Unprocessed RTC Native IDs Count', 'Covered RTC Native IDs', 'Unprocessed RTC Native IDs', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
             else:
                 print(tabulate(validated_df[['MGRS Set ID','Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'Unprocessed RTC Native IDs Count']], headers='keys', tablefmt='plain', showindex=False))
     else:
         print('MGRS Set IDs covered:', len(df))
         if (args.verbose):
-            print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Covered RTC Native IDs', 'Covered RTC Burst IDs', 'Covered RTC Burst ID Count', 'Total RTC Burst IDs Count', 'MGRS Tiles', 'MGRS Tiles Count']], headers='keys', tablefmt='plain', showindex=False))
+            print(tabulate(df[['MGRS Set ID','Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'Covered RTC Native IDs', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
         else:
-            print(tabulate(df[['MGRS Set ID', 'Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count', 'MGRS Tiles']], headers='keys', tablefmt='plain', showindex=False))
+            print(tabulate(df[['MGRS Set ID', 'Coverage Percentage', 'Total RTC Burst IDs Count', 'Covered RTC Burst ID Count']], headers='keys', tablefmt='plain', showindex=False))