From b59fc2fe90a9783fb310ac78cb0213383330232e Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Fri, 22 Nov 2024 12:15:31 -0500
Subject: [PATCH 1/2] Add a notebook for joining results and known objects

---
 notebooks/join_known_objects_example.ipynb | 166 +++++++++++++++++++++
 src/kbmod/filters/known_object_filters.py  |  30 ++--
 src/kbmod/results.py                       |  32 ++++
 tests/test_known_object_filters.py         |  32 +++-
 tests/test_results.py                      |  27 ++++
 5 files changed, 272 insertions(+), 15 deletions(-)
 create mode 100644 notebooks/join_known_objects_example.ipynb

diff --git a/notebooks/join_known_objects_example.ipynb b/notebooks/join_known_objects_example.ipynb
new file mode 100644
index 000000000..bd1170beb
--- /dev/null
+++ b/notebooks/join_known_objects_example.ipynb
@@ -0,0 +1,166 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "a89ad362-6ed6-489f-806c-2fd94fc9356d",
+   "metadata": {},
+   "source": [
+    "# Example Known Object Labeling\n",
+    "\n",
+    "This notebook serves as an example (and usable tool) for labeling objects in the results file as corresponding to a known object. It assumes the user has run KBMOD to produce a results .ecsv and has access to a table of known results.\n",
+    "\n",
+    "This notebook uses specific files and parameters from the DEEP reprocessing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0b1d47da-db6f-4530-93a2-9cb83a6af145",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from astropy.table import Table\n",
+    "import numpy as np\n",
+    "\n",
+    "from kbmod.filters.known_object_filters import KnownObjsMatcher\n",
+    "from kbmod.results import Results"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "43b6aaa7-9bf3-4ed7-8659-8daea4ed4765",
+   "metadata": {},
+   "source": [
+    "We start by loading the results data and known object data. The results data is the ecsv file produced by a KBMOD run and contains information on each trajectory found. The known object table is a given file with information on the location (RA, dec) of each observation at different time steps.\n",
+    "\n",
+    "We also extract the required metadata (global WCS and a list of all observation times) from the results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ce8dcad-fa91-4267-b2cf-7c787ac0d665",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# These two files are specific to the UW DEEP reprocessing runs and should be replaced\n",
+    "# by the user's files of interest.\n",
+    "res_file = \"/epyc/projects/kbmod/runs/DEEP/results/20190402_A0b_001.results.ecsv\"\n",
+    "known_file = \"/epyc/projects/kbmod/data/fakes_detections_joined.fits\"\n",
+    "\n",
+    "in_results = Results.read_table(res_file)\n",
+    "print(f\"Loaded a results table with {len(res_file)} entries and columns:\\n{in_results.colnames}\")\n",
+    "\n",
+    "wcs = in_results.wcs\n",
+    "if wcs is None:\n",
+    "    raise ValueError(\"WCS missing from results file.\")\n",
+    "\n",
+    "if \"mjd_mid\" in in_results.table.meta:\n",
+    "    obstimes = np.array(in_results.table.meta[\"mjd_mid\"])\n",
+    "else:\n",
+    "    raise ValueError(\"Metadata 'mjd_mid' missing from results file.\")\n",
+    "print(f\"Loaded {len(obstimes)} timestamps.\")\n",
+    "\n",
+    "known_table = Table.read(known_file)\n",
+    "print(f\"\\n\\nLoaded a known objects table with {len(known_table)} entries and columns:\\n{known_table.colnames}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c06cc1e2-2186-4418-a746-1bbe5e7bc971",
+   "metadata": {},
+   "source": [
+    "We use the `KnownObjsMatcher` to determine which of the found results correspond to previously known objects. `KnownObjsMatcher` provides the ability to match by either the number or ratio of observations that are in close proximity to the known object. Here we use a minimum number with reasonable proximity thresholds in space and time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19eeeb00-b7f2-4969-8737-185f9161b34a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "min_obs = 10\n",
+    "fake_matcher = KnownObjsMatcher(\n",
+    "    known_table,\n",
+    "    obstimes,\n",
+    "    matcher_name=\"known_matcher\",\n",
+    "    sep_thresh=2.0,  # Obs must be within 2 arcsecs.\n",
+    "    time_thresh_s=600.0,  # Obs must be within 10 minutes.\n",
+    "    name_col=\"ORBITID\",  # For the DEEP-data known objects only.\n",
+    ")\n",
+    "\n",
+    "# First create the matches column.\n",
+    "fake_matcher.match(in_results, wcs)\n",
+    "\n",
+    "# Second filter the matches.\n",
+    "fake_matcher.match_on_min_obs(in_results, min_obs)\n",
+    "\n",
+    "matched_col_name = fake_matcher.match_min_obs_col(min_obs)\n",
+    "print(f\"Matches stored in column '{matched_col_name}'\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2d8faadb-a496-4798-a0df-27712e4db8cd",
+   "metadata": {},
+   "source": [
+    "Iterate over the matched column computing a Boolean of whether there was any match (True if the match list is not empty). Add the resulting list as a new \"is_known\" column."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6490988-242e-4a0f-b355-fcb1804d118d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "is_known = ~in_results.is_empty_value(matched_col_name)\n",
+    "in_results.table[\"is_known\"] = is_known\n",
+    "matched_count = np.count_nonzero(is_known)\n",
+    "\n",
+    "print(f\"Found {matched_count} of the {len(in_results)} results matched known objects.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c4c79978-ed82-4261-abd4-14e745efb09a",
+   "metadata": {},
+   "source": [
+    "We could save the resulting joined table using:\n",
+    "```\n",
+    "in_results.write_table(output_filename)\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56aadd08-aaed-4de5-a9eb-77ecafef1e55",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Jeremy's KBMOD",
+   "language": "python",
+   "name": "kbmod_jk"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/kbmod/filters/known_object_filters.py b/src/kbmod/filters/known_object_filters.py
index 6c6cdc6ec..a3fa20f8a 100644
--- a/src/kbmod/filters/known_object_filters.py
+++ b/src/kbmod/filters/known_object_filters.py
@@ -22,9 +22,9 @@ class KnownObjsMatcher:
 
     In addition to modifying a KBMOD `Results` table to include columns for matched known objects,
     it also provides methods for filtering the results based on the matches. This includes
-    marking observations that matched to known objects as invalid, and filtering out results that matched to known objects by
-    either the minimum number of observations that matched to that known object or the proportion
-    of observations from the catalog for that known object that were matched to a given result.
+    marking observations that matched to known objects as invalid, and filtering out results that matched to
+    known objects by either the minimum number of observations that matched to that known object or the
+    proportion of observations from the catalog for that known object that were matched to a given result.
     """
 
     def __init__(
@@ -152,12 +152,13 @@ def to_skycoords(self):
     def match(self, result_data, wcs):
         """This function takes a list of results and matches them to known objects.
 
-        This modifies the `Results` table by adding a column with name `self.matcher_name` that provides for each result a dictionary mapping the names of known
-        objects (as defined by the catalog's `name_col`) to a boolean array indicating which observations
-        in the result matched to that known object. Note that depending on the matching parameters, a result
-        can match to multiple known objects from the catalog even at the same observation time.
+        This modifies the `Results` table by adding a column with name `self.matcher_name` that provides for each
+        result a dictionary mapping the names of known objects (as defined by the catalog's `name_col`) to a boolean
+        array indicating which observations in the result matched to that known object. Note that depending on the
+        matching parameters, a result can match to multiple known objects from the catalog even at the same observation time.
 
-        So for a dataset with 5 observations a result matching to 2 known objects, A and B, might have an entry in the column `self.matcher_name` like:
+        So for a dataset with 5 observations a result matching to 2 known objects, A and B, might have an entry in
+        the column `self.matcher_name` like:
         ```{
             "A": [True, True, False, False, False],
             "B": [False, False, False, True, True],
@@ -192,7 +193,8 @@ def match(self, result_data, wcs):
             # can use this to later map back to the original index of all observations in the stack.
             trj_idx_to_obs_idx = np.where(result_data[result_idx]["obs_valid"])[0]
 
-            # Now we can compare the SkyCoords of the known objects to the SkyCoords of the result trajectories using search_around_sky
+            # Now we can compare the SkyCoords of the known objects to the SkyCoords of the result trajectories
+            # using search_around_sky.
             # This will return a list of indices of known objects that are within sep_thresh of a trajectory
             # Note that subsequent calls by default will use the same underlying KD-Tree iin coords2.cache.
             trjs_idx, known_objs_idx, _, _ = search_around_sky(
@@ -297,6 +299,11 @@ def match_on_min_obs(
         `Results`
             The modified `Results` object returned for chaining.
         """
+        if self.matcher_name not in result_data.table.colnames:
+            raise ValueError(
+                f"Column {self.matcher_name} not found in results table. Please run match() first."
+            )
+
         matched_objs = []
         for idx in range(len(result_data)):
             matched_objs.append(set([]))
@@ -341,6 +348,11 @@ def match_on_obs_ratio(
         if obs_ratio < 0 or obs_ratio > 1:
             raise ValueError("obs_ratio must be within the range [0, 1].")
 
+        if self.matcher_name not in result_data.table.colnames:
+            raise ValueError(
+                f"Column {self.matcher_name} not found in results table. Please run match() first."
+            )
+
         # Create a dictionary of how many observations we have for each known object
         # in our catalog
         known_obj_cnts = dict(Counter(self.data[self.name_col]))
diff --git a/src/kbmod/results.py b/src/kbmod/results.py
index 9cd79c57f..15b6ebac6 100644
--- a/src/kbmod/results.py
+++ b/src/kbmod/results.py
@@ -480,6 +480,38 @@ def mask_based_on_invalid_obs(self, input_mat, mask_value):
             masked_mat[~self.table["obs_valid"]] = mask_value
         return masked_mat
 
+    def is_empty_value(self, colname):
+        """Create a Boolean vector indicating whether the entry in each row
+        is an 'empty' value (None or anything of length 0). Used to mark or
+        filter missing values.
+
+        Parameter
+        ---------
+        colname : str
+            The name of the column to check.
+
+        Returns
+        -------
+        result : `numpy.ndarray`
+            An array of Boolean values indicating whether the result is
+            one of the empty values.
+        """
+        if colname not in self.table.colnames:
+            raise KeyError(f"Querying unknown column {colname}")
+
+        # Skip numeric types (integers, floats, etc.)
+        result = np.full(len(self.table), False)
+        if np.issubdtype(self.table[colname].dtype, np.number):
+            return result
+
+        # Go through each entry and check whether it is None or something of length=0.
+        for idx, val in enumerate(self.table[colname]):
+            if val is None:
+                result[idx] = True
+            elif hasattr(val, "__len__") and len(val) == 0:
+                result[idx] = True
+        return result
+
     def filter_rows(self, rows, label=""):
         """Filter the rows in the `Results` to only include those indices
         that are provided in a list of row indices (integers) or marked
diff --git a/tests/test_known_object_filters.py b/tests/test_known_object_filters.py
index a6d2e7b11..c19ad8029 100644
--- a/tests/test_known_object_filters.py
+++ b/tests/test_known_object_filters.py
@@ -595,15 +595,23 @@ def test_match_min_obs(self):
 
     def test_empty_filter_matches(self):
         # Test that we can filter matches with an empty Results table
+        empty_res = Results()
         matcher = KnownObjsMatcher(
             self.known_objs,
             self.obstimes,
             self.matcher_name,
         )
-        # Adds a matching column to our empty table.
-        empty_res = matcher.match_on_obs_ratio(Results(), 0.5)
+
+        # No matcher_name column in the data.
+        with self.assertRaises(ValueError):
+            _ = matcher.match_on_obs_ratio(empty_res, 0.5)
+
+        # Do the match to add the columns.
+        matcher.match(empty_res, self.wcs)
+        empty_res = matcher.match_on_obs_ratio(empty_res, 0.5)
+
+        # Test an invalid matching column
         with self.assertRaises(ValueError):
-            # Test an inavlid matching column
             matcher.filter_matches(empty_res, "empty")
 
         empty_res = matcher.filter_matches(empty_res, matcher.match_obs_ratio_col(0.5))
@@ -611,17 +619,29 @@ def test_empty_filter_matches(self):
 
     def test_empty_get_recovered_objects(self):
         # Test that we can get recovered objects with an empty Results table
+        empty_res = Results()
         matcher = KnownObjsMatcher(
             self.known_objs,
             self.obstimes,
             self.matcher_name,
         )
-        # Adds a matching column to our empty table.
-        empty_res = matcher.match_on_min_obs(Results(), 5)
+
+        # No matcher_name column in the data.
+        with self.assertRaises(ValueError):
+            _ = matcher.match_on_min_obs(empty_res, 5)
+
+        # Do the match to add the columns.
+        matcher.match(empty_res, self.wcs)
+        empty_res = matcher.match_on_min_obs(empty_res, 5)
+
+        # Test an invalid matching column
         with self.assertRaises(ValueError):
-            # Test an inavlid matching column
             matcher.get_recovered_objects(empty_res, "empty")
 
         recovered, missed = matcher.get_recovered_objects(empty_res, matcher.match_min_obs_col(5))
         self.assertEqual(0, len(recovered))
         self.assertEqual(0, len(missed))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_results.py b/tests/test_results.py
index 84664e277..56bc73d73 100644
--- a/tests/test_results.py
+++ b/tests/test_results.py
@@ -251,6 +251,33 @@ def test_compute_likelihood_curves(self):
         )
         self.assertTrue(np.array_equal(np.isfinite(lh_mat3), expected))
 
+    def test_is_empty_value(self):
+        table = Results.from_trajectories(self.trj_list)
+
+        # Create a two new columns: one with integers and the other with meaningless
+        # index pairs (three of which are empty)
+        nums_col = [i for i in range(len(table))]
+        table.table["nums"] = nums_col
+
+        pairs_col = [(i, i + 1) for i in range(len(table))]
+        pairs_col[1] = None
+        pairs_col[3] = ()
+        pairs_col[7] = ()
+        table.table["pairs"] = pairs_col
+
+        expected = [False] * len(table)
+        expected[1] = True
+        expected[3] = True
+        expected[7] = True
+
+        # Check that we can tell which entries are empty.
+        nums_is_empty = table.is_empty_value("nums")
+        self.assertFalse(np.any(nums_is_empty))
+
+        pairs_is_empty = table.is_empty_value("pairs")
+        print(pairs_is_empty)
+        self.assertTrue(np.array_equal(pairs_is_empty, expected))
+
     def test_filter_by_index(self):
         table = Results.from_trajectories(self.trj_list)
         self.assertEqual(len(table), self.num_entries)

From dca67575a0bf38053b614403ce48dbb016998de7 Mon Sep 17 00:00:00 2001
From: Jeremy Kubica <104161096+jeremykubica@users.noreply.github.com>
Date: Fri, 22 Nov 2024 12:19:26 -0500
Subject: [PATCH 2/2] Fix linting error

---
 notebooks/join_known_objects_example.ipynb | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/notebooks/join_known_objects_example.ipynb b/notebooks/join_known_objects_example.ipynb
index bd1170beb..f73f639fb 100644
--- a/notebooks/join_known_objects_example.ipynb
+++ b/notebooks/join_known_objects_example.ipynb
@@ -62,7 +62,10 @@
     "print(f\"Loaded {len(obstimes)} timestamps.\")\n",
     "\n",
     "known_table = Table.read(known_file)\n",
-    "print(f\"\\n\\nLoaded a known objects table with {len(known_table)} entries and columns:\\n{known_table.colnames}\")"
+    "print(\n",
+    "    f\"\\n\\nLoaded a known objects table with {len(known_table)} entries \"\n",
+    "    f\"and columns:\\n{known_table.colnames}\"\n",
+    ")"
    ]
   },
   {