update functions for filtering

ONSdigital · Feb 3, 2025 · 67a33aa · 67a33aa
1 parent c486c10
commit 67a33aa
Show file tree

Hide file tree

Showing 4 changed files with 218 additions and 46 deletions.
diff --git a/src/imputation/MoR.py b/src/imputation/MoR.py
@@ -176,11 +176,15 @@ def filter_for_links(df: pd.DataFrame, is_current: bool) -> pd.DataFrame:
     # Filter out imputation classes that are missing either "200" or "201"
     nan_mask = df["imp_class"].str.contains("nan").apply(lambda x: not x)
     # Select only clear, or equivalently, imp_marker R.
-    # Exclude PRN cells in the current period.
+    clear_mask = df["imp_marker"] == "R"
+    # Exclude instance 0
+    ins_mask = df["instance"] > 0
     if is_current:
-        mask = (df["imp_marker"] == "R") & (df["selectiontype"] != "P") & nan_mask
+        # Exclude PRN cells in the current period.
+        prn_mask = df["selectiontype"] != "P"
+        mask = clear_mask & nan_mask & prn_mask & ins_mask
     else:
-        mask = (df["imp_marker"] == "R") & nan_mask
+        mask = clear_mask & nan_mask & ins_mask
 
     return df.loc[mask, :]
 

diff --git a/src/imputation/imputation_helpers.py b/src/imputation/imputation_helpers.py
@@ -82,44 +82,72 @@ def create_notnull_mask(df: pd.DataFrame, col: str) -> pd.Series:
     return df[col].str.len() > 0
 
 
-def create_mask(df: pd.DataFrame, options: List) -> pd.Series:
-    """Create a dataframe mask based on listed options - retrun Bool column.
+def create_mask(df: pd.DataFrame, options: List[str]) -> pd.Series:
+    """Create a dataframe mask based on listed options - return Bool column.
 
     Options include:
         - 'clear_status': rows with one of the clear statuses
         - 'instance_zero': rows with instance = 0
         - 'instance_nonzero': rows with instance != 0
-        - 'no_r_and_d' : rows where q604 = 'No'
+        - 'no_r_and_d': rows where q604 = 'No'
         - 'postcode_only': rows in which there are no numeric values, only postcodes.
-    """
-    clear_mask = df["status"].isin(["Clear", "Clear - overridden"])
-    instance_mask = df.instance == 0
-    no_r_and_d_mask = df["604"] == "No"
-    postcode_only_mask = df["211"].isnull() & ~df["601"].isnull()
-
-    # Set initial values for the mask series as a column in the dataframe
-    df = df.copy()
-    df["mask_col"] = False
-
-    if "clear_status" in options:
-        df["mask_col"] = df["mask_col"] & clear_mask
+        - 'excl_postcode_only': rows excluding those with only postcodes.
+        - 'exclude_nan_classes': rows excluding those with "nan" in the imp_class col.
+        - 'prn_only': PRN rows, i.e. rows with selectiontype = 'P'
+        - 'census_only': Census rows, i.e. rows with selectiontype = 'C'
+        - 'longform_only': Longform rows, i.e. rows with formtype = '0001'
+        - 'shortform_only': Shortform rows, i.e. rows with formtype = '0006'
+        - 'bad_status': rows with status 'Check needed' or 'Form sent out'
 
-    if "instance_zero" in options:
-        df["mask_col"] = df["mask_col"] & instance_mask
-
-    elif "instance_nonzero" in options:
-        df["mask_col"] = df["mask_col"] & ~instance_mask
-
-    if "no_r_and_d" in options:
-        df["mask_col"] = df["mask_col"] & no_r_and_d_mask
+    Args:
+        df (pd.DataFrame): The input dataframe.
+        options (List[str]): Options to combine with AND; unknown names are ignored.
 
-    if "postcode_only" in options:
-        df["mask_col"] = df["mask_col"] & postcode_only_mask
+    Returns:
+        pd.Series: Boolean mask based on the options.
+    """
+    df = df.copy()  # Ensure the original DataFrame is not modified
+
+    # Define masks for each option
+    masks = {
+        "clear_status": df["status"].isin(["Clear", "Clear - overridden"]),
+        "instance_zero": df.instance == 0,
+        "instance_nonzero": df.instance > 0,
+        "no_r_and_d": df["604"] == "No",
+        "postcode_only": df["211"].isnull() & df["601"].notnull(),
+        "excl_postcode_only": ~(df["211"].isnull() & df["601"].notnull()),
+        "exclude_nan_classes": ~df["imp_class"].str.contains("nan", na=False),
+        "prn_only": df["selectiontype"] == "P",
+        "census_only": df["selectiontype"] == "C",
+        "longform_only": df["formtype"] == "0001",
+        "shortform_only": df["formtype"] == "0006",
+        "bad_status": df["status"].isin(["Check needed", "Form sent out"]),
+    }
+
+    # Initialize the mask to True
+    mask = pd.Series(True, index=df.index)
+
+    # Apply the masks based on the options
+    for option in options:
+        if option in masks:
+            mask &= masks[option]
+
+    return mask
+
+
+def special_filter(df: pd.DataFrame, options: List[str]) -> pd.DataFrame:
+    """Filter the dataframe based on a list of options commonly used in the pipeline.
 
-    if "excl_postcode_only" in options:
-        df["mask_col"] = df["mask_col"] & ~postcode_only_mask
+    Args:
+        df (pd.DataFrame): The input dataframe.
+        options (List[str]): List of options to filter the dataframe.
 
-    return df["mask_col"]
+    Returns:
+        pd.DataFrame: The filtered dataframe.
+    """
+    mask = create_mask(df, options)
+    df = df.copy().loc[mask]
+    return df
 
 
 def instance_fix(df: pd.DataFrame):
@@ -281,6 +309,9 @@ def create_r_and_d_instance(
     # Ensure that in the case longforms with "no R&D" we only have one row
     df, mult_604_qa_df = fix_604_error(df)
 
+    # In the case where there is "no R&D", we create a copy of instance 0
+    # and update to instance = 1. In this way we create an "instance 1" which we can
+    # populate with zeros for imputation purposes (see docstring above).
     no_rd_mask = (df.formtype == "0001") & (df["604"] == "No")
     filtered_df = df.copy().loc[no_rd_mask]
     filtered_df["instance"] = 1

diff --git a/src/imputation/tmi_imputation.py b/src/imputation/tmi_imputation.py
@@ -199,13 +199,8 @@ def create_mean_dict(
     # Create an empty dict to store means
     mean_dict = dict.fromkeys(target_variable_list)
 
-    # Filter for clear statuses
-    clear_statuses = ["Clear", "Clear - overridden"]
-
-    filtered_df = df.loc[df["status"].isin(clear_statuses)]
-
-    # Filter out imputation classes that are missing either "200" or "201"
-    filtered_df = filtered_df[~(filtered_df["imp_class"].str.contains("nan"))]
+    filter_conditions_list = ["clear_status", "instance_nonzero", "exclude_nan_classes"]
+    filtered_df = hlp.special_filter(df, filter_conditions_list)
 
     # Group by imp_class
     grp = filtered_df.groupby("imp_class")
@@ -264,14 +259,8 @@ def apply_tmi(
     Returns:
         pd.DataFrame: The passed dataframe with TMI imputation applied.
     """
-    df = df.copy()
-
-    filtered_df = df.loc[df["status"].isin(["Form sent out", "Check needed"])]
-
-    # Filter out any cases where 200 or 201 are missing from the imputation class
-    # This ensures that means are calculated using only valid imputation classes
-    # Since imp_class is string type, any entry containing "nan" is excluded.
-    filtered_df = filtered_df[~(filtered_df["imp_class"].str.contains("nan"))]
+    conditions_mask_list = ["bad_status", "instance_nonzero", "exclude_nan_classes"]
+    filtered_df = hlp.special_filter(df, conditions_mask_list)
 
     grp = filtered_df.groupby("imp_class")
     class_keys = list(grp.groups.keys())

diff --git a/tests/test_imputation/test_imputation_helpers.py b/tests/test_imputation/test_imputation_helpers.py
@@ -12,6 +12,8 @@
     create_imp_class_col,
     imputation_marker,
     concat_with_bool,
+    create_mask,
+    special_filter,
 )
 
 
@@ -508,3 +510,149 @@ def test_concat_with_bool(self):
         result_df = concat_with_bool([df1, df2, df3])
         # ignore the order of the columns
         assert_frame_equal(result_df.reset_index(drop=True), expected_df, check_like=True)
+
+
+class TestCreateMask:
+    """Unit tests for create_mask function."""
+
+    def create_input_df(self):
+        """Create an input dataframe for the test."""
+        input_cols = [
+            "reference",
+            "instance",
+            "imp_class",
+            "211",
+            "601",
+            "604",
+            "status",
+            "formtype",
+            "selectiontype",
+        ]
+
+        data = [
+            [111, 0, "nan_A", np.nan, None, "Yes", "Clear", "0001", "C"],
+            [111, 1, "C_A", 1, None, None, "Clear - overridden", "0001", "C"],
+            [222, 0, "nan_A", np.nan, None, None, "Clear", "0001", "C"],
+            [222, 1, "C_A", 1, "CB1 2NF", "No", "Clear", "0001", "C"],
+            [222, 2, "C_A", np.nan, "BA1 5DA", "No", "Clear", "0001", "C"],
+            [333, np.nan, "nan_A", np.nan, None, "No", "Form sent out", "0006", "P"],
+        ]
+
+        input_df = pd.DataFrame(data=data, columns=input_cols)
+        return input_df
+
+    def test_clear_status(self):
+        df = self.create_input_df()
+        options = ["clear_status"]
+        expected_mask = pd.Series([True, True, True, True, True, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_bad_status(self):
+        df = self.create_input_df()
+        options = ["bad_status"]
+        expected_mask = pd.Series([False, False, False, False, False, True])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_instance_zero(self):
+        df = self.create_input_df()
+        options = ["instance_zero"]
+        expected_mask = pd.Series([True, False, True, False, False, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_instance_nonzero(self):
+        df = self.create_input_df()
+        options = ["instance_nonzero"]
+        expected_mask = pd.Series([False, True, False, True, True, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_no_r_and_d(self):
+        df = self.create_input_df()
+        options = ["no_r_and_d"]
+        expected_mask = pd.Series([False, False, False, True, True, True])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_postcode_only(self):
+        df = self.create_input_df()
+        options = ["postcode_only"]
+        expected_mask = pd.Series([False, False, False, False, True, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_excl_postcode_only(self):
+        df = self.create_input_df()
+        options = ["excl_postcode_only"]
+        expected_mask = pd.Series([True, True, True, True, False, True])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_clear_instance_zero(self):
+        df = self.create_input_df()
+        options = ["clear_status", "instance_zero"]
+        expected_mask = pd.Series([True, False, True, False, False, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_exclude_nan_classes(self):
+        df = self.create_input_df()
+        options = ["exclude_nan_classes"]
+        expected_mask = pd.Series([False, True, False, True, True, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_clear_instance_nonzero(self):
+        df = self.create_input_df()
+        options = ["clear_status", "instance_nonzero"]
+        expected_mask = pd.Series([False, True, False, True, True, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_clear_instance_nonzero_exclude_nan_classes(self):
+        df = self.create_input_df()
+        options = ["clear_status", "instance_nonzero", "exclude_nan_classes"]
+        expected_mask = pd.Series([False, True, False, True, True, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+    def test_clear_longfom_instance_nonzero(self):
+        df = self.create_input_df()
+        options = ["clear_status", "instance_nonzero", "longform"]
+        expected_mask = pd.Series([False, True, False, True, True, False])
+        result_mask = create_mask(df, options)
+        assert_series_equal(result_mask, expected_mask)
+
+
+class TestSpecialFilter:
+    """Unit tests for the special_filter function."""
+    def create_input_df(self):
+        """Create an input dataframe for the test."""
+        input_cols = [
+            "reference",
+            "instance",
+            "imp_class",
+            "211",
+            "601",
+            "604",
+            "status",
+            "formtype",
+            "selectiontype",
+        ]
+
+        data = [
+            [111, 0, "nan_A", np.nan, None, "Yes", "Clear", "0001", "C"],
+            [111, 1, "C_A", 1, None, None, "Clear - overridden", "0001", "C"],
+            [222, 0, "nan_A", np.nan, None, None, "Clear", "0001", "C"],
+            [222, 1, "C_A", 1, "CB1 2NF", "No", "Clear", "0001", "C"],
+            [222, 2, "C_A", np.nan, "BA1 5DA", "No", "Clear", "0001", "C"],
+            [333, np.nan, "nan_A", np.nan, None, "No", "Form sent out", "0006", "P"],
+        ]
+
+        input_df = pd.DataFrame(data=data, columns=input_cols)
+        return input_df
+
+    def test_special_filter_create_mean_case(self):
+        filter_conditions_list = ["clear_status", "instance_nonzero", "exclude_nan_classes"]