Skip to content

Commit

Permalink
update functions for filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
AnneONS committed Feb 3, 2025
1 parent c486c10 commit 67a33aa
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 46 deletions.
10 changes: 7 additions & 3 deletions src/imputation/MoR.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,15 @@ def filter_for_links(df: pd.DataFrame, is_current: bool) -> pd.DataFrame:
# Filter out imputation classes that are missing either "200" or "201"
nan_mask = df["imp_class"].str.contains("nan").apply(lambda x: not x)
# Select only clear, or equivalently, imp_marker R.
# Exclude PRN cells in the current period.
clear_mask = df["imp_marker"] == "R"
# Exclude instance 0
ins_mask = df["instance"] > 0
if is_current:
mask = (df["imp_marker"] == "R") & (df["selectiontype"] != "P") & nan_mask
# Exclude PRN cells in the current period.
prn_mask = df["selectiontype"] != "P"
mask = clear_mask & nan_mask & prn_mask & ins_mask
else:
mask = (df["imp_marker"] == "R") & nan_mask
mask = clear_mask & nan_mask & ins_mask

return df.loc[mask, :]

Expand Down
87 changes: 59 additions & 28 deletions src/imputation/imputation_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,44 +82,72 @@ def create_notnull_mask(df: pd.DataFrame, col: str) -> pd.Series:
return df[col].str.len() > 0


def create_mask(df: pd.DataFrame, options: List[str]) -> pd.Series:
    """Create a boolean row mask by AND-ing together the listed options.

    Options include:
        - 'clear_status': rows with status "Clear" or "Clear - overridden"
        - 'instance_zero': rows with instance == 0
        - 'instance_nonzero': rows with instance > 0 (null instances are excluded)
        - 'no_r_and_d': rows where column "604" is "No"
        - 'postcode_only': rows where "211" is null and "601" is not null
        - 'excl_postcode_only': the complement of 'postcode_only'
        - 'exclude_nan_classes': rows whose imp_class does not contain "nan"
          (rows with a null imp_class are kept)
        - 'prn_only': PRN rows, ie, rows with selectiontype = 'P'
        - 'census_only': Census rows, ie, rows with selectiontype = 'C'
        - 'longform_only': Longform rows, ie, rows with formtype = '0001'
        - 'shortform_only': Shortform rows, ie, rows with formtype = '0006'
        - 'bad_status': rows with status "Check needed" or "Form sent out"

    Each mask is only evaluated when its option is requested, so the dataframe
    only needs the columns used by the requested options. Option names that are
    not recognised do not affect the mask; a warning is emitted for each one so
    that misspelt options do not go unnoticed.

    Args:
        df (pd.DataFrame): The input dataframe. It is not modified.
        options (List[str]): List of options to create the mask.

    Returns:
        pd.Series: Boolean mask, indexed like df, that is True for rows
            satisfying every requested option. All True if options is empty.
    """
    import warnings

    def _postcode_only(frame: pd.DataFrame) -> pd.Series:
        return frame["211"].isnull() & frame["601"].notnull()

    # Builders are evaluated lazily so that unrelated columns are never touched.
    builders = {
        "clear_status": lambda d: d["status"].isin(["Clear", "Clear - overridden"]),
        "instance_zero": lambda d: d["instance"] == 0,
        "instance_nonzero": lambda d: d["instance"] > 0,
        "no_r_and_d": lambda d: d["604"] == "No",
        "postcode_only": _postcode_only,
        "excl_postcode_only": lambda d: ~_postcode_only(d),
        "exclude_nan_classes": lambda d: ~d["imp_class"].str.contains(
            "nan", na=False
        ),
        "prn_only": lambda d: d["selectiontype"] == "P",
        "census_only": lambda d: d["selectiontype"] == "C",
        "longform_only": lambda d: d["formtype"] == "0001",
        "shortform_only": lambda d: d["formtype"] == "0006",
        "bad_status": lambda d: d["status"].isin(["Check needed", "Form sent out"]),
    }

    # Start from "keep everything" and narrow down with each requested option.
    mask = pd.Series(True, index=df.index)

    for option in options:
        builder = builders.get(option)
        if builder is None:
            warnings.warn(
                f"create_mask: ignoring unknown option {option!r}",
                stacklevel=2,
            )
            continue
        mask &= builder(df)

    return mask


def special_filter(df: pd.DataFrame, options: List[str]) -> pd.DataFrame:
    """Filter the dataframe based on a list of options commonly used in the pipeline.

    The rows kept are those for which create_mask(df, options) is True.

    Args:
        df (pd.DataFrame): The input dataframe. It is not modified.
        options (List[str]): List of options to filter the dataframe.

    Returns:
        pd.DataFrame: A new dataframe containing only the rows of df that
            satisfy every option.
    """
    mask = create_mask(df, options)
    # Select first and copy afterwards: only the surviving rows are copied, and
    # the explicit copy gives callers an independent frame they can assign to.
    return df.loc[mask].copy()


def instance_fix(df: pd.DataFrame):
Expand Down Expand Up @@ -281,6 +309,9 @@ def create_r_and_d_instance(
# Ensure that in the case longforms with "no R&D" we only have one row
df, mult_604_qa_df = fix_604_error(df)

# In the case where there is "no R&D", we create a copy of instance 0
# and update to instance = 1. In this way we create an "instance 1" which we can
# populate with zeros for imputation purposes (see docstring above).
no_rd_mask = (df.formtype == "0001") & (df["604"] == "No")
filtered_df = df.copy().loc[no_rd_mask]
filtered_df["instance"] = 1
Expand Down
19 changes: 4 additions & 15 deletions src/imputation/tmi_imputation.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,8 @@ def create_mean_dict(
# Create an empty dict to store means
mean_dict = dict.fromkeys(target_variable_list)

# Filter for clear statuses
clear_statuses = ["Clear", "Clear - overridden"]

filtered_df = df.loc[df["status"].isin(clear_statuses)]

# Filter out imputation classes that are missing either "200" or "201"
filtered_df = filtered_df[~(filtered_df["imp_class"].str.contains("nan"))]
filter_conditions_list = ["clear_status", "instance_nonzero", "exclude_nan_classes"]
filtered_df = hlp.special_filter(df, filter_conditions_list)

# Group by imp_class
grp = filtered_df.groupby("imp_class")
Expand Down Expand Up @@ -264,14 +259,8 @@ def apply_tmi(
Returns:
pd.DataFrame: The passed dataframe with TMI imputation applied.
"""
df = df.copy()

filtered_df = df.loc[df["status"].isin(["Form sent out", "Check needed"])]

# Filter out any cases where 200 or 201 are missing from the imputation class
# This ensures that means are calculated using only valid imputation classes
# Since imp_class is string type, any entry containing "nan" is excluded.
filtered_df = filtered_df[~(filtered_df["imp_class"].str.contains("nan"))]
conditions_mask_list = ["bad_status", "instance_nonzero", "exclude_nan_classes"]
filtered_df = hlp.special_filter(df, conditions_mask_list)

grp = filtered_df.groupby("imp_class")
class_keys = list(grp.groups.keys())
Expand Down
148 changes: 148 additions & 0 deletions tests/test_imputation/test_imputation_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
create_imp_class_col,
imputation_marker,
concat_with_bool,
create_mask,
special_filter,
)


Expand Down Expand Up @@ -508,3 +510,149 @@ def test_concat_with_bool(self):
result_df = concat_with_bool([df1, df2, df3])
# ignore the order of the columns
assert_frame_equal(result_df.reset_index(drop=True), expected_df, check_like=True)


class TestCreateMask:
    """Unit tests for create_mask function."""

    def create_input_df(self):
        """Create an input dataframe for the test."""
        input_cols = [
            "reference",
            "instance",
            "imp_class",
            "211",
            "601",
            "604",
            "status",
            "formtype",
            "selectiontype",
        ]

        data = [
            [111, 0, "nan_A", np.nan, None, "Yes", "Clear", "0001", "C"],
            [111, 1, "C_A", 1, None, None, "Clear - overridden", "0001", "C"],
            [222, 0, "nan_A", np.nan, None, None, "Clear", "0001", "C"],
            [222, 1, "C_A", 1, "CB1 2NF", "No", "Clear", "0001", "C"],
            [222, 2, "C_A", np.nan, "BA1 5DA", "No", "Clear", "0001", "C"],
            [333, np.nan, "nan_A", np.nan, None, "No", "Form sent out", "0006", "P"],
        ]

        input_df = pd.DataFrame(data=data, columns=input_cols)
        return input_df

    def _check(self, options, expected):
        """Assert that create_mask(input_df, options) equals the expected list."""
        result_mask = create_mask(self.create_input_df(), options)
        assert_series_equal(result_mask, pd.Series(expected))

    def test_clear_status(self):
        """Only the 'Form sent out' row is excluded."""
        self._check(["clear_status"], [True, True, True, True, True, False])

    def test_bad_status(self):
        """Only the 'Form sent out' row is selected."""
        self._check(["bad_status"], [False, False, False, False, False, True])

    def test_instance_zero(self):
        """Rows with instance 0 are selected; a null instance is not."""
        self._check(["instance_zero"], [True, False, True, False, False, False])

    def test_instance_nonzero(self):
        """Rows with instance > 0 are selected; a null instance is not."""
        self._check(["instance_nonzero"], [False, True, False, True, True, False])

    def test_no_r_and_d(self):
        """Rows where 604 is 'No' are selected."""
        self._check(["no_r_and_d"], [False, False, False, True, True, True])

    def test_postcode_only(self):
        """Only the row with a postcode (601) and no 211 value is selected."""
        self._check(["postcode_only"], [False, False, False, False, True, False])

    def test_excl_postcode_only(self):
        """Every row except the postcode-only row is selected."""
        self._check(["excl_postcode_only"], [True, True, True, True, False, True])

    def test_clear_instance_zero(self):
        """Options are combined with a logical AND."""
        self._check(
            ["clear_status", "instance_zero"],
            [True, False, True, False, False, False],
        )

    def test_exclude_nan_classes(self):
        """Rows whose imp_class contains 'nan' are excluded."""
        self._check(
            ["exclude_nan_classes"], [False, True, False, True, True, False]
        )

    def test_clear_instance_nonzero(self):
        """Clear rows with instance > 0 are selected."""
        self._check(
            ["clear_status", "instance_nonzero"],
            [False, True, False, True, True, False],
        )

    def test_clear_instance_nonzero_exclude_nan_classes(self):
        """Clear rows with instance > 0 and a valid imp_class are selected."""
        self._check(
            ["clear_status", "instance_nonzero", "exclude_nan_classes"],
            [False, True, False, True, True, False],
        )

    def test_clear_longform_instance_nonzero(self):
        """Clear long-form rows with instance > 0 are selected."""
        # "longform_only" is the option name create_mask recognises; the
        # previous "longform" was not a valid option and was silently ignored.
        self._check(
            ["clear_status", "instance_nonzero", "longform_only"],
            [False, True, False, True, True, False],
        )


class TestSpecialFilter:
"""Tests for the SpecialFilter function."""
def create_input_df(self):
"""Create an input dataframe for the test."""
input_cols = [
"reference",
"instance",
"imp_class",
"211",
"601",
"604",
"status",
"formtype",
"selectiontype",
]

data = [
[111, 0, "nan_A", np.nan, None, "Yes", "Clear", "0001", "C"],
[111, 1, "C_A", 1, None, None, "Clear - overridden", "0001", "C"],
[222, 0, "nan_A", np.nan, None, None, "Clear", "0001", "C"],
[222, 1, "C_A", 1, "CB1 2NF", "No", "Clear", "0001", "C"],
[222, 2, "C_A", np.nan, "BA1 5DA", "No", "Clear", "0001", "C"],
[333, np.nan, "nan_A", np.nan, None, "No", "Form sent out", "0006", "P"],
]

input_df = pd.DataFrame(data=data, columns=input_cols)
return input_df

def test_special_filter_create_mean_case(self):
filter_conditions_list = ["clear_status", "instance_nonzero", "exclude_nan_classes"]

0 comments on commit 67a33aa

Please sign in to comment.