Commit 1d00bac
Merge branch 'empty_selector'.
riga committed Dec 13, 2024
2 parents c74a9b2 + c7156d0
Showing 5 changed files with 201 additions and 55 deletions.
2 changes: 1 addition & 1 deletion hbt/config/configs_hbt.py
@@ -836,7 +836,7 @@ def if_era(
     # TODO: what? what about run 2? really pnet??
     from columnflow.production.cms.btag import BTagSFConfig
     cfg.x.btag_sf = BTagSFConfig(
-        correction_set="particleNet_shape",
+        correction_set="particleNet_shape",  # TODO:
         jec_sources=cfg.x.btag_sf_jec_sources,
         discriminator="btagPNetB",
     )
8 changes: 2 additions & 6 deletions hbt/production/default.py
@@ -11,7 +11,6 @@
 from columnflow.production.cms.muon import muon_weights
 from columnflow.util import maybe_import
 
-from hbt.production.features import features
 from hbt.production.weights import (
     normalized_pu_weight, normalized_pdf_weight, normalized_murmuf_weight,
 )
@@ -24,12 +23,12 @@
 
 @producer(
     uses={
-        category_ids, features, stitched_normalization_weights, normalized_pu_weight,
+        category_ids, stitched_normalization_weights, normalized_pu_weight,
         normalized_btag_weights, tau_weights, electron_weights, muon_weights, trigger_weights,
         IF_DATASET_HAS_LHE_WEIGHTS(normalized_pdf_weight, normalized_murmuf_weight),
     },
     produces={
-        category_ids, features, stitched_normalization_weights, normalized_pu_weight,
+        category_ids, stitched_normalization_weights, normalized_pu_weight,
         normalized_btag_weights, tau_weights, electron_weights, muon_weights, trigger_weights,
         IF_DATASET_HAS_LHE_WEIGHTS(normalized_pdf_weight, normalized_murmuf_weight),
     },
@@ -38,9 +37,6 @@ def default(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
     # category ids
     events = self[category_ids](events, **kwargs)
 
-    # features
-    events = self[features](events, **kwargs)
-
     # mc-only weights
     if self.dataset_inst.is_mc:
         # normalization weights
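
With features removed from the default producer above, those columns are no longer created as part of this chain. If both are still wanted in one pass, a thin wrapper producer could restore the old chaining; a minimal sketch using only the decorator pattern visible in this file (the wrapper name is an assumption, not part of the commit):

from columnflow.production import Producer, producer
from columnflow.util import maybe_import

from hbt.production.default import default
from hbt.production.features import features

ak = maybe_import("awkward")


@producer(uses={default, features}, produces={default, features})
def default_with_features(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
    # run the default producer first, then attach the feature columns
    events = self[default](events, **kwargs)
    events = self[features](events, **kwargs)
    return events
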
7 changes: 4 additions & 3 deletions hbt/production/weights.py
@@ -24,7 +24,8 @@
     mc_only=True,
 )
 def normalized_pu_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
-    for weight_name in self[pu_weight].produces:
+    for route in self[pu_weight].produces:
+        weight_name = str(route)
         if not weight_name.startswith("pu_weight"):
             continue
 
@@ -50,7 +51,7 @@ def normalized_pu_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array
 def normalized_pu_weight_init(self: Producer) -> None:
     self.produces |= {
         f"normalized_{weight_name}"
-        for weight_name in self[pu_weight].produces
+        for weight_name in (str(route) for route in self[pu_weight].produced_columns)
         if weight_name.startswith("pu_weight")
     }
 
@@ -98,7 +99,7 @@ def denominator_per_pid(weight_name, pid):
             pid: safe_div(numerator_per_pid(pid), denominator_per_pid(weight_name, pid))
             for pid in self.unique_process_ids
         }
-        for weight_name in self[pu_weight].produces
+        for weight_name in (str(route) for route in self[pu_weight].produced_columns)
         if weight_name.startswith("pu_weight")
     }
 
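
The change above iterates over produced_columns, whose items are columnflow Route objects rather than plain strings, so the prefix checks need an explicit str(route) conversion. A standalone sketch of the pattern (the Route class below is a stand-in that only mimics the relevant stringification behavior, and the column names are illustrative):

class Route:
    def __init__(self, column: str) -> None:
        self.column = column

    def __str__(self) -> str:
        return self.column


# a set of routes as produced_columns would yield them
produced_columns = {Route("pu_weight"), Route("pu_weight_minbias_xs_up"), Route("mc_weight")}

# prefix matching requires the string form, not the Route object itself
pu_weight_names = sorted(
    str(route)
    for route in produced_columns
    if str(route).startswith("pu_weight")
)
print(pu_weight_names)  # ['pu_weight', 'pu_weight_minbias_xs_up']
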
237 changes: 193 additions & 44 deletions hbt/selection/default.py
@@ -110,11 +110,20 @@ def default(
     # btag weights
     events = self[btag_weights](
         events,
-        ak.fill_none(results.x.jet_mask, False, axis=-1),
+        jet_mask=ak.fill_none(results.x.jet_mask, False, axis=-1),
         negative_b_score_log_mode="none",
         **kwargs,
     )
 
+    # create process ids
+    if self.process_ids_dy is not None:
+        events = self[self.process_ids_dy](events, **kwargs)
+    else:
+        events = self[process_ids](events, **kwargs)
+
+    # some cutflow features
+    events = self[cutflow_features](events, results.objects, **kwargs)
+
     # combined event selection after all steps
     event_sel = reduce(and_, results.steps.values())
     results.event = event_sel
@@ -125,45 +134,210 @@ def default(
         [mask for step_name, mask in results.steps.items() if step_name != "bjet"],
     )
 
+    # increment stats
+    events, results = setup_and_increment_stats(
+        self,
+        events=events,
+        results=results,
+        stats=stats,
+        event_sel=event_sel,
+        event_sel_nob=event_sel_nob,
+        njets=results.x.n_central_jets,
+    )
+
+    return events, results
+
+
+@default.init
+def default_init(self: Selector) -> None:
+    if getattr(self, "dataset_inst", None) is None:
+        return
+
+    self.process_ids_dy: process_ids_dy | None = None
+    if self.dataset_inst.has_tag("is_dy"):
+        # check if this dataset is covered by any dy id producer
+        for name, dy_cfg in self.config_inst.x.dy_stitching.items():
+            dataset_inst = dy_cfg["inclusive_dataset"]
+            # the dataset is "covered" if its process is a subprocess of that of the dy dataset
+            if dataset_inst.has_process(self.dataset_inst.processes.get_first()):
+                self.process_ids_dy = process_ids_dy.derive(f"process_ids_dy_{name}", cls_dict={
+                    "dy_inclusive_dataset": dataset_inst,
+                    "dy_leaf_processes": dy_cfg["leaf_processes"],
+                })
+
+                # add it as a dependency
+                self.uses.add(self.process_ids_dy)
+                self.produces.add(self.process_ids_dy)
+
+                # stop after the first match
+                break
+
+
+empty = default.derive("empty", cls_dict={})
+
+
+@empty.init
+def empty_init(self: Selector) -> None:
+    super(empty, self).init_func()
+
+    # remove unused dependencies
+    unused = {
+        json_filter,
+        met_filters,
+        cutflow_features,
+        patch_ecalBadCalibFilter,
+        jet_selection,
+        lepton_selection,
+        trigger_selection,
+    }
+    self.uses -= unused
+    self.produces -= unused
+
+    # add custom columns
+    self.uses.add("Jet.phi")  # needed by vector behavior for accessing pt in btag_weights
+    self.produces |= {"channel_id", "leptons_os", "tau2_isolated"}
+
+
+@empty.call
+def empty_call(
+    self: Selector,
+    events: ak.Array,
+    stats: defaultdict,
+    **kwargs,
+) -> tuple[ak.Array, SelectionResult]:
+    """
+    An empty selection that performs no selection steps, but only invokes the producers needed to
+    create columns that are required downstream, e.g. by ProduceColumns with our "default" producer.
+    """
+    from columnflow.columnar_util import set_ak_column
+
+    # ensure coffea behavior
+    events = self[attach_coffea_behavior](events, **kwargs)
+
+    # prepare the selection results that are updated at every step
+    results = SelectionResult()
+
+    # mc-only functions
+    if self.dataset_inst.is_mc:
+        events = self[mc_weight](events, **kwargs)
+
+        # pdf weights
+        if self.has_dep(pdf_weights):
+            events = self[pdf_weights](events, **kwargs)
+
+        # renormalization/factorization scale weights
+        if self.has_dep(murmuf_weights):
+            events = self[murmuf_weights](events, **kwargs)
+
+        # pileup weights
+        events = self[pu_weight](events, **kwargs)
+
+        # btag weights
+        events = self[btag_weights](
+            events,
+            jet_mask=abs(events.Jet["eta"]) < 2.5,
+            negative_b_score_log_mode="none",
+            **kwargs,
+        )
+
     # create process ids
     if self.process_ids_dy is not None:
         events = self[self.process_ids_dy](events, **kwargs)
     else:
         events = self[process_ids](events, **kwargs)
 
-    # some cutflow features
-    events = self[cutflow_features](events, results.objects, **kwargs)
+    # fake lepton selection results
+    events = set_ak_column(events, "channel_id", np.zeros(len(events), dtype=np.uint8))
+    events = set_ak_column(events, "leptons_os", np.zeros(len(events), dtype=bool))
+    events = set_ak_column(events, "tau2_isolated", np.zeros(len(events), dtype=bool))
+
+    # trivial selection mask capturing all events
+    results.event = np.ones(len(events), dtype=bool)
+
+    # increment stats
+    events, results = setup_and_increment_stats(
+        self,
+        events=events,
+        results=results,
+        stats=stats,
+        event_sel=results.event,
+        event_sel_nob=results.event,
+        njets=ak.num(events.Jet, axis=1),
+    )
+
+    return events, results
+
+
+def setup_and_increment_stats(
+    self: Selector,
+    *,
+    events: ak.Array,
+    results: SelectionResult,
+    stats: defaultdict,
+    event_sel: np.ndarray | ak.Array,
+    event_sel_nob: np.ndarray | ak.Array | None = None,
+    njets: np.ndarray | ak.Array | None = None,
+    **kwargs,
+) -> tuple[ak.Array, SelectionResult]:
+    """
+    Helper function that sets up the weight and group maps for the increment_stats task, invokes
+    it, and returns the updated events and results objects.
+
+    :param self: The selector instance.
+    :param events: The events array.
+    :param results: The current selection results.
+    :param stats: The stats dictionary.
+    :param event_sel: The general event selection mask.
+    :param event_sel_nob: The event selection mask without the bjet step.
+    :param njets: The number of central jets.
+    :return: The updated events and results objects in a tuple.
+    """
+    # start creating a weight, group and group combination map
     weight_map = {
         "num_events": Ellipsis,
         "num_events_selected": event_sel,
-        "num_events_selected_nobjet": event_sel_nob,
     }
+    if event_sel_nob is not None:
+        weight_map["num_events_selected_nobjet"] = event_sel_nob
     group_map = {}
     group_combinations = []
 
     # add mc info
     if self.dataset_inst.is_mc:
         weight_map["sum_mc_weight"] = events.mc_weight
         weight_map["sum_mc_weight_selected"] = (events.mc_weight, event_sel)
-        weight_map["sum_mc_weight_selected_nobjet"] = (events.mc_weight, event_sel_nob)
+        if event_sel_nob is not None:
+            weight_map["sum_mc_weight_selected_nobjet"] = (events.mc_weight, event_sel_nob)
 
         # pu weights with variations
-        for name in sorted(self[pu_weight].produces):
+        for route in sorted(self[pu_weight].produced_columns):
+            name = str(route)
             weight_map[f"sum_mc_weight_{name}"] = (events.mc_weight * events[name], Ellipsis)
 
-        # pdf and murmuf weights with variations
-        if not self.dataset_inst.has_tag("no_lhe_weights"):
+        # pdf weights with variations
+        if self.has_dep(pdf_weights):
             for v in ["", "_up", "_down"]:
                 weight_map[f"sum_pdf_weight{v}"] = events[f"pdf_weight{v}"]
                 weight_map[f"sum_pdf_weight{v}_selected"] = (events[f"pdf_weight{v}"], event_sel)
 
+        # mur/muf weights with variations
+        if self.has_dep(murmuf_weights):
             for v in ["", "_up", "_down"]:
                 weight_map[f"sum_murmuf_weight{v}"] = events[f"murmuf_weight{v}"]
                 weight_map[f"sum_murmuf_weight{v}_selected"] = (events[f"murmuf_weight{v}"], event_sel)
 
         # btag weights
-        for name in sorted(self[btag_weights].produces):
+        for route in sorted(self[btag_weights].produced_columns):
+            name = str(route)
             if not name.startswith("btag_weight"):
                 continue
             weight_map[f"sum_{name}"] = events[name]
             weight_map[f"sum_{name}_selected"] = (events[name], event_sel)
-            weight_map[f"sum_{name}_selected_nobjet"] = (events[name], event_sel_nob)
-            weight_map[f"sum_mc_weight_{name}_selected_nobjet"] = (events.mc_weight * events[name], event_sel_nob)
+            if event_sel_nob is not None:
+                weight_map[f"sum_{name}_selected_nobjet"] = (events[name], event_sel_nob)
+                weight_map[f"sum_mc_weight_{name}_selected_nobjet"] = (events.mc_weight * events[name], event_sel_nob)
 
     # groups
     group_map = {
         **group_map,
@@ -172,16 +346,18 @@ def default(
             "values": events.process_id,
             "mask_fn": (lambda v: events.process_id == v),
         },
-        # per jet multiplicity
-        "njet": {
-            "values": results.x.n_central_jets,
-            "mask_fn": (lambda v: results.x.n_central_jets == v),
-        },
     }
+    # per jet multiplicity
+    if njets is not None:
+        group_map["njet"] = {
+            "values": njets,
+            "mask_fn": (lambda v: njets == v),
+        }
 
     # combinations
     group_combinations.append(("process", "njet"))
 
-    events, results = self[increment_stats](
+    return self[increment_stats](
         events,
         results,
         stats,
@@ -190,30 +366,3 @@ def default(
         group_combinations=group_combinations,
         **kwargs,
     )
-
-    return events, results
-
-
-@default.init
-def default_init(self: Selector) -> None:
-    if getattr(self, "dataset_inst", None) is None:
-        return
-
-    self.process_ids_dy: process_ids_dy | None = None
-    if self.dataset_inst.has_tag("is_dy"):
-        # check if this dataset is covered by any dy id producer
-        for name, dy_cfg in self.config_inst.x.dy_stitching.items():
-            dataset_inst = dy_cfg["inclusive_dataset"]
-            # the dataset is "covered" if its process is a subprocess of that of the dy dataset
-            if dataset_inst.has_process(self.dataset_inst.processes.get_first()):
-                self.process_ids_dy = process_ids_dy.derive(f"process_ids_dy_{name}", cls_dict={
-                    "dy_inclusive_dataset": dataset_inst,
-                    "dy_leaf_processes": dy_cfg["leaf_processes"],
-                })
-
-                # add it as a dependency
-                self.uses.add(self.process_ids_dy)
-                self.produces.add(self.process_ids_dy)
-
-                # stop after the first match
-                break
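
The weight map assembled above follows a small convention: each value is either Ellipsis (count all events), a boolean mask, a weight array, or a (weights, mask) pair restricting the sum to selected events. A standalone sketch of these semantics as assumed here (the actual reduction is performed inside columnflow's increment_stats):

import numpy as np

mc_weight = np.array([1.0, 0.5, 2.0, 1.5])
event_sel = np.array([True, False, True, True])

weight_map = {
    "num_events": Ellipsis,                            # plain event count
    "num_events_selected": event_sel,                  # number of selected events
    "sum_mc_weight": mc_weight,                        # weight sum over all events
    "sum_mc_weight_selected": (mc_weight, event_sel),  # weight sum over selected events
}

for name, obj in weight_map.items():
    if obj is Ellipsis:
        value = len(event_sel)
    elif isinstance(obj, tuple):
        weights, mask = obj
        value = weights[mask].sum()
    else:
        value = obj.sum()
    print(f"{name}: {value}")
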
2 changes: 1 addition & 1 deletion law.cfg
@@ -42,7 +42,7 @@ default_config: run3_2022_preEE
 default_dataset: hh_ggf_hbb_htt_kl1_kt1_powheg
 
 calibration_modules: columnflow.calibration.cms.{jets,met,tau}, hbt.calibration.{default,fake_triggers}
-selection_modules: columnflow.selection.empty, columnflow.selection.cms.{json_filter,met_filters}, hbt.selection.{default,lepton,trigger}
+selection_modules: columnflow.selection.cms.{json_filter,met_filters}, hbt.selection.{default,lepton,trigger}
 production_modules: columnflow.production.{categories,normalization,processes}, columnflow.production.cms.{btag,electron,mc_weight,muon,pdf,pileup,scale,seeds,gen_top_decay}, hbt.production.{default,weights,features,btag,tau,minimal,hh_mass,res_networks,patches}
 categorization_modules: hbt.categorization.default
 weight_production_modules: columnflow.weight.{empty,all_weights}, hbt.weight.default
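
Dropping columnflow.selection.empty here goes hand in hand with the new empty selector derived in hbt/selection/default.py above, so the selector name "empty" presumably resolves to the hbt implementation from now on. For reference, the brace notation in these module lists denotes several modules at once; a small sketch of such an expansion (a hypothetical helper, not part of law or columnflow):

import itertools
import re


def expand_modules(spec: str) -> list[str]:
    # recursively expand "pkg.{a,b}" into ["pkg.a", "pkg.b"]
    match = re.search(r"\{([^}]*)\}", spec)
    if not match:
        return [spec]
    head, tail = spec[:match.start()], spec[match.end():]
    return list(itertools.chain.from_iterable(
        expand_modules(head + part + tail) for part in match.group(1).split(",")
    ))


print(expand_modules("hbt.selection.{default,lepton,trigger}"))
# ['hbt.selection.default', 'hbt.selection.lepton', 'hbt.selection.trigger']
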
