Commit

Adapt to cf.
riga committed Feb 7, 2023
1 parent c56bd40 commit 06ffb22
Showing 12 changed files with 39 additions and 67 deletions.
3 changes: 1 addition & 2 deletions hbt/calibration/default.py
@@ -6,12 +6,11 @@

 from columnflow.calibration import Calibrator, calibrator
 from columnflow.calibration.cms.met import met_phi
-from columnflow.calibration.cms.jets import jec, jer
+from columnflow.calibration.cms.jets import jec, jec_nominal, jer
 from columnflow.production.cms.mc_weight import mc_weight
 from columnflow.production.cms.seeds import deterministic_seeds
 from columnflow.util import maybe_import

-from hbt.calibration.jet import jec_nominal
 from hbt.calibration.tau import tec

11 changes: 0 additions & 11 deletions hbt/calibration/jet.py

This file was deleted.
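The deleted file most likely provided a local nominal-only JEC variant that columnflow now ships itself (note the jec_nominal import switch above). A sketch of what such a derived calibrator looks like, assuming columnflow's derive API and that dropping all uncertainty sources is what makes it "nominal":

    # hypothetical reconstruction of the deleted module's content
    from columnflow.calibration.cms.jets import jec

    # derive a copy of the jec calibrator that evaluates no uncertainty
    # sources, i.e. computes only the nominal jet energy corrections
    jec_nominal = jec.derive("jec_nominal", cls_dict={"uncertainty_sources": []})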

4 changes: 2 additions & 2 deletions hbt/config/configs_run2ul.py
@@ -672,7 +672,7 @@ def add_aliases(
     "cf.MergeSelectionMasks": {
         "mc_weight", "normalization_weight", "process_id", "category_ids", "cutflow.*",
     },
-    "cf.CoalesceColumns": {
+    "cf.UniteColumns": {
         "*",
     },
 })
@@ -712,7 +712,7 @@ def add_aliases(
 # else:
 #     raise NotImplementedError(f"config versions not implemented for {cfg.name}")

-# cannels
+# channels
 cfg.add_channel(name="mutau", id=1)
 cfg.add_channel(name="etau", id=2)
 cfg.add_channel(name="tautau", id=3)
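The renamed key follows a task rename in columnflow (cf.CoalesceColumns became cf.UniteColumns); the entry lists the columns that task keeps when writing merged outputs. A hedged sketch of how such a mapping is declared and consumed, where the DotDict import path and the lookup are assumptions and cfg is the config object from this file:

    from columnflow.util import DotDict  # assumed location of DotDict

    # map task family names to the sets of columns they should write out
    cfg.x.keep_columns = DotDict.wrap({
        "cf.UniteColumns": {"*"},  # keep everything, as in the diff above
    })

    # a task would then resolve its column set roughly like this
    keep = cfg.x.keep_columns.get("cf.UniteColumns", set())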
10 changes: 3 additions & 7 deletions hbt/production/btag.py
@@ -29,13 +29,10 @@
         # nano columns
         "Jet.pt",
     },
-    # produced columns are defined in the init function below
+    # only run on mc
+    mc_only=True,
 )
 def normalized_btag_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
-    # fail when running on data
-    if self.dataset_inst.is_data:
-        raise ValueError("attempt to compute normalized btag weights in data")
-
     for weight_name in self[btag_weights].produces:
         if not weight_name.startswith("btag_weight"):
             continue
@@ -74,8 +71,7 @@ def normalized_btag_weights_init(self: Producer) -> None:
         if not weight_name.startswith("btag_weight"):
             continue

-        self.produces.add(f"normalized_{weight_name}")
-        self.produces.add(f"normalized_njet_{weight_name}")
+        self.produces |= {f"normalized_{weight_name}", f"normalized_njet_{weight_name}"}


 @normalized_btag_weights.requires
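Both edits apply a pattern used throughout this commit: the producer's mc_only flag replaces hand-written is_data guards (columnflow is assumed to skip or reject such producers on data), and dynamically named output columns are registered with one set union instead of repeated add() calls. A minimal sketch of the combined pattern, with a hypothetical producer name:

    from columnflow.production import Producer, producer

    @producer(
        uses={"Jet.pt"},
        # skipped automatically on real data, no manual dataset check needed
        mc_only=True,
    )
    def my_weights(self: Producer, events, **kwargs):
        # ... compute and attach weight columns here ...
        return events

    @my_weights.init
    def my_weights_init(self: Producer) -> None:
        # register dynamically named columns in a single set union
        self.produces |= {"normalized_my_weight", "normalized_njet_my_weight"}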
32 changes: 13 additions & 19 deletions hbt/production/default.py
@@ -21,16 +21,24 @@


 @producer(
-    uses={features, category_ids},
-    produces={features, category_ids},
+    uses={
+        category_ids, features, normalization_weights, normalized_pdf_weight,
+        normalized_murmuf_weight, normalized_pu_weight, normalized_btag_weights,
+        tau_weights, electron_weights, muon_weights, trigger_weights,
+    },
+    produces={
+        category_ids, features, normalization_weights, normalized_pdf_weight,
+        normalized_murmuf_weight, normalized_pu_weight, normalized_btag_weights,
+        tau_weights, electron_weights, muon_weights, trigger_weights,
+    },
 )
 def default(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
-    # features
-    events = self[features](events, **kwargs)
-
     # category ids
     events = self[category_ids](events, **kwargs)

+    # features
+    events = self[features](events, **kwargs)
+
     # mc-only weights
     if self.dataset_inst.is_mc:
         # normalization weights
@@ -61,17 +69,3 @@ def default(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
         events = self[trigger_weights](events, **kwargs)

     return events
-
-
-@default.init
-def default_init(self: Producer) -> None:
-    if not getattr(self, "dataset_inst", None) or self.dataset_inst.is_data:
-        return
-
-    # my only producers
-    producers = {
-        normalization_weights, normalized_pdf_weight, normalized_murmuf_weight, normalized_pu_weight,
-        normalized_btag_weights, tau_weights, electron_weights, muon_weights, trigger_weights,
-    }
-    self.uses |= producers
-    self.produces |= producers
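With every sub-producer listed statically in uses and produces, the deleted init hook becomes redundant: columnflow expands producer entries in these sets itself, and each mc-only member is now guarded by its own mc_only flag rather than by the removed dataset check. A condensed sketch of the resulting structure, assuming that only dependencies declared in uses are resolvable through self[...]:

    # uses/produces may mix plain column names and other producers;
    # producer entries are expanded recursively by columnflow
    @producer(
        uses={category_ids, features},
        produces={category_ids, features},
    )
    def my_default(self: Producer, events, **kwargs):  # hypothetical reduced variant
        events = self[category_ids](events, **kwargs)
        events = self[features](events, **kwargs)
        return events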
12 changes: 4 additions & 8 deletions hbt/production/tau.py
@@ -35,6 +35,8 @@
             "mu_0p4", "mu_0p4To0p8", "mu_0p8To1p2", "mu_1p2To1p7", "mu_1p7ToInf",
         ]
     },
+    # only run on mc
+    mc_only=True,
 )
 def tau_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
     """
@@ -56,10 +58,6 @@ def tau_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
     https://twiki.cern.ch/twiki/bin/view/CMS/TauIDRecommendationForRun2?rev=113
     https://gitlab.cern.ch/cms-nanoAOD/jsonpog-integration/-/blob/849c6a6efef907f4033715d52290d1a661b7e8f9/POG/TAU
     """
-    # fail when running on data
-    if self.dataset_inst.is_data:
-        raise ValueError("attempt to compute tau weights in data")
-
     # helper to bring a flat sf array into the shape of taus, and multiply across the tau axis
     reduce_mul = lambda sf: ak.prod(layout_ak_array(sf, events.Tau.pt), axis=1, mask_identity=False)

@@ -196,6 +194,8 @@ def tau_weights_setup(self: Producer, reqs: dict, inputs: dict) -> None:
         for direction in ["up", "down"]
         for ch in ["etau", "mutau", "tautau"]  # TODO: add tautauvbf when existing
     },
+    # only run on mc
+    mc_only=True,
 )
 def trigger_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
     """
@@ -213,10 +213,6 @@ def trigger_weights(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
     https://twiki.cern.ch/twiki/bin/view/CMS/TauIDRecommendationForRun2?rev=113
     https://gitlab.cern.ch/cms-nanoAOD/jsonpog-integration/-/blob/849c6a6efef907f4033715d52290d1a661b7e8f9/POG/TAU
     """
-    # fail when running on data
-    if self.dataset_inst.is_data:
-        raise ValueError("attempt to compute trigger weights in data")
-
     # get channels from the config
     ch_etau = self.config_inst.get_channel("etau")
     ch_mutau = self.config_inst.get_channel("mutau")
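The reduce_mul helper in the context above wraps a flat array of per-tau scale factors into the jagged layout of events.Tau.pt and multiplies along the tau axis, yielding one weight per event. A self-contained illustration of the same reduction, using ak.unflatten in place of columnflow's layout_ak_array (assumed to behave equivalently here):

    import awkward as ak

    # per-event tau pts: event 0 has two taus, event 1 has one
    tau_pt = ak.Array([[40.0, 25.0], [60.0]])

    # flat per-tau scale factors, ordered like the flattened taus
    flat_sf = ak.Array([0.98, 1.02, 0.95])

    # restore the jagged structure, then multiply across the tau axis;
    # mask_identity=False turns empty events into 1.0 instead of None
    sf = ak.unflatten(flat_sf, ak.num(tau_pt, axis=1))
    event_weight = ak.prod(sf, axis=1, mask_identity=False)
    # -> [0.9996, 0.95]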
19 changes: 6 additions & 13 deletions hbt/production/weights.py
@@ -20,13 +20,10 @@
         # custom columns created upstream, probably by a producer
         "process_id",
     },
-    # produced columns are defined in the init function below
+    # only run on mc
+    mc_only=True,
 )
 def normalized_pu_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
-    # fail when running on data
-    if self.dataset_inst.is_data:
-        raise ValueError("attempt to compute normalized pileup weights in data")
-
     for weight_name in self[pu_weight].produces:
         if not weight_name.startswith("pu_weight"):
             continue
@@ -105,12 +102,10 @@ def denominator_per_pid(weight_name, pid):
     produces={
         "normalized_pdf_weight", "normalized_pdf_weight_up", "normalized_pdf_weight_down",
     },
+    # only run on mc
+    mc_only=True,
 )
 def normalized_pdf_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
-    # fail when running on data
-    if self.dataset_inst.is_data:
-        raise ValueError("attempt to compute normalized pdf weights in data")
-
     for postfix in ["", "_up", "_down"]:
         # create the normalized weight
         avg = self.average_pdf_weights[postfix]
@@ -152,12 +147,10 @@ def normalized_pdf_weight_setup(self: Producer, reqs: dict, inputs: dict) -> None:
     produces={
         "normalized_murmuf_weight", "normalized_murmuf_weight_up", "normalized_murmuf_weight_down",
     },
+    # only run on mc
+    mc_only=True,
 )
 def normalized_murmuf_weight(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
-    # fail when running on data
-    if self.dataset_inst.is_data:
-        raise ValueError("attempt to compute normalized mur/muf weights in data")
-
     for postfix in ["", "_up", "_down"]:
         # create the normalized weight
         avg = self.average_murmuf_weights[postfix]
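All three producers in this file share one normalization scheme: each weight variation is divided by its average over the full dataset (w_norm = w / <w>), so applying the weight leaves the overall normalization unchanged. A hedged sketch of the shared shape, where the input column names and the setup-filled averages are assumptions based on the diff:

    import awkward as ak
    from columnflow.production import Producer, producer
    from columnflow.columnar_util import set_ak_column

    @producer(
        uses={"murmuf_weight", "murmuf_weight_up", "murmuf_weight_down"},
        produces={
            "normalized_murmuf_weight", "normalized_murmuf_weight_up",
            "normalized_murmuf_weight_down",
        },
        mc_only=True,
    )
    def normalized_murmuf_weight_sketch(self: Producer, events: ak.Array, **kwargs) -> ak.Array:
        for postfix in ["", "_up", "_down"]:
            # averages are computed once per dataset in the producer's setup step
            avg = self.average_murmuf_weights[postfix]
            norm = events[f"murmuf_weight{postfix}"] / avg
            events = set_ak_column(events, f"normalized_murmuf_weight{postfix}", norm)
        return events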
2 changes: 1 addition & 1 deletion hbt/selection/default.py
@@ -9,9 +9,9 @@

 from collections import defaultdict, OrderedDict

 from columnflow.selection import Selector, SelectionResult, selector
-from columnflow.production.processes import process_ids
 from columnflow.production.cms.mc_weight import mc_weight
 from columnflow.production.cms.pileup import pu_weight
+from columnflow.production.processes import process_ids
 from columnflow.production.cms.pdf import pdf_weights
 from columnflow.production.cms.scale import murmuf_weights
 from columnflow.production.cms.btag import btag_weights
4 changes: 3 additions & 1 deletion law.cfg
@@ -6,6 +6,8 @@ inherit: $CF_BASE/law.cfg

 [modules]

+columnflow.tasks.cms.inference
+columnflow.tasks.cms.external
 hbt.tasks


@@ -48,7 +50,7 @@ chunked_io_debug: False

 # csv list of task families that inherit from ChunkedReaderMixin and whose output arrays should be
 # checked for non-finite values before saving them to disk (right now, supported tasks are
 # cf.CalibrateEvents, cf.SelectEvents, cf.ProduceColumns, cf.PrepareMLEvents, cf.MLEvaluation,
-# cf.CoalesceColumns)
+# cf.UniteColumns)
 check_finite_output: cf.CalibrateEvents, cf.SelectEvents, cf.ProduceColumns


Expand Down
2 changes: 1 addition & 1 deletion modules/columnflow (submodule pointer updated)
3 changes: 3 additions & 0 deletions sandboxes/columnar_tf.txt
@@ -6,4 +6,7 @@ dask-awkward~=2023.1
 uproot~=5.0
 correctionlib~=2.2
 tabulate~=0.9
+zstandard~=0.19
+lz4~=4.3
+xxhash~=3.2
 tensorflow~=2.11
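The three new packages are common compression and hashing backends for columnar IO: zstandard and lz4 provide the zstd and lz4 codecs, xxhash fast checksums. With them in the sandbox, parquet outputs can use these codecs, e.g. in this small sketch (ak.to_parquet's compression argument being the assumed entry point):

    import awkward as ak

    events = ak.Array([{"x": 1.0}, {"x": 2.0}])
    # "zstd" needs the zstandard package; "lz4" would need the lz4 package
    ak.to_parquet(events, "events.parquet", compression="zstd")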
4 changes: 2 additions & 2 deletions tests/run_linting
@@ -6,10 +6,10 @@ action() {
     local shell_is_zsh="$( [ -z "${ZSH_VERSION}" ] && echo "false" || echo "true" )"
     local this_file="$( ${shell_is_zsh} && echo "${(%):-%x}" || echo "${BASH_SOURCE[0]}" )"
     local this_dir="$( cd "$( dirname "${this_file}" )" && pwd )"
-    local hbt_dir="$( dirname "$this_dir" )"
+    local hbt_dir="$( dirname "${this_dir}" )"

     (
-        cd "$hbt_dir" && \
+        cd "${hbt_dir}" && \
         flake8 hbt tests
     )
 }