From bcd2f741bda1b014f0b5b252d8d97ddf0c12136a Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Mon, 13 May 2024 16:07:11 -0500 Subject: [PATCH 01/11] port in hist from old coffea to jitters area --- src/coffea/jitters/__init__.py | 0 src/coffea/jitters/hist/__init__.py | 38 + src/coffea/jitters/hist/hist_tools.py | 1622 +++++++++++++++++++++++++ src/coffea/jitters/hist/plot.py | 1039 ++++++++++++++++ tests/test_hist_plot.py | 352 ++++++ tests/test_hist_tools.py | 460 +++++++ 6 files changed, 3511 insertions(+) create mode 100644 src/coffea/jitters/__init__.py create mode 100644 src/coffea/jitters/hist/__init__.py create mode 100644 src/coffea/jitters/hist/hist_tools.py create mode 100644 src/coffea/jitters/hist/plot.py create mode 100644 tests/test_hist_plot.py create mode 100644 tests/test_hist_tools.py diff --git a/src/coffea/jitters/__init__.py b/src/coffea/jitters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/coffea/jitters/hist/__init__.py b/src/coffea/jitters/hist/__init__.py new file mode 100644 index 000000000..453410865 --- /dev/null +++ b/src/coffea/jitters/hist/__init__.py @@ -0,0 +1,38 @@ +"""Histogramming tools + +`coffea.hist` is a histogram filling, transformation, and plotting sub-package, utilizing +numpy arrays for storage and matplotlib plotting routines for visualization. + +Features found in this package are similar to those found in +packages such as `histbook `__ (deprecated), +`boost-histogram `__ (in development), +`physt `__, and built-in numpy +`histogram `__ utilities. 
+ +""" + +from coffea.jitters.hist.hist_tools import Bin, Cat, Hist, Interval, StringBin +from coffea.jitters.hist.plot import ( + clopper_pearson_interval, + normal_interval, + plot1d, + plot2d, + plotgrid, + plotratio, + poisson_interval, +) + +__all__ = [ + "Hist", + "Bin", + "Interval", + "Cat", + "StringBin", + "poisson_interval", + "clopper_pearson_interval", + "normal_interval", + "plot1d", + "plotratio", + "plot2d", + "plotgrid", +] diff --git a/src/coffea/jitters/hist/hist_tools.py b/src/coffea/jitters/hist/hist_tools.py new file mode 100644 index 000000000..848ccfed8 --- /dev/null +++ b/src/coffea/jitters/hist/hist_tools.py @@ -0,0 +1,1622 @@ +import copy +import functools +import numbers +import re +import warnings +from abc import ABCMeta, abstractmethod +from collections import namedtuple + +import awkward +import numpy + +# Python 2 and 3 compatibility +_regex_pattern = re.compile("dummy").__class__ +try: + basestring +except NameError: + basestring = str + +try: + from collections.abc import Sequence +except ImportError: + from collections.abc import Sequence + +MaybeSumSlice = namedtuple("MaybeSumSlice", ["start", "stop", "sum"]) + + +def assemble_blocks(array, ndslice, depth=0): + """ + Turns an n-dimensional slice of array (tuple of slices) + into a nested list of numpy arrays that can be passed to numpy.block() + + Under the assumption that index 0 of any dimension is underflow, -2 overflow, -1 nanflow, + this function will add the range not in the slice to the appropriate (over/under)flow bins + """ + if depth == 0: + ndslice = [MaybeSumSlice(s.start, s.stop, False) for s in ndslice] + if depth == len(ndslice): + slice_op = tuple(slice(s.start, s.stop) for s in ndslice) + sum_op = tuple(i for i, s in enumerate(ndslice) if s.sum) + return array[slice_op].sum(axis=sum_op, keepdims=True) + slist = [] + newslice = ndslice[:] + if ndslice[depth].start is not None: + newslice[depth] = MaybeSumSlice(None, ndslice[depth].start, True) + 
@functools.total_ordering
class Interval:
    """Real number interval

    Totally ordered, assuming no overlap in intervals.
    A special nan interval can be constructed (``hi`` is nan), which is
    defined as greater than ``[*, inf)``.  All nan intervals compare equal
    to one another and share a single hash value.

    Parameters
    ----------
        lo : float
            Bin lower bound, inclusive
        hi : float
            Bin upper bound, exclusive
        label : str, optional
            Custom text returned by ``str()`` in place of the computed
            ``[lo, hi)`` representation
    """

    def __init__(self, lo, hi, label=None):
        self._lo = float(lo)
        self._hi = float(hi)
        self._label = label

    def __repr__(self):
        return "<{} ({}) instance at 0x{:0x}>".format(
            self.__class__.__name__,
            str(self),
            id(self),
        )

    def __str__(self):
        if self._label is not None:
            return self._label
        if self.nan():
            return "(nanflow)"
        # string representation of floats is apparently a touchy subject.. further reading:
        # https://stackoverflow.com/questions/25898733/why-does-strfloat-return-more-digits-in-python-3-than-python-2
        return "{}{:.12g}, {:.12g})".format(
            "(" if self._lo == -numpy.inf else "[",
            self._lo,
            self._hi,
        )

    def __hash__(self):
        # __eq__ declares every nan interval equal to every other one, whatever
        # its lower bound, so they must all hash alike.  Hashing (lo, nan)
        # breaks that: the lower bound may differ, and hash(nan) depends on the
        # identity of the float object on Python >= 3.10.
        if self.nan():
            return hash((Interval, "nanflow"))
        return hash((self._lo, self._hi))

    def __lt__(self, other):
        """Order by lower bound; the nan interval sorts after everything else

        Raises
        ------
            ValueError
                If this interval starts below ``other`` but extends past
                ``other``'s lower bound (i.e. the two overlap)
        """
        if not isinstance(other, Interval):
            # Let Python try the reflected operation / raise TypeError itself
            return NotImplemented
        if other.nan() and not self.nan():
            return True
        elif self.nan():
            return False
        elif self._lo < other._lo:
            if self._hi > other._lo:
                raise ValueError(
                    "Intervals %r and %r intersect! What are you doing?!"
                    % (self, other)
                )
            return True
        return False

    def __eq__(self, other):
        if not isinstance(other, Interval):
            return False
        if other.nan() and self.nan():
            return True
        if self._lo == other._lo and self._hi == other._hi:
            return True
        return False

    def nan(self):
        """Whether this is the special nan interval (upper bound is nan)"""
        return numpy.isnan(self._hi)

    @property
    def lo(self):
        """Lower boundary of this bin, inclusive"""
        return self._lo

    @property
    def hi(self):
        """Upper boundary of this bin, exclusive"""
        return self._hi

    @property
    def mid(self):
        """Midpoint of this bin"""
        return (self._hi + self._lo) / 2

    @property
    def label(self):
        """Label of this bin, mutable"""
        return self._label

    @label.setter
    def label(self, lbl):
        self._label = lbl
class Axis:
    """
    Axis: common base for every kind of histogram axis

    Stores a name and a label.  The name ``"weight"`` is refused because
    ``Hist.fill()`` reserves it as a keyword.  Derived classes should
    implement, at least, an equality override.
    """

    def __init__(self, name, label):
        if name == "weight":
            raise ValueError(
                "Cannot create axis: 'weight' is a reserved keyword for Hist.fill()"
            )
        self._name = name
        self._label = label

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"<{cls_name} (name={self._name}) instance at 0x{id(self):0x}>"

    @property
    def name(self):
        return self._name

    @property
    def label(self):
        return self._label

    @label.setter
    def label(self, label):
        self._label = label

    def __eq__(self, other):
        # Axes are identified by name alone; the label doesn't matter
        if isinstance(other, Axis):
            return not (self._name != other._name)
        # A bare string is accepted too: convenient for testing axis in list by name
        if isinstance(other, basestring):
            return not (self._name != other)
        raise TypeError("Cannot compare an Axis with a %r" % other)
class Cat(SparseAxis):
    """A category axis with name and label

    Parameters
    ----------
        name : str
            is used as a keyword in histogram filling, immutable
        label : str
            describes the meaning of the axis, can be changed
        sorting : {'identifier', 'placement', 'integral'}, optional
            Axis sorting when listing identifiers.  Default 'identifier'
            Changing this setting can affect the order of stack plotting
            in `hist.plot1d`.

    The number of categories is arbitrary, and can be filled sparsely
    Identifiers are strings
    """

    def __init__(self, name, label, sorting="identifier"):
        super().__init__(name, label)
        # In all cases key == value.name
        self._bins = {}
        self._sorting = sorting
        # Bin names in listing order (kept sorted when sorting == 'identifier')
        self._sorted = []

    def index(self, identifier):
        """Index of a identifier or label

        Parameters
        ----------
            identifier : str or StringBin
                The identifier to lookup

        Returns a `StringBin` corresponding to the given argument (trivial in the case
        where a `StringBin` was passed) and saves a reference internally in the case where
        the identifier was not seen before by this axis.
        """
        if isinstance(identifier, StringBin):
            index = identifier
        else:
            index = StringBin(identifier)
        if index.name not in self._bins:
            self._bins[index.name] = index
            self._sorted.append(index.name)
            if self._sorting == "identifier":
                self._sorted.sort()
        return self._bins[index.name]

    def __eq__(self, other):
        # Sparse, so as long as name is the same
        return super().__eq__(other)

    def __getitem__(self, index):
        """Return the string identifier of a `StringBin` known to this axis

        Raises
        ------
            TypeError
                If ``index`` is not a `StringBin`
            KeyError
                If the bin's name was never registered on this axis
        """
        if not isinstance(index, StringBin):
            raise TypeError("Expected a StringBin object, got: %r" % index)
        identifier = index.name
        if identifier not in self._bins:
            # The message used to be emitted with a literal, unformatted '%r'
            raise KeyError(f"No identifier {identifier!r} in this Category axis")
        return identifier

    def _ireduce(self, the_slice):
        """Resolve a selection into the list of matching `StringBin` objects

        Accepted selections: a `StringBin`, a compiled regular expression,
        a string (``*`` acts as a wildcard), a list of names, or a slice whose
        bounds are names or positions in the current listing order.
        """
        out = None
        if isinstance(the_slice, StringBin):
            out = [the_slice.name]
        elif isinstance(the_slice, _regex_pattern):
            out = [k for k in self._sorted if the_slice.match(k)]
        elif isinstance(the_slice, basestring):
            # Escape everything, then turn the escaped '*' back into a wildcard
            pattern = "^" + re.escape(the_slice).replace(r"\*", ".*") + "$"
            m = re.compile(pattern)
            out = [k for k in self._sorted if m.match(k)]
        elif isinstance(the_slice, list):
            if not all(k in self._sorted for k in the_slice):
                warnings.warn(
                    "Not all requested indices present in %r" % self, RuntimeWarning
                )
            out = [k for k in self._sorted if k in the_slice]
        elif isinstance(the_slice, slice):
            if the_slice.step is not None:
                raise IndexError("Not sure how to use slice step for categories...")
            # String bounds are translated to positions; anything else
            # (None, integers) is handed to list slicing unchanged
            start, stop = the_slice.start, the_slice.stop
            if isinstance(start, basestring):
                start = self._sorted.index(start)
            if isinstance(stop, basestring):
                stop = self._sorted.index(stop)
            out = self._sorted[start:stop]
        else:
            raise IndexError(f"Cannot understand slice {the_slice!r} on axis {self!r}")
        return [self._bins[k] for k in out]

    @property
    def size(self):
        """Number of bins"""
        return len(self._bins)

    @property
    def sorting(self):
        """Sorting definition to adhere to

        See `Cat` constructor for possible values
        """
        return self._sorting

    @sorting.setter
    def sorting(self, newsorting):
        if newsorting == "placement":
            # not much we can do about already inserted values
            pass
        elif newsorting == "identifier":
            self._sorted.sort()
        elif newsorting == "integral":
            # this will be checked in any Hist.identifiers() call accessing this axis
            pass
        else:
            raise AttributeError("Invalid axis sorting type: %s" % newsorting)
        self._sorting = newsorting

    def identifiers(self):
        """List of `StringBin` identifiers"""
        return [self._bins[k] for k in self._sorted]
+ lo : float, optional + lower boundary of bin range, if uniform binning + hi : float, optional + upper boundary of bin range, if uniform binning + + This axis will generate frequencies for n+3 bins, special bin indices: + ``0 = underflow, n+1 = overflow, n+2 = nanflow`` + Bin boundaries are [lo, hi) + """ + + def __init__(self, name, label, n_or_arr, lo=None, hi=None): + super().__init__(name, label) + self._lazy_intervals = None + if isinstance(n_or_arr, (list, numpy.ndarray)): + self._uniform = False + self._bins = numpy.array(n_or_arr, dtype="d") + if not all(numpy.sort(self._bins) == self._bins): + raise ValueError("Binning not sorted!") + self._lo = self._bins[0] + self._hi = self._bins[-1] + # to make searchsorted differentiate inf from nan + self._bins = numpy.append(self._bins, numpy.inf) + self._interval_bins = numpy.r_[-numpy.inf, self._bins, numpy.nan] + self._bin_names = numpy.full(self._interval_bins[:-1].size, None) + elif isinstance(n_or_arr, numbers.Integral): + if lo is None or hi is None: + raise TypeError( + "Interpreting n_or_arr as uniform binning, please specify lo and hi values" + ) + self._uniform = True + self._lo = lo + self._hi = hi + self._bins = n_or_arr + self._interval_bins = numpy.r_[ + -numpy.inf, + numpy.linspace(self._lo, self._hi, self._bins + 1), + numpy.inf, + numpy.nan, + ] + self._bin_names = numpy.full(self._interval_bins[:-1].size, None) + else: + raise TypeError( + "Cannot understand n_or_arr (nbins or binning array) type %r" % n_or_arr + ) + + @property + def _intervals(self): + if not hasattr(self, "_lazy_intervals") or self._lazy_intervals is None: + self._lazy_intervals = [ + Interval(low, high, bin) + for low, high, bin in zip( + self._interval_bins[:-1], self._interval_bins[1:], self._bin_names + ) + ] + return self._lazy_intervals + + def __getstate__(self): + if hasattr(self, "_lazy_intervals") and self._lazy_intervals is not None: + self._bin_names = numpy.array( + [interval.label for interval in 
self._lazy_intervals] + ) + self.__dict__.pop("_lazy_intervals", None) + return self.__dict__ + + def __setstate__(self, d): + if "_intervals" in d: # convert old hists to new serialization format + _old_intervals = d.pop("_intervals") + interval_bins = [i._lo for i in _old_intervals] + [_old_intervals[-1]._hi] + d["_interval_bins"] = numpy.array(interval_bins) + d["_bin_names"] = numpy.array( + [interval._label for interval in _old_intervals] + ) + if "_interval_bins" in d and "_bin_names" not in d: + d["_bin_names"] = numpy.full(d["_interval_bins"][:-1].size, None) + self.__dict__ = d + + def index(self, identifier): + """Index of a identifier or label + + Parameters + ---------- + identifier : float or Interval or numpy.ndarray + The identifier(s) to lookup. Supports vectorized + calls when a numpy 1D array of numbers is passed. + + Returns an integer corresponding to the index in the axis where the histogram would be filled. + The integer range includes flow bins: ``0 = underflow, n+1 = overflow, n+2 = nanflow`` + """ + isarray = isinstance(identifier, (awkward.Array, numpy.ndarray)) + if isarray or isinstance(identifier, numbers.Number): + if isarray: + identifier = numpy.asarray(identifier) + if self._uniform: + idx = numpy.clip( + numpy.floor( + (identifier - self._lo) + * float(self._bins) + / (self._hi - self._lo) + ) + + 1, + 0, + self._bins + 1, + ) + if isinstance(idx, numpy.ndarray): + idx[numpy.isnan(idx)] = self.size - 1 + idx = idx.astype(int) + elif numpy.isnan(idx): + idx = self.size - 1 + else: + idx = int(idx) + return idx + else: + return numpy.searchsorted(self._bins, identifier, side="right") + elif isinstance(identifier, Interval): + if identifier.nan(): + return self.size - 1 + for idx, interval in enumerate(self._intervals): + if interval._lo <= identifier._lo and interval._hi >= identifier._hi: + return idx + raise ValueError( + "Axis %r has no interval that fully contains identifier %r" + % (self, identifier) + ) + raise 
def __eq__(self, other):
    """Compare this axis with another axis or an axis name

    Against another dense axis the result is True when the base `Axis`
    comparison (by name) succeeds, both axes use the same binning mode
    (uniform or not) and the stored binning matches: the bin count for
    uniform axes, the array of edges otherwise.  Any other operand is
    handed to the base-class comparison.
    """
    if isinstance(other, DenseAxis):
        if not super().__eq__(other):
            return False
        if self._uniform != other._uniform:
            return False
        if self._uniform:
            # NOTE(review): only the number of bins is compared for uniform
            # axes, not lo/hi -- confirm whether that is intended
            return bool(self._bins == other._bins)
        # array_equal returns False for arrays of different length, where
        # all(a == b) would fail on the shape mismatch
        return bool(numpy.array_equal(self._bins, other._bins))
    return super().__eq__(other)
x[start:])" + % (self, the_slice.stop) + ) + if self._uniform: + bhi_real = (the_slice.stop - self._lo) * self._bins / ( + self._hi - self._lo + ) + 1 + bhi = numpy.clip( + numpy.round(bhi_real).astype(int), 0, self._bins + 1 + ) + if abs(bhi - bhi_real) > 1.0e-14: + warnings.warn( + "Reducing along axis %r: requested stop %r between bin boundaries, no interpolation is performed" + % (self, the_slice.stop), + RuntimeWarning, + ) + else: + if the_slice.stop not in self._bins: + warnings.warn( + "Reducing along axis %r: requested stop %r between bin boundaries, no interpolation is performed" + % (self, the_slice.stop), + RuntimeWarning, + ) + bhi = self.index(the_slice.stop) + # Assume null ranges (start==stop) mean we want the bin containing the value + if blo is not None and blo == bhi: + bhi += 1 + if the_slice.step is not None: + raise NotImplementedError( + "Step slicing can be interpreted as a rebin factor" + ) + return slice(blo, bhi, the_slice.step) + elif isinstance(the_slice, list) and all( + isinstance(v, Interval) for v in the_slice + ): + raise NotImplementedError("Slice histogram from list of intervals") + raise IndexError(f"Cannot understand slice {the_slice!r} on axis {self!r}") + + def reduced(self, islice): + """Return a new axis with reduced binning + + The new binning corresponds to the slice made on this axis. 
+ Overflow will be taken care of by ``Hist.__getitem__`` + + Parameters + ---------- + islice : slice + ``islice.start`` and ``islice.stop`` should be None or within ``[1, ax.size() - 1]`` + This slice is usually as returned from ``Bin._ireduce`` + """ + if islice.step is not None: + raise NotImplementedError( + "Step slicing can be interpreted as a rebin factor" + ) + if islice.start is None and islice.stop is None: + return self + if self._uniform: + lo = self._lo + ilo = 0 + if islice.start is not None: + lo += (islice.start - 1) * (self._hi - self._lo) / self._bins + ilo = islice.start - 1 + hi = self._hi + ihi = self._bins + if islice.stop is not None: + hi = self._lo + (islice.stop - 1) * (self._hi - self._lo) / self._bins + ihi = islice.stop - 1 + bins = ihi - ilo + # TODO: remove this once satisfied it works + rbins = (hi - lo) * self._bins / (self._hi - self._lo) + assert abs(bins - rbins) < 1e-14, "%d %f %r" % (bins, rbins, self) + ax = Bin(self._name, self._label, bins, lo, hi) + return ax + else: + lo = None if islice.start is None else islice.start - 1 + hi = -1 if islice.stop is None else islice.stop + bins = self._bins[slice(lo, hi)] + ax = Bin(self._name, self._label, bins) + return ax + + @property + def size(self): + """Number of bins, including overflow (i.e. ``n + 3``)""" + if self._uniform: + return self._bins + 3 + # (inf added at constructor) + return len(self._bins) + 1 + + def edges(self, overflow="none"): + """Bin boundaries + + Parameters + ---------- + overflow : str + Create overflow and/or underflow bins by adding a bin of same width to each end. + See `Hist.sum` description for the allowed values. 
class AccumulatorABC(metaclass=ABCMeta):
    """Abstract base class for an accumulator

    An accumulator is the object that gets merged in the reduce stage of the
    map-reduce scaleout done in Coffea; a histogram is one concrete example.
    Its definition carries enough information to build an empty accumulator
    (``identity()``) and to merge a compatible accumulator into itself
    (``add()``).  The former is not strictly necessary, but helps with
    book-keeping.  Dictionary accumulators can be nested to arbitrary depth,
    much like the behavior of directories in ROOT hadd.

    After defining an accumulator::

        from coffea.processor import dict_accumulator, column_accumulator, defaultdict_accumulator
        from coffea.hist import Hist, Bin
        import numpy as np

        adef = dict_accumulator({
            'cutflow': defaultdict_accumulator(int),
            'pt': Hist("counts", Bin("pt", "$p_T$", 100, 0, 100)),
            'final_pt': column_accumulator(np.zeros(shape=(0,))),
        })

    Notice that this function does not mutate ``adef``::

        def fill(n):
            ptvals = np.random.exponential(scale=30, size=n)
            cut = ptvals > 200.
            acc = adef.identity()
            acc['cutflow']['pt>200'] += cut.sum()
            acc['pt'].fill(pt=ptvals)
            acc['final_pt'] += column_accumulator(ptvals[cut])
            return acc

    As such, we can execute it several times in parallel and reduce the result::

        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            outputs = executor.map(fill, [2000, 2000])

        combined = sum(outputs, adef.identity())


    Derived classes must implement
        - ``identity()``: returns a new object of same type as self,
          such that ``self + self.identity() == self``
        - ``add(other)``: adds an object of same type as self to self

    ``__add__``, ``__radd__`` and ``__iadd__`` are implemented here on top of
    those two methods.
    """

    @abstractmethod
    def identity(self):
        """Identity of the accumulator

        A value such that any other value added to it will return
        the other value
        """

    @abstractmethod
    def add(self, other):
        """Add another accumulator to this one in-place"""

    def __add__(self, other):
        # Start from a fresh identity so neither operand is modified
        total = self.identity()
        for term in (self, other):
            total.add(term)
        return total

    def __radd__(self, other):
        # Reflected form: the left-hand operand is accumulated first
        total = self.identity()
        for term in (other, self):
            total.add(term)
        return total

    def __iadd__(self, other):
        self.add(other)
        return self
+ + Parameters + ---------- + label : str + A description of the meaning of the sum of weights + ``*axes`` + positional list of `Cat` or `Bin` objects, denoting the axes of the histogram + axes : collections.abc.Sequence + list of `Cat` or `Bin` objects, denoting the axes of the histogram (overridden by ``*axes``) + dtype : str + Underlying numpy dtype to use for storing sum of weights + + Examples + -------- + + Creating a histogram with a sparse axis, and two dense axes:: + + h = coffea.hist.Hist("Observed bird count", + coffea.hist.Cat("species", "Bird species"), + coffea.hist.Bin("x", "x coordinate [m]", 20, -5, 5), + coffea.hist.Bin("y", "y coordinate [m]", 20, -5, 5), + ) + + # or + + h = coffea.hist.Hist(label="Observed bird count", + axes=(coffea.hist.Cat("species", "Bird species"), + coffea.hist.Bin("x", "x coordinate [m]", 20, -5, 5), + coffea.hist.Bin("y", "y coordinate [m]", 20, -5, 5), + ) + ) + + # or + + h = coffea.hist.Hist(axes=[coffea.hist.Cat("species", "Bird species"), + coffea.hist.Bin("x", "x coordinate [m]", 20, -5, 5), + coffea.hist.Bin("y", "y coordinate [m]", 20, -5, 5), + ], + label="Observed bird count", + ) + + which produces: + + >>> h + + + """ + + #: Default numpy dtype to store sum of weights + DEFAULT_DTYPE = "d" + + def __init__(self, label, *axes, **kwargs): + if not isinstance(label, basestring): + raise TypeError("label must be a string") + self._label = label + self._dtype = kwargs.pop( + "dtype", Hist.DEFAULT_DTYPE + ) # Much nicer in python3 :( + self._axes = axes + if len(axes) == 0 and "axes" in kwargs: + if not isinstance(kwargs["axes"], Sequence): + raise TypeError("axes must be a sequence type! 
(tuple, list, etc.)") + self._axes = tuple(kwargs["axes"]) + elif len(axes) != 0 and "axes" in kwargs: + warnings.warn( + "axes defined by both positional arguments and keyword argument, using positional arguments" + ) + + if not all(isinstance(ax, Axis) for ax in self._axes): + del self._axes + raise TypeError("All axes must be derived from Axis class") + # if we stably partition axes to sparse, then dense, some things simplify + # ..but then the user would then see the order change under them + self._dense_shape = tuple( + [ax.size for ax in self._axes if isinstance(ax, DenseAxis)] + ) + if numpy.prod(self._dense_shape) > 10000000: + warnings.warn("Allocating a large (>10M bin) histogram!", RuntimeWarning) + self._sumw = {} + # Storage of sumw2 starts at first use of weight keyword in fill() + self._sumw2 = None + + def __repr__(self): + return "<{} ({}) instance at 0x{:0x}>".format( + self.__class__.__name__, + ",".join(d.name for d in self.axes()), + id(self), + ) + + @property + def label(self): + """A label describing the meaning of the sum of weights""" + return self._label + + @label.setter + def label(self, label): + self._label = label + + def copy(self, content=True): + """Create a deep copy + + Parameters + ---------- + content : bool + If set false, only the histogram definition is copied, resetting + the sum of weights to zero + """ + out = Hist(self._label, *self._axes, dtype=self._dtype) + if self._sumw2 is not None: + out._sumw2 = {} + if content: + out._sumw = copy.deepcopy(self._sumw) + out._sumw2 = copy.deepcopy(self._sumw2) + return out + + def identity(self): + """The identity (zero value) of this accumulator""" + return self.copy(content=False) + + def clear(self): + """Clear all content in this histogram""" + self._sumw = {} + self._sumw2 = None + + def axis(self, axis_name): + """Get an ``Axis`` object""" + if axis_name in self._axes: + return self._axes[self._axes.index(axis_name)] + raise KeyError(f"No axis {axis_name} found in {self!r}") 
+ + def axes(self): + """Get all axes in this histogram""" + return self._axes + + @property + def fields(self): + """This is a stub for histbook compatibility""" + return [ax.name for ax in self._axes] + + def dim(self): + """Dimension of this histogram (number of axes)""" + return len(self._axes) + + def dense_dim(self): + """Dense dimension of this histogram (number of non-sparse axes)""" + return len(self._dense_shape) + + def sparse_dim(self): + """Sparse dimension of this histogram (number of sparse axes)""" + return self.dim() - self.dense_dim() + + def dense_axes(self): + """All dense axes""" + return [ax for ax in self._axes if isinstance(ax, DenseAxis)] + + def sparse_axes(self): + """All sparse axes""" + return [ax for ax in self._axes if isinstance(ax, SparseAxis)] + + def sparse_nbins(self): + """Total number of sparse bins""" + return len(self._sumw) + + def _idense(self, axis): + return self.dense_axes().index(axis) + + def _isparse(self, axis): + return self.sparse_axes().index(axis) + + def _init_sumw2(self): + self._sumw2 = {} + for key in self._sumw.keys(): + self._sumw2[key] = self._sumw[key].copy() + + def compatible(self, other): + """Checks if this histogram is compatible with another, i.e. 
they have identical binning""" + if self.dim() != other.dim(): + return False + if {d.name for d in self.sparse_axes()} != { + d.name for d in other.sparse_axes() + }: + return False + if not all(d1 == d2 for d1, d2 in zip(self.dense_axes(), other.dense_axes())): + return False + return True + + def add(self, other): + """Add another histogram into this one, in-place""" + if not self.compatible(other): + raise ValueError( + "Cannot add this histogram with histogram %r of dissimilar dimensions" + % other + ) + + raxes = other.sparse_axes() + + def add_dict(left, right): + for rkey in right.keys(): + lkey = tuple( + self.axis(rax).index(rax[ridx]) for rax, ridx in zip(raxes, rkey) + ) + if lkey in left: + left[lkey] += right[rkey] + else: + left[lkey] = copy.deepcopy(right[rkey]) + + if self._sumw2 is None and other._sumw2 is None: + pass + elif self._sumw2 is None: + self._init_sumw2() + add_dict(self._sumw2, other._sumw2) + elif other._sumw2 is None: + add_dict(self._sumw2, other._sumw) + else: + add_dict(self._sumw2, other._sumw2) + add_dict(self._sumw, other._sumw) + return self + + def __getitem__(self, keys): + if not isinstance(keys, tuple): + keys = (keys,) + if len(keys) > self.dim(): + raise IndexError("Too many indices for this histogram") + elif len(keys) < self.dim(): + if Ellipsis in keys: + idx = keys.index(Ellipsis) + slices = (slice(None),) * (self.dim() - len(keys) + 1) + keys = keys[:idx] + slices + keys[idx + 1 :] + else: + slices = (slice(None),) * (self.dim() - len(keys)) + keys += slices + sparse_idx = [] + dense_idx = [] + new_dims = [] + for s, ax in zip(keys, self._axes): + if isinstance(ax, SparseAxis): + sparse_idx.append(ax._ireduce(s)) + new_dims.append(ax) + else: + islice = ax._ireduce(s) + dense_idx.append(islice) + new_dims.append(ax.reduced(islice)) + dense_idx = tuple(dense_idx) + + def dense_op(array): + return numpy.block(assemble_blocks(array, dense_idx)) + + out = Hist(self._label, *new_dims, dtype=self._dtype) + if self._sumw2 
def fill(self, **values):
    """Fill sum of weights from columns

    Parameters
    ----------
        ``**values``
            Keyword arguments, one for each axis name, of either flat numpy arrays
            (for dense dimensions) or literals (for sparse dimensions) which will
            be used to fill bins at the corresponding indices.

    Raises
    ------
        ValueError
            If an axis of the histogram has no matching keyword, if a keyword
            does not name an axis, or if ``weight`` cannot be broadcast to the
            filled columns.

    Note
    ----
        The reserved keyword ``weight``, if specified, will increment sum of weights
        by the given column values, which must be broadcastable to the same dimension as all other
        columns.  Upon first use, this will trigger the storage of the sum of squared weights.


    Examples
    --------

    Filling the histogram from the `Hist` example:

    >>> h.fill(species='ducks', x=numpy.random.normal(size=10), y=numpy.random.normal(size=10), weight=numpy.ones(10) * 3)

    """
    weight = values.pop("weight", None)
    if weight is not None:
        # Normalise every accepted flavour of weight (awkward/numpy arrays,
        # scalars, plain sequences) to an ndarray so ``weight**2`` is defined.
        weight = numpy.atleast_1d(numpy.asarray(weight))
    if not all(d.name in values for d in self._axes):
        missing = ", ".join(d.name for d in self._axes if d.name not in values)
        raise ValueError(
            f"Not all axes specified for {self!r}.  Missing: {missing}"
        )
    if not all(name in self._axes for name in values):
        extra = ", ".join(name for name in values if name not in self._axes)
        raise ValueError(
            "Unrecognized axes specified for {!r}. Extraneous: {}".format(
                self, extra
            )
        )

    if weight is not None and self._sumw2 is None:
        self._init_sumw2()

    sparse_key = tuple(d.index(values[d.name]) for d in self.sparse_axes())
    if sparse_key not in self._sumw:
        self._sumw[sparse_key] = numpy.zeros(
            shape=self._dense_shape, dtype=self._dtype
        )
        if self._sumw2 is not None:
            self._sumw2[sparse_key] = numpy.zeros(
                shape=self._dense_shape, dtype=self._dtype
            )

    if self.dense_dim() > 0:
        dense_indices = tuple(
            d.index(values[d.name]) for d in self._axes if isinstance(d, DenseAxis)
        )
        # Flattened bin index of every entry
        xy = numpy.atleast_1d(
            numpy.ravel_multi_index(dense_indices, self._dense_shape)
        )
        nbins = numpy.array(self._dense_shape).prod()
        if weight is not None:
            # bincount needs one weight per entry: expand scalar / length-1 weights
            weight = numpy.broadcast_to(weight, xy.shape)
            self._sumw[sparse_key][:] += numpy.bincount(
                xy, weights=weight, minlength=nbins
            ).reshape(self._dense_shape)
            self._sumw2[sparse_key][:] += numpy.bincount(
                xy, weights=weight**2, minlength=nbins
            ).reshape(self._dense_shape)
        else:
            counts = numpy.bincount(xy, weights=None, minlength=nbins).reshape(
                self._dense_shape
            )
            self._sumw[sparse_key][:] += counts
            if self._sumw2 is not None:
                # unit weights: the squared weights equal the counts
                self._sumw2[sparse_key][:] += counts
    else:
        if weight is not None:
            self._sumw[sparse_key] += numpy.sum(weight)
            self._sumw2[sparse_key] += numpy.sum(weight**2)
        else:
            self._sumw[sparse_key] += 1.0
            if self._sumw2 is not None:
                self._sumw2[sparse_key] += 1.0
+ Default is 'none'. + """ + overflow = kwargs.pop("overflow", "none") + axes = [self.axis(ax) for ax in axes] + reduced_dims = [ax for ax in self._axes if ax not in axes] + out = Hist(self._label, *reduced_dims, dtype=self._dtype) + if self._sumw2 is not None: + out._init_sumw2() + + sparse_drop = [] + dense_slice = [slice(None)] * self.dense_dim() + dense_sum_dim = [] + for axis in axes: + if isinstance(axis, DenseAxis): + idense = self._idense(axis) + dense_sum_dim.append(idense) + dense_slice[idense] = overflow_behavior(overflow) + elif isinstance(axis, SparseAxis): + isparse = self._isparse(axis) + sparse_drop.append(isparse) + dense_slice = tuple(dense_slice) + dense_sum_dim = tuple(dense_sum_dim) + + def dense_op(array): + if len(dense_sum_dim) > 0: + return numpy.sum(array[dense_slice], axis=dense_sum_dim) + return array + + for key in self._sumw.keys(): + new_key = tuple(k for i, k in enumerate(key) if i not in sparse_drop) + if new_key in out._sumw: + out._sumw[new_key] += dense_op(self._sumw[key]) + if self._sumw2 is not None: + out._sumw2[new_key] += dense_op(self._sumw2[key]) + else: + out._sumw[new_key] = dense_op(self._sumw[key]).copy() + if self._sumw2 is not None: + out._sumw2[new_key] = dense_op(self._sumw2[key]).copy() + return out + + def project(self, *axes, **kwargs): + """Project histogram onto a subset of its axes + + Parameters + ---------- + ``*axes`` : str or Axis + Positional list of axes to project on to + overflow : str + Controls behavior of integration over remaining axes. 
def integrate(self, axis_name, int_range=slice(None), overflow="none"):
    """Integrate this histogram along a single dimension

    Parameters
    ----------
        axis_name : str or Axis
            Which dimension to reduce on
        int_range : slice
            Any slice, list, string, or other object that the axis will understand
            Default is to integrate over the whole range
        overflow : str
            See `sum` description for meaning of allowed values
            Default is to *not include* overflow bins

    When ``int_range`` is an `Interval`, ``overflow`` is overridden: a nan
    interval selects ``'justnan'``, a lower edge of ``-inf`` selects ``'under'``
    and otherwise an upper edge of ``+inf`` selects ``'over'``.

    Returns the sliced histogram summed over the chosen axis.
    """
    target = self.axis(axis_name)

    if isinstance(int_range, Interval):
        # Handle overflow intervals nicely
        if int_range.nan():
            overflow = "justnan"
        elif int_range.lo == -numpy.inf:
            overflow = "under"
        elif int_range.hi == numpy.inf:
            overflow = "over"

    selection = []
    for ax in self._axes:
        selection.append(slice(None) if ax != target else int_range)

    # slicing may replace the axis object, so the sum is addressed by name
    sliced = self[tuple(selection)]
    return sliced.sum(target.name, overflow=overflow)
def group(self, old_axes, new_axis, mapping, overflow="none"):
    """Group a set of slices on old axes into a single new axis

    Parameters
    ----------
        old_axes
            Axis or tuple of axes which are being grouped
        new_axis
            A new sparse dimension definition, e.g. a `Cat` instance
        mapping : dict
            A mapping ``{'new_bin': (slice, ...), ...}`` where each
            slice is on the axes being re-binned, given in the same order
            as ``old_axes``.  In the case of a single axis for ``old_axes``,
            ``{'new_bin': slice, ...}`` is admissible.
        overflow : str
            See `sum` description for meaning of allowed values
            Default is to *not include* overflow bins

    Raises
    ------
        TypeError
            If ``new_axis`` is not a sparse axis.
        RuntimeError
            If ``new_axis`` is already one of this histogram's axes.
        ValueError
            If a mapping entry does not provide one slice per grouped axis.

    Returns a new histogram object
    """
    if not isinstance(new_axis, SparseAxis):
        raise TypeError(
            "New axis must be a sparse axis. Note: Hist.group() signature has changed to group(old_axes, new_axis, ...)!"
        )
    if new_axis in self.axes() and self.axis(new_axis) is new_axis:
        raise RuntimeError(
            "new_axis is already in the list of axes. Note: Hist.group() signature has changed to group(old_axes, new_axis, ...)!"
        )
    if not isinstance(old_axes, tuple):
        old_axes = (old_axes,)
    old_axes = [self.axis(ax) for ax in old_axes]
    # Position of each grouped axis, kept in the order of ``old_axes`` so the
    # i-th slice of a mapping entry is applied to the i-th axis requested.
    all_axes = list(self._axes)
    old_indices = [all_axes.index(ax) for ax in old_axes]
    new_dims = [new_axis] + [ax for ax in self._axes if ax not in old_axes]
    out = Hist(self._label, *new_dims, dtype=self._dtype)
    if self._sumw2 is not None:
        out._init_sumw2()
    for new_cat, the_slice in mapping.items():
        if not isinstance(the_slice, tuple):
            the_slice = (the_slice,)
        if len(the_slice) != len(old_axes):
            raise ValueError("Slicing does not match number of axes being rebinned")
        full_slice = [slice(None)] * self.dim()
        for idx, s in zip(old_indices, the_slice):
            full_slice[idx] = s
        full_slice = tuple(full_slice)
        reduced_hist = self[full_slice].sum(
            *tuple(ax.name for ax in old_axes), overflow=overflow
        )  # slice may change old axis binning
        new_idx = new_axis.index(new_cat)
        for key in reduced_hist._sumw:
            # the new category index is prepended, matching new_dims
            new_key = (new_idx,) + key
            out._sumw[new_key] = reduced_hist._sumw[key]
            if self._sumw2 is not None:
                out._sumw2[new_key] = reduced_hist._sumw2[key]
    return out
def values(self, sumw2=False, overflow="none"):
    """Extract the sum of weights arrays from this histogram

    Parameters
    ----------
        sumw2 : bool
            If True, each value is a tuple of arrays (sum weights, sum squared weights);
            when no squared weights are stored, the sum of weights is used for both
        overflow
            See `sum` description for meaning of allowed values

    Returns a mapping ``{(sparse identifier, ...): numpy.array(...), ...}``
    where each array has dimension `dense_dim` and shape matching
    the number of bins per axis, plus 0-3 overflow bins depending
    on the ``overflow`` argument.
    """
    ndense = self.dense_dim()
    sparse_axes = self.sparse_axes()

    def visible(array):
        # Restrict every dense dimension to the requested overflow range
        if ndense == 0:
            return array
        return array[tuple(overflow_behavior(overflow) for _ in range(ndense))]

    variances = self._sumw if self._sumw2 is None else self._sumw2

    result = {}
    for key, weights in self._sumw.items():
        identifiers = tuple(ax[idx] for ax, idx in zip(sparse_axes, key))
        if sumw2:
            result[identifiers] = (visible(weights), visible(variances[key]))
        else:
            result[identifiers] = visible(weights)
    return result
def identifiers(self, axis, overflow="none"):
    """Return a list of identifiers for an axis

    Parameters
    ----------
        axis
            Axis name or Axis object
        overflow
            See `sum` description for meaning of allowed values

    For a sparse axis only identifiers that appear in at least one filled
    key are returned (sorted by integral when the axis requests it); for a
    dense axis the request is forwarded to the axis itself.
    """
    axis = self.axis(axis)
    if not isinstance(axis, SparseAxis):
        if isinstance(axis, DenseAxis):
            return axis.identifiers(overflow=overflow)
        return None

    position = self._isparse(axis)
    populated = [
        ident
        for ident in axis.identifiers()
        if any(key[position] == axis.index(ident) for key in self._sumw)
    ]
    if axis.sorting == "integral":
        totals = {}
        for key, integral in self.project(axis).values().items():
            totals[key[0]] = integral
        populated.sort(key=lambda ident: totals[ident.name])
    return populated
# Central coverage of a +/- 1 standard deviation band of a normal distribution
_coverage1sd = scipy.stats.norm.cdf(1) - scipy.stats.norm.cdf(-1)


def poisson_interval(sumw, sumw2, coverage=_coverage1sd):
    """Frequentist coverage interval for Poisson-distributed observations

    Parameters
    ----------
        sumw : numpy.ndarray
            Sum of weights vector
        sumw2 : numpy.ndarray
            Sum weights squared vector
        coverage : float, optional
            Central coverage interval, defaults to 68%

    Returns
    -------
        interval : numpy.ndarray
            Array stacking the lower and upper bounds along a new first axis.

    Calculates the so-called 'Garwood' interval,
    c.f. https://www.ine.pt/revstat/pdf/rs120203.pdf or
    http://ms.mcmaster.ca/peter/s743/poissonalpha.html
    For weighted data, this approximates the observed count by ``sumw**2/sumw2``, which
    effectively scales the unweighted poisson interval by the average weight.
    This may not be the optimal solution: see https://arxiv.org/pdf/1309.1287.pdf for a proper treatment.
    When a bin is zero, the scale of the nearest nonzero bin is substituted to scale the nominal upper bound.
    If all bins zero, a warning is generated and interval is set to ``sumw``.
    """
    # Floating-point storage: the average weight is generally not an integer,
    # even when the inputs are integer arrays.
    scale = numpy.empty_like(sumw, dtype=float)
    nonzero = sumw != 0
    scale[nonzero] = sumw2[nonzero] / sumw[nonzero]
    if numpy.sum(sumw == 0) > 0:
        missing = numpy.where(sumw == 0)
        available = numpy.nonzero(sumw)
        if len(available[0]) == 0:
            warnings.warn(
                "All sumw are zero!  Cannot compute meaningful error bars",
                RuntimeWarning,
            )
            return numpy.vstack([sumw, sumw])
        # For every empty bin, find the filled bin at the smallest squared
        # index distance and borrow its scale.
        nearest = sum(
            [numpy.subtract.outer(d, d0) ** 2 for d, d0 in zip(available, missing)]
        ).argmin(axis=0)
        argnearest = tuple(dim[nearest] for dim in available)
        scale[missing] = scale[argnearest]
    counts = sumw / scale
    lo = scale * scipy.stats.chi2.ppf((1 - coverage) / 2, 2 * counts) / 2.0
    hi = scale * scipy.stats.chi2.ppf((1 + coverage) / 2, 2 * (counts + 1)) / 2.0
    interval = numpy.array([lo, hi])
    # chi2.ppf produces nan for counts=0; NaN never compares equal to itself,
    # so the mask has to come from isnan rather than ``== numpy.nan``.
    interval[numpy.isnan(interval)] = 0.0
    return interval
http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval + """ + if numpy.any(num > denom): + raise ValueError( + "Found numerator larger than denominator while calculating binomial uncertainty" + ) + lo = scipy.stats.beta.ppf((1 - coverage) / 2, num, denom - num + 1) + hi = scipy.stats.beta.ppf((1 + coverage) / 2, num + 1, denom - num) + interval = numpy.array([lo, hi]) + interval[:, num == 0.0] = 0.0 + interval[1, num == denom] = 1.0 + return interval + + +def normal_interval(pw, tw, pw2, tw2, coverage=_coverage1sd): + """Compute errors based on the expansion of pass/(pass + fail), possibly weighted + + Parameters + ---------- + pw : numpy.ndarray + Numerator, or number of (weighted) successes, vectorized + tw : numpy.ndarray + Denominator or number of (weighted) trials, vectorized + pw2 : numpy.ndarray + Numerator sum of weights squared, vectorized + tw2 : numpy.ndarray + Denominator sum of weights squared, vectorized + coverage : float, optional + Central coverage interval, defaults to 68% + + c.f. 
def plot1d(
    hist,
    ax=None,
    clear=True,
    overlay=None,
    stack=False,
    overflow="none",
    line_opts=None,
    fill_opts=None,
    error_opts=None,
    legend_opts=None,
    overlay_overflow="none",
    density=False,
    binwnorm=None,
    order=None,
):
    """Create a 1D plot from a 1D or 2D `Hist` object

    Parameters
    ----------
        hist : Hist
            Histogram with maximum of two dimensions
        ax : matplotlib.axes.Axes, optional
            Axes object (if None, the current Axes is used)
        clear : bool, optional
            Whether to clear Axes before drawing (if passed)
        overlay : str, optional
            In the case that ``hist`` is 2D, specify the axis of hist to overlay (remaining axis will be x axis)
        stack : bool, optional
            Whether to stack or overlay non-axis dimension (if it exists)
        order : list, optional
            How to order when stacking. Take a list of identifiers.
        overflow : str, optional
            If overflow behavior is not 'none', extra bins will be drawn on either end of the nominal
            axis range, to represent the contents of the overflow bins.  See `Hist.sum` documentation
            for a description of the options.
        line_opts : dict, optional
            A dictionary of options forwarded to the histogram drawing call when
            drawing steps.  Leave blank for defaults.
        fill_opts : dict, optional
            A dictionary of options forwarded to the histogram drawing call when
            drawing filled histograms.  Leave blank for defaults.
        error_opts : dict, optional
            A dictionary of options forwarded to the histogram drawing call when
            drawing error bars, and to ``ax.fill_between`` for the summed
            uncertainty band of a stack.  Leave blank for defaults.
        legend_opts : dict, optional
            A dictionary of options to pass to the matplotlib ``ax.legend`` call
            internal to this function.  ``None`` (the default) and an empty
            dictionary both mean "use the defaults".
        overlay_overflow : str, optional
            If overflow behavior is not 'none', extra bins in the overlay axis will be overlaid or stacked,
            to represent the contents of the overflow bins.  See `Hist.sum` documentation for a description of the options.
        density : bool, optional
            If true, convert sum weights to probability density (i.e. integrates to 1 over domain of axis)
            (Note: this option conflicts with ``binwnorm``)
        binwnorm : float, optional
            If given, convert sum weights to bin-width-normalized, with unit equal to supplied value (usually you want to specify 1.)

    Raises
    ------
        ValueError
            For a non-Axes ``ax``, more than two dimensions, a missing overlay
            axis, ``density`` together with ``binwnorm``, or a non-numeric ``binwnorm``.
        NotImplementedError
            If the axis to draw is a sparse axis.

    Returns
    -------
        ax : matplotlib.axes.Axes
            A matplotlib Axes object
    """
    import matplotlib.pyplot as plt
    import mplhep as hep

    if ax is None:
        ax = plt.gca()
    else:
        if not isinstance(ax, plt.Axes):
            raise ValueError("ax must be a matplotlib Axes object")
        if clear:
            ax.clear()
    if hist.dim() > 2:
        raise ValueError(
            "plot1d() can only support up to two dimensions (one for axis, one to stack or overlay)"
        )
    if overlay is None and hist.sparse_dim() == 1 and hist.dense_dim() == 1:
        overlay = hist.sparse_axes()[0].name
    elif overlay is None and hist.dim() > 1:
        raise ValueError(
            "plot1d() can only support one dimension without an overlay axis chosen"
        )
    if density and binwnorm is not None:
        raise ValueError("Cannot use density and binwnorm at the same time!")
    if binwnorm is not None:
        if not isinstance(binwnorm, numbers.Number):
            raise ValueError(
                "Bin width normalization not a number, but a %r" % binwnorm.__class__
            )
    if line_opts is None and fill_opts is None and error_opts is None:
        # Nothing requested: filled stack, or steps with error bars
        if stack:
            fill_opts = {}
        else:
            line_opts = {}
            error_opts = {}

    axis = hist.axes()[0]
    if overlay is not None:
        overlay = hist.axis(overlay)
        if axis == overlay:
            axis = hist.axes()[1]
    if isinstance(axis, SparseAxis):
        raise NotImplementedError("Plot a sparse axis (e.g. bar chart)")
    elif isinstance(axis, DenseAxis):
        ax.set_xlabel(axis.label)
        ax.set_ylabel(hist.label)
        edges = axis.edges(overflow=overflow)
        if order is None:
            identifiers = (
                hist.identifiers(overlay, overflow=overlay_overflow)
                if overlay is not None
                else [None]
            )
        else:
            identifiers = order
        plot_info = {
            "identifier": identifiers,
            "label": list(map(str, identifiers)),
            "sumw": [],
            "sumw2": [],
        }
        for i, identifier in enumerate(identifiers):
            if identifier is None:
                sumw, sumw2 = hist.values(sumw2=True, overflow=overflow)[()]
            elif isinstance(overlay, SparseAxis):
                sumw, sumw2 = hist.integrate(overlay, identifier).values(
                    sumw2=True, overflow=overflow
                )[()]
            else:
                # Dense overlay: pick row i of the 2D array, shifted by one
                # when the underflow bin is not part of the overlay range.
                sumw, sumw2 = hist.values(sumw2=True, overflow="allnan")[()]
                the_slice = (
                    i if overflow_behavior(overlay_overflow).start is None else i + 1,
                    overflow_behavior(overflow),
                )
                if hist._idense(overlay) == 1:
                    the_slice = (the_slice[1], the_slice[0])
                sumw = sumw[the_slice]
                sumw2 = sumw2[the_slice]
            plot_info["sumw"].append(sumw)
            plot_info["sumw2"].append(sumw2)

        def w2err(sumw, sumw2):
            # Distance of the interval bounds from the central value, per histogram
            return [numpy.abs(poisson_interval(a, b) - a) for a, b in zip(sumw, sumw2)]

        kwargs = None
        if line_opts is not None and error_opts is None:
            _error = None
        else:
            _error = w2err(plot_info["sumw"], plot_info["sumw2"])
        if fill_opts is not None:
            histtype = "fill"
            kwargs = fill_opts
        elif error_opts is not None and line_opts is None:
            histtype = "errorbar"
            kwargs = error_opts
        else:
            histtype = "step"
            kwargs = line_opts
        if kwargs is None:
            kwargs = {}

        hep.histplot(
            plot_info["sumw"],
            edges,
            label=plot_info["label"],
            yerr=_error,
            histtype=histtype,
            ax=ax,
            density=density,
            binwnorm=binwnorm,
            stack=stack,
            **kwargs,
        )

        if stack and error_opts is not None:
            stack_sumw = numpy.sum(plot_info["sumw"], axis=0)
            stack_sumw2 = numpy.sum(plot_info["sumw2"], axis=0)
            err = poisson_interval(stack_sumw, stack_sumw2)
            if binwnorm is not None:
                err *= binwnorm / numpy.diff(edges)[None, :]
            opts = {
                "step": "post",
                "label": "Sum unc.",
                "hatch": "///",
                "facecolor": "none",
                "edgecolor": (0, 0, 0, 0.5),
                "linewidth": 0,
            }
            opts.update(error_opts)
            # the last value is repeated so the "post" step covers the final bin
            ax.fill_between(
                x=edges,
                y1=numpy.r_[err[0, :], err[0, -1]],
                y2=numpy.r_[err[1, :], err[1, -1]],
                **opts,
            )

        # The title is resolved before looking at legend_opts so it is bound
        # regardless of whether any legend options were supplied.
        _label = overlay.label if overlay is not None else ""
        ax.legend(title=_label, **(legend_opts or {}))
        ax.autoscale(axis="x", tight=True)
        ax.set_ylim(0, None)

    return ax
+ denom_fill_opts : dict, optional + A dictionary of options to pass to the matplotlib + `ax.fill_between `_ call + internal to this function, filling the denominator uncertainty band. Leave blank for defaults. + guide_opts : dict, optional + A dictionary of options to pass to the matplotlib + `ax.axhline `_ call + internal to this function, to plot a horizontal guide line at ratio of 1. Leave blank for defaults. + unc : str, optional + Uncertainty calculation option: 'clopper-pearson' interval for efficiencies; 'poisson-ratio' interval + for ratio of poisson distributions; 'num' poisson interval of numerator scaled by denominator value + (common for data/mc, for better or worse). + label : str, optional + Associate a label to this entry (note: y axis label set by ``num.label``) + + Returns + ------- + ax : matplotlib.axes.Axes + A matplotlib `Axes `_ object + """ + import matplotlib.pyplot as plt + + if ax is None: + fig, ax = plt.subplots(1, 1) + else: + if not isinstance(ax, plt.Axes): + raise ValueError("ax must be a matplotlib Axes object") + if clear: + ax.clear() + if not num.compatible(denom): + raise ValueError( + "numerator and denominator histograms have incompatible axis definitions" + ) + if num.dim() > 1: + raise ValueError("plotratio() can only support one-dimensional histograms") + if error_opts is None and denom_fill_opts is None and guide_opts is None: + error_opts = {} + denom_fill_opts = {} + + axis = num.axes()[0] + if isinstance(axis, SparseAxis): + raise NotImplementedError("Ratio for sparse axes (labeled axis with errorbars)") + elif isinstance(axis, DenseAxis): + ax.set_xlabel(axis.label) + ax.set_ylabel(num.label) + edges = axis.edges(overflow=overflow) + centers = axis.centers(overflow=overflow) + ranges = (edges[1:] - edges[:-1]) / 2 if xerr else None + + sumw_num, sumw2_num = num.values(sumw2=True, overflow=overflow)[()] + sumw_denom, sumw2_denom = denom.values(sumw2=True, overflow=overflow)[()] + + rsumw = sumw_num / sumw_denom + if 
def plot2d(
    hist,
    xaxis,
    ax=None,
    clear=True,
    xoverflow="none",
    yoverflow="none",
    patch_opts=None,
    text_opts=None,
    density=False,
    binwnorm=None,
):
    """Create a 2D plot from a 2D `Hist` object

    Parameters
    ----------
    hist : Hist
        Histogram with two dimensions
    xaxis : str or Axis
        Which of the two dimensions to use as an x axis
    ax : matplotlib.axes.Axes, optional
        Axes object (if None, one is created)
    clear : bool, optional
        Whether to clear Axes before drawing (if passed); if False, this function
        will also skip drawing the colorbar and setting the axis labels and limits
    xoverflow : str, optional
        If overflow behavior is not 'none', extra bins will be drawn on either end of
        the nominal x axis range, to represent the contents of the overflow bins.
        See `Hist.sum` documentation for a description of the options.
    yoverflow : str, optional
        Similar to ``xoverflow``
    patch_opts : dict, optional
        Options passed to the matplotlib ``pcolormesh`` call internal to this
        function, to plot a rectangular grid of patches colored according to the
        bin values. Leave empty for defaults.
    text_opts : dict, optional
        Options passed to the matplotlib ``text`` call internal to this function,
        to place a text label at each bin center with the bin value. Special
        options interpreted by this function and not passed to matplotlib:
        'format': printf-style float format, default '%.2g'.
    density : bool, optional
        If true, convert sum weights to probability density (i.e. integrates to 1
        over domain of axis) (Note: this option conflicts with ``binwnorm``)
    binwnorm : float, optional
        If not None, convert sum weights to bin-width-normalized, with unit equal
        to supplied value (usually you want to specify 1.)

    Returns
    -------
    ax : matplotlib.axes.Axes
        A matplotlib ``Axes`` object

    Raises
    ------
    ValueError
        If ``ax`` is not an Axes, the histogram is not two-dimensional, both
        ``density`` and ``binwnorm`` are given, or ``binwnorm`` is not a number.
    NotImplementedError
        If either axis is a sparse axis.
    """
    import matplotlib.pyplot as plt

    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        if not isinstance(ax, plt.Axes):
            raise ValueError("ax must be a matplotlib Axes object")
        if clear:
            ax.clear()
        fig = ax.figure
    if hist.dim() != 2:
        raise ValueError("plot2d() can only support exactly two dimensions")
    if density and binwnorm is not None:
        raise ValueError("Cannot use density and binwnorm at the same time!")
    if binwnorm is not None and not isinstance(binwnorm, numbers.Number):
        raise ValueError(
            "Bin width normalization not a number, but a %r" % binwnorm.__class__
        )
    if patch_opts is None and text_opts is None:
        # nothing requested explicitly: draw the colored mesh with defaults
        patch_opts = {}

    xaxis = hist.axis(xaxis)
    yaxis = hist.axes()[1]
    transpose = False
    if yaxis == xaxis:
        # requested x axis is stored second, so the value array must be flipped
        yaxis = hist.axes()[0]
        transpose = True
    if isinstance(xaxis, SparseAxis) or isinstance(yaxis, SparseAxis):
        raise NotImplementedError("Plot a sparse axis (e.g. bar chart or labeled bins)")

    xedges = xaxis.edges(overflow=xoverflow)
    yedges = yaxis.edges(overflow=yoverflow)
    sumw = hist.values(overflow="allnan")[()]
    if transpose:
        sumw = sumw.T
    # no support for different overflow behavior per axis, do it ourselves
    sumw = sumw[overflow_behavior(xoverflow), overflow_behavior(yoverflow)]
    if (density or binwnorm is not None) and numpy.sum(sumw) > 0:
        overallnorm = numpy.sum(sumw) * binwnorm if binwnorm is not None else 1.0
        areas = numpy.multiply.outer(numpy.diff(xedges), numpy.diff(yedges))
        binnorms = overallnorm / (areas * numpy.sum(sumw))
        sumw = sumw * binnorms

    if patch_opts is not None:
        opts = {"cmap": "viridis"}
        opts.update(patch_opts)
        # pcolormesh already attaches the mesh to ``ax``; adding it a second
        # time would register (and draw) the same artist twice.
        pc = ax.pcolormesh(xedges, yedges, sumw.T, **opts)
        if clear:
            fig.colorbar(pc, ax=ax, label=hist.label)
    if text_opts is not None:
        # Use the same overflow selection as ``sumw`` so that label index
        # (ix, iy) and value index refer to the same bin.
        xcenters = xaxis.centers(overflow=xoverflow)
        ycenters = yaxis.centers(overflow=yoverflow)
        for ix, xcenter in enumerate(xcenters):
            for iy, ycenter in enumerate(ycenters):
                opts = {
                    "horizontalalignment": "center",
                    "verticalalignment": "center",
                }
                if patch_opts is not None:
                    # pick a text color that contrasts with the patch below it
                    opts["color"] = (
                        "black" if pc.norm(sumw[ix, iy]) > 0.5 else "lightgrey"
                    )
                opts.update(text_opts)
                txtformat = opts.pop("format", r"%.2g")
                ax.text(xcenter, ycenter, txtformat % sumw[ix, iy], **opts)

    if clear:
        ax.set_xlabel(xaxis.label)
        ax.set_ylabel(yaxis.label)
        ax.set_xlim(xedges[0], xedges[-1])
        ax.set_ylim(yedges[0], yedges[-1])

    return ax
def plotgrid(
    h,
    figure=None,
    row=None,
    col=None,
    overlay=None,
    row_overflow="none",
    col_overflow="none",
    **plot_opts,
):
    """Create a grid of plots, enumerating identifiers on up to 3 axes

    Parameters
    ----------
    h : Hist
        A histogram with up to 3 axes
    figure : matplotlib.figure.Figure, optional
        If omitted, a new figure is created. Otherwise, the axes will be redrawn
        on this existing figure (its subplot grid is reused when it already has
        the required shape, otherwise the figure is cleared and re-populated).
    row : str
        Name of row axis
    col : str
        Name of column axis
    overlay : str
        name of overlay axis
    row_overflow : str, optional
        Overflow behavior used when enumerating the identifiers of the row axis.
        See `Hist.sum` documentation for a description of the options.
    col_overflow : str, optional
        Similar to ``row_overflow``, for the column axis
    ``**plot_opts`` : kwargs
        The remaining axis of the histogram, after removing any of
        ``row,col,overlay`` specified, will be the plot axis, with ``plot_opts``
        passed to the `plot1d` call.

    Returns
    -------
    axes : numpy.ndarray
        An array of matplotlib ``Axes`` objects

    Raises
    ------
    ValueError
        If zero or more than one axis remains after removing ``row``, ``col``
        and ``overlay``.
    """
    import matplotlib.pyplot as plt

    haxes = {ax.name for ax in h.axes()}
    nrow, ncol = 1, 1
    if row:
        row_identifiers = h.identifiers(row, overflow=row_overflow)
        nrow = len(row_identifiers)
        haxes.remove(row)
    if col:
        col_identifiers = h.identifiers(col, overflow=col_overflow)
        ncol = len(col_identifiers)
        haxes.remove(col)
    if overlay:
        haxes.remove(overlay)
    if len(haxes) > 1:
        raise ValueError(
            "More than one dimension left: {}".format(",".join(sorted(haxes)))
        )
    elif len(haxes) == 0:
        raise ValueError("Not enough dimensions available in %r" % h)

    figsize = plt.rcParams["figure.figsize"]
    figsize = figsize[0] * max(ncol, 1), figsize[1] * max(nrow, 1)
    if figure is None:
        fig, axes = plt.subplots(
            nrow, ncol, figsize=figsize, squeeze=False, sharex=True, sharey=True
        )
    else:
        fig = figure
        axes = None
        existing = fig.axes
        if existing and len(existing) == nrow * ncol:
            # ``Axes.rowNum`` / ``Axes.colNum`` are gone from current matplotlib;
            # the grid geometry is available from the subplot spec instead.
            get_spec = getattr(existing[-1], "get_subplotspec", None)
            spec = get_spec() if get_spec is not None else None
            if spec is not None:
                grid_rows, grid_cols = spec.get_gridspec().get_geometry()
                if (grid_rows, grid_cols) == (nrow, ncol):
                    axes = numpy.array(existing).reshape(nrow, ncol)
        if axes is None:
            fig.clear()
            # fig.set_size_inches(figsize)
            axes = fig.subplots(nrow, ncol, squeeze=False, sharex=True, sharey=True)

    for icol in range(ncol):
        hcol = h
        coltitle = None
        if col:
            vcol = col_identifiers[icol]
            hcol = h.integrate(col, vcol)
            coltitle = str(vcol)
            if isinstance(vcol, Interval) and vcol.label is None:
                coltitle = f"{h.axis(col).label} ∈ {coltitle}"
        for irow in range(nrow):
            ax = axes[irow, icol]
            hplot = hcol
            rowtitle = None
            if row:
                vrow = row_identifiers[irow]
                hplot = hcol.integrate(row, vrow)
                rowtitle = str(vrow)
                if isinstance(vrow, Interval) and vrow.label is None:
                    rowtitle = f"{h.axis(row).label} ∈ {rowtitle}"

            plot1d(hplot, ax=ax, overlay=overlay, **plot_opts)
            # same truthiness test as above, so a title is only built from
            # identifiers that were actually enumerated
            if row and col:
                ax.set_title(f"{rowtitle}, {coltitle}")
            elif row:
                ax.set_title(rowtitle)
            elif col:
                ax.set_title(coltitle)

    for ax in axes.flatten():
        ax.autoscale(axis="y")
        ax.set_ylim(0, None)

    return axes
def isnotebook():
    """Tell whether the interpreter is an IPython ZMQ kernel.

    Returns
    -------
    bool
        True only when ``get_ipython()`` exists and its class is named
        ``ZMQInteractiveShell`` (Jupyter notebook or qtconsole); False for a
        terminal IPython session, any other shell type, or when
        ``get_ipython`` is not defined at all.
    """
    try:
        shell_name = get_ipython().__class__.__name__  # noqa: F821
    except NameError:
        # get_ipython is not defined: not running under IPython
        return False
    return shell_name == "ZMQInteractiveShell"
def bokeh_plot(histo, jup_url="http://127.0.0.1:8889"):
    """Show an interactive bokeh 1D projection browser for ``histo`` in a notebook.

    Widgets let the user choose the overlay (sparse) axis and the plotted
    (dense) axis, toggle categories, and restrict dense axes with range
    sliders. A second tab holds per-axis display/ghost toggles.

    Parameters
    ----------
    histo : Hist
        Histogram to browse; its ``sparse_axes()`` and ``dense_axes()`` drive
        the widgets that are created.
    jup_url : str, optional
        Passed to bokeh ``show`` as ``notebook_url``.

    Raises
    ------
    NotImplementedError
        If `isnotebook` reports that no notebook kernel is running.
    """
    if not isnotebook():
        raise NotImplementedError("Only usable in jupyter notebook")
    import bokeh.plotting.figure as bk_figure
    from bokeh import palettes

    # init bokeh
    from bokeh.application import Application
    from bokeh.application.handlers import FunctionHandler
    from bokeh.core.validation import silence
    from bokeh.core.validation.warnings import EMPTY_LAYOUT
    from bokeh.io import output_notebook  # enables plot interface in J notebook
    from bokeh.io import show
    from bokeh.layouts import column, row
    from bokeh.models import ColumnDataSource
    from bokeh.models.widgets import (
        CheckboxButtonGroup,
        Div,
        RadioButtonGroup,
        RangeSlider,
    )

    silence(EMPTY_LAYOUT, True)

    output_notebook()

    def empty_quads():
        # fresh (unshared) empty data for a quad glyph source
        return dict(left=[], right=[], top=[], bottom=[])

    # Set up widgets
    cfg_labels = ["Ghost"]
    wi_config = CheckboxButtonGroup(labels=cfg_labels, active=[0])
    wi_dense_select = RadioButtonGroup(
        labels=[ax.name for ax in histo.dense_axes()], active=0
    )
    wi_sparse_select = RadioButtonGroup(
        labels=[ax.name for ax in histo.sparse_axes()], active=0
    )

    # Dense widgets
    sliders = {}
    for ax in histo.dense_axes():
        edges = histo.axis(ax.name).edges()
        edge_vals = (edges[0], edges[-1])
        _smallest_bin = numpy.min(numpy.diff(edges))
        sliders[ax.name] = RangeSlider(
            title=ax.name,
            value=edge_vals,
            start=edge_vals[0],
            end=edge_vals[1],
            step=_smallest_bin,
            name=ax.name,
        )

    # Cat widgets
    togglers = {}
    for ax in histo.sparse_axes():
        togglers[ax.name] = CheckboxButtonGroup(
            labels=[i.name for i in ax.identifiers()], active=[0], name=ax.name
        )

    # Toggles for all widgets
    configers = {}
    for ax in histo.sparse_axes():
        configers[ax.name] = CheckboxButtonGroup(
            labels=["Display", "Ghost"], active=[0, 1], name=ax.name
        )
    for ax in histo.dense_axes():
        configers[ax.name] = CheckboxButtonGroup(
            labels=["Display"], active=[0], name=ax.name
        )

    # Figure
    fig = bk_figure(
        title="1D Projection",
        plot_width=500,
        plot_height=500,
        min_border=20,
        toolbar_location=None,
    )
    fig.yaxis.axis_label = "N"
    fig.xaxis.axis_label = "Quantity"

    # Iterate over possible overlays
    _max_idents = 0  # Max number of simultaneous histograms
    for ax in histo.sparse_axes():
        _max_idents = max(_max_idents, len([i.name for i in ax.identifiers()]))

    # Data source list
    sources = [ColumnDataSource(empty_quads()) for _ in range(_max_idents)]
    sources_ghost = [ColumnDataSource(empty_quads()) for _ in range(_max_idents)]

    # Hist list
    hists = []
    hists_ghost = []
    for i in range(_max_idents):
        if _max_idents < 10:
            _color = palettes.Category10[min(max(3, _max_idents), 10)][i]
        else:
            _color = palettes.magma(_max_idents)[i]
        hists.append(
            fig.quad(
                left="left",
                right="right",
                top="top",
                bottom="bottom",
                source=sources[i],
                alpha=0.9,
                color=_color,
            )
        )
        hists_ghost.append(
            fig.quad(
                left="left",
                right="right",
                top="top",
                bottom="bottom",
                source=sources_ghost[i],
                alpha=0.05,
                color=_color,
            )
        )

    def update_data(attrname, old, new):
        sparse_active = wi_sparse_select.active
        sparse_name = [ax.name for ax in histo.sparse_axes()][sparse_active]
        sparse_other = [ax.name for ax in histo.sparse_axes() if ax.name != sparse_name]

        dense_active = wi_dense_select.active
        dense_name = [ax.name for ax in histo.dense_axes()][dense_active]
        dense_other = [ax.name for ax in histo.dense_axes() if ax.name != dense_name]

        # Apply cuts in projections
        _h = histo.copy()
        for proj_ax in sparse_other:
            _idents = histo.axis(proj_ax).identifiers()
            _labels = [ident.name for ident in _idents]
            if 0 in configers[proj_ax].active:
                _h = _h.integrate(
                    proj_ax, [_labels[i] for i in togglers[proj_ax].active]
                )
            else:
                _h = _h.integrate(proj_ax)

        for proj_ax in dense_other:
            _h = _h.integrate(
                proj_ax, slice(sliders[proj_ax].value[0], sliders[proj_ax].value[1])
            )

        for cat_ix in range(_max_idents):
            # Update histo for each toggled overlay
            if cat_ix in togglers[sparse_name].active:
                cat_value = histo.axis(sparse_name).identifiers()[cat_ix]
                h1d = _h.integrate(sparse_name, cat_value)

                # Get shown histogram
                values = h1d.project(dense_name).values()
                if values != {}:
                    h = values[()]
                    bins = h1d.axis(dense_name).edges()

                    # Apply cuts on shown axis
                    bin_los = bins[:-1][bins[:-1] > sliders[dense_name].value[0]]
                    bin_his = bins[1:][bins[1:] < sliders[dense_name].value[1]]
                    new_bins = numpy.intersect1d(bin_los, bin_his)
                    bin_ixs = numpy.searchsorted(bins, new_bins)[:-1]
                    h = h[bin_ixs]

                    sources[cat_ix].data = dict(
                        left=new_bins[:-1],
                        right=new_bins[1:],
                        top=h,
                        bottom=numpy.zeros_like(h),
                    )
                else:
                    sources[cat_ix].data = empty_quads()

                # Add ghosts
                if 0 in wi_config.active:
                    h1d = histo.integrate(sparse_name, cat_value)
                    for proj_ax in sparse_other:
                        _idents = histo.axis(proj_ax).identifiers()
                        _labels = [ident.name for ident in _idents]
                        if 1 not in configers[proj_ax].active:
                            h1d = h1d.integrate(
                                proj_ax, [_labels[i] for i in togglers[proj_ax].active]
                            )
                        else:
                            h1d = h1d.integrate(proj_ax)
                    values = h1d.project(dense_name).values()
                    if values != {}:
                        h = values[()]
                        bins = h1d.axis(dense_name).edges()
                        sources_ghost[cat_ix].data = dict(
                            left=bins[:-1],
                            right=bins[1:],
                            top=h,
                            bottom=numpy.zeros_like(h),
                        )
                    else:
                        sources_ghost[cat_ix].data = empty_quads()
            else:
                sources[cat_ix].data = empty_quads()
                sources_ghost[cat_ix].data = empty_quads()

        # Cosmetics
        fig.xaxis.axis_label = dense_name

    for slider in sliders.values():
        slider.on_change("value", update_data)
    for toggler in togglers.values():
        toggler.on_change("active", update_data)
    for configurer in configers.values():
        configurer.on_change("active", update_data)
    # Button
    for w in [wi_dense_select, wi_sparse_select, wi_config]:
        w.on_change("active", update_data)

    from bokeh.models.widgets import Panel, Tabs

    layout = row(
        fig,
        column(
            Div(
                text="Overlay Axis:",
                style={"font-size": "100%", "color": "black"},
            ),
            wi_sparse_select,
            Div(
                text="Plot Axis:", style={"font-size": "100%", "color": "black"}
            ),
            wi_dense_select,
            Div(
                text="Categorical Cuts:",
                style={"font-size": "100%", "color": "black"},
            ),
            *togglers.values(),
            Div(
                text="Dense Cuts:", style={"font-size": "100%", "color": "black"}
            ),
            *sliders.values(),
        ),
    )

    # Config prep
    incl_lists = [[], [], []]
    # Number of axes per column. ``len / 3`` is a float, so the quotient has to
    # be converted back to int before it can be used as a list index.
    per_column = max(5, len(configers) / 3)
    for i, key in enumerate(configers):
        target = incl_lists[int(i // per_column)]
        target.append(
            Div(
                text=f"{key}:",
                style={"font-size": "70%", "color": "black"},
            )
        )
        target.append(configers[key])

    layout_cfgs = column(
        row(
            column(
                Div(
                    text="Configs:",
                    style={"font-size": "100%", "color": "black"},
                ),
                wi_config,
            )
        ),
        Div(
            text="Axis togglers:", style={"font-size": "100%", "color": "black"}
        ),
        row(
            column(incl_lists[0]),
            column(incl_lists[1]),
            column(incl_lists[2]),
        ),
    )

    # Update active buttons
    def update_layout(attrname, old, new):
        # None is kept in the list so unnamed children (the Div captions and
        # selector buttons) always stay visible
        active_axes = [None]
        for name, wi in configers.items():
            if 0 in wi.active:
                active_axes.append(name)
        for child in layout.children[1].children:
            child.visible = child.name in active_axes

    for configurer in configers.values():
        configurer.on_change("active", update_layout)

    tab1 = Panel(child=layout, title="Projection")
    tab2 = Panel(child=layout_cfgs, title="Configs")
    tabs = Tabs(tabs=[tab1, tab2])

    def modify_doc(doc):
        doc.add_root(row(tabs, width=800))
        doc.title = "Sliders"

    handler = FunctionHandler(modify_doc)
    app = Application(handler)

    show(app, notebook_url=jup_url)
    # populate the data sources once so the initial view is not empty
    update_data("", "", "")
def fill_lepton_kinematics():
    """Build the electron/muon (flavor, pt, eta) histogram used by the plot tests.

    Reads the ``events`` tree of ``HZZ.root`` from the current directory,
    zips the ``Electron_*`` and ``Muon_*`` branches into ``Candidate`` records
    and fills one histogram with both flavors.

    Returns
    -------
    Hist
        Histogram labelled "Events" with a ``flavor`` category axis and
        ``pt`` / ``eta`` bin axes.
    """
    import awkward as ak
    import uproot

    from coffea.nanoevents.methods import candidate

    ak.behavior.update(candidate.behavior)

    # histogram creation and manipulation
    # (the package under test is the one this change adds under coffea.jitters)
    from coffea.jitters import hist

    def read_candidates(tree, prefix):
        # e.g. "Electron_Px" -> "x", "Electron_E" -> "t", "Electron_Charge" -> "charge"
        arrays = {
            k.replace(prefix, "").strip("P").replace("E", "t").lower(): v
            for k, v in tree.arrays(filter_name=prefix + "*", how=dict).items()
        }
        return ak.zip(arrays, with_name="Candidate")

    # arrays are read eagerly, so the file can be closed right afterwards
    with uproot.open("HZZ.root") as fin:
        tree = fin["events"]
        electrons = read_candidates(tree, "Electron_")
        muons = read_candidates(tree, "Muon_")

    # Two types of axes exist presently: bins and categories
    lepton_kinematics = hist.Hist(
        "Events",
        hist.Cat("flavor", "Lepton flavor"),
        hist.Bin("pt", "$p_{T}$", 19, 10, 100),
        hist.Bin("eta", r"$\eta$", [-2.5, -1.4, 0, 1.4, 2.5]),
    )

    # Pass keyword arguments to fill, all arrays must be flat numpy arrays
    # User is responsible for ensuring all arrays have same jagged structure!
    lepton_kinematics.fill(
        flavor="electron", pt=ak.flatten(electrons.pt), eta=ak.flatten(electrons.eta)
    )
    lepton_kinematics.fill(
        flavor="muon", pt=ak.flatten(muons.pt), eta=ak.flatten(muons.eta)
    )

    return lepton_kinematics
@pytest.mark.mpl_image_compare(style="default", remove_text=True)
def test_plot1d():
    """Image-compare a stacked 1D pt plot overlaid by lepton flavor."""
    # histogram creation and manipulation
    # matplotlib
    import matplotlib.pyplot as plt

    plt.switch_backend("agg")

    # exercise the ported package added by this change
    from coffea.jitters import hist

    lepton_kinematics = fill_lepton_kinematics()

    # looking at lepton pt for all eta
    lepton_pt = lepton_kinematics.integrate("eta", overflow="under")

    ax = hist.plot1d(
        lepton_pt,
        overlay="flavor",
        stack=True,
        fill_opts={"alpha": 0.5, "edgecolor": (0, 0, 0, 0.3)},
    )
    # all matplotlib primitives are returned, in case one wants to tweak them
    # e.g. maybe you really miss '90s graphics...

    # Clearly the yields are much different, are the shapes similar?
    # (drawn on a separate, new figure; only the stacked plot is compared)
    lepton_pt.label = "Density"
    hist.plot1d(lepton_pt, overlay="flavor", density=True)

    return ax.figure


@pytest.mark.mpl_image_compare(style="default", remove_text=True)
def test_plot2d():
    """Image-compare the 2D (eta, pt) plot of the muon histogram."""
    # histogram creation and manipulation
    # matplotlib
    import matplotlib.pyplot as plt

    from coffea.jitters import hist

    plt.switch_backend("agg")

    lepton_kinematics = fill_lepton_kinematics()

    # looking at lepton pt for all eta
    muon_kinematics = lepton_kinematics.integrate("flavor", "muon")

    ax = hist.plot2d(muon_kinematics, "eta")

    return ax.figure


def test_plotratio():
    """Smoke-test a stacked data/MC plot with a ratio panel (no image compare)."""
    # histogram creation and manipulation
    # matplotlib
    import matplotlib.pyplot as plt

    from coffea.jitters import hist

    plt.switch_backend("agg")

    lepton_kinematics = fill_lepton_kinematics()

    # Add some pseudodata to a pt histogram so we can make a nice data/mc plot
    pthist = lepton_kinematics.sum("eta")
    bin_values = pthist.axis("pt").centers()
    poisson_means = pthist.sum("flavor").values()[()]
    values = np.repeat(bin_values, np.random.poisson(poisson_means))
    pthist.fill(flavor="pseudodata", pt=values)

    # Set nicer labels, by accessing the string bins' label property
    pthist.axis("flavor").index("electron").label = "e Flavor"
    pthist.axis("flavor").index("muon").label = r"$\mu$ Flavor"
    pthist.axis("flavor").index("pseudodata").label = r"Pseudodata from e/$\mu$"

    # using regular expressions on flavor name to select just the data
    # another method would be to fill a separate data histogram
    import re

    notdata = re.compile("(?!pseudodata)")

    # make a nice ratio plot
    plt.rcParams.update(
        {
            "font.size": 14,
            "axes.titlesize": 18,
            "axes.labelsize": 18,
            "xtick.labelsize": 12,
            "ytick.labelsize": 12,
        }
    )
    fig, (ax, rax) = plt.subplots(
        2, 1, figsize=(7, 7), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
    )
    fig.subplots_adjust(hspace=0.07)

    # Here is an example of setting up a color cycler to color the various fill patches
    # http://colorbrewer2.org/#type=qualitative&scheme=Paired&n=6
    from cycler import cycler

    colors = ["#a6cee3", "#1f78b4", "#b2df8a", "#33a02c", "#fb9a99", "#e31a1c"]
    ax.set_prop_cycle(cycler(color=colors))

    fill_opts = {"edgecolor": (0, 0, 0, 0.3), "alpha": 0.8}
    error_opts = {
        "label": "Stat. Unc.",
        "hatch": "///",
        "facecolor": "none",
        "edgecolor": (0, 0, 0, 0.5),
        "linewidth": 0,
    }
    data_err_opts = {
        "linestyle": "none",
        "marker": ".",
        "markersize": 10.0,
        "color": "k",
        "elinewidth": 1,
    }

    hist.plot1d(
        pthist[notdata],
        overlay="flavor",
        ax=ax,
        clear=False,
        stack=True,
        line_opts=None,
        fill_opts=fill_opts,
        error_opts=error_opts,
    )
    hist.plot1d(
        pthist["pseudodata"],
        overlay="flavor",
        ax=ax,
        clear=False,
        error_opts=data_err_opts,
    )

    ax.autoscale(axis="x", tight=True)
    ax.set_ylim(0, None)
    ax.set_xlabel(None)
    ax.legend()

    hist.plotratio(
        pthist["pseudodata"].sum("flavor"),
        pthist[notdata].sum("flavor"),
        ax=rax,
        error_opts=data_err_opts,
        denom_fill_opts={},
        guide_opts={},
        unc="num",
    )
    rax.set_ylabel("Ratio")
    rax.set_ylim(0, 2)

    plt.text(
        0.0,
        1.0,
        "☕",
        fontsize=28,
        horizontalalignment="left",
        verticalalignment="bottom",
        transform=ax.transAxes,
    )
    plt.text(
        1.0,
        1.0,
        r"1 fb$^{-1}$ (?? TeV)",
        fontsize=16,
        horizontalalignment="right",
        verticalalignment="bottom",
        transform=ax.transAxes,
    )
@pytest.mark.mpl_image_compare(style="default", remove_text=True)
def test_plotgrid():
    """Image-compare a grid of stacked 1D plots, one row per eta bin."""
    # histogram creation and manipulation
    # matplotlib
    import matplotlib.pyplot as plt

    # exercise the ported package added by this change
    from coffea.jitters import hist

    plt.switch_backend("agg")

    lepton_kinematics = fill_lepton_kinematics()

    # Let's stack them, after defining some nice styling
    stack_fill_opts = {"alpha": 0.8, "edgecolor": (0, 0, 0, 0.5)}
    stack_error_opts = {
        "label": "Stat. Unc.",
        "hatch": "///",
        "facecolor": "none",
        "edgecolor": (0, 0, 0, 0.5),
        "linewidth": 0,
    }
    # maybe we want to compare different eta regions
    # plotgrid accepts row and column axes, and creates a grid of 1d plots as appropriate
    axs = hist.plotgrid(
        lepton_kinematics,
        row="eta",
        overlay="flavor",
        stack=True,
        fill_opts=stack_fill_opts,
        error_opts=stack_error_opts,
    )

    return axs.flatten()[0].figure
def test_clopper_pearson_interval():
    """Compare `clopper_pearson_interval` against ROOT TEfficiency references."""
    from coffea.jitters.hist.plot import clopper_pearson_interval

    # Reference values for CL=0.6800 calculated with ROOT's TEfficiency
    num = np.array([1.0, 5.0, 10.0, 10.0])
    denom = np.array([10.0, 10.0, 10.0, 437.0])
    ref_hi = np.array(
        [0.293313782248242, 0.6944224231766912, 1.0, 0.032438865381336446]
    )
    ref_lo = np.array(
        [
            0.01728422272382846,
            0.3055775768233088,
            0.8325532074018731,
            0.015839046981153772,
        ]
    )

    interval = clopper_pearson_interval(num, denom, coverage=0.68)

    threshold = 1e-6
    # two-sided relative comparison: a one-sided "ratio - 1 < threshold" check
    # would also accept results that are far too small
    np.testing.assert_allclose(interval[1, :], ref_hi, rtol=threshold, atol=0)
    np.testing.assert_allclose(interval[0, :], ref_lo, rtol=threshold, atol=0)


def test_normal_interval():
    """Compare `normal_interval` against ROOT TEfficiency references."""
    from coffea.jitters.hist.plot import normal_interval

    # Reference weighted efficiency and error from ROOTs TEfficiency

    denom = np.array(
        [
            89.01457591590004,
            2177.066076428943,
            6122.5256890981855,
            0.0,
            100.27757990710668,
        ]
    )
    num = np.array(
        [
            75.14287743709515,
            2177.066076428943,
            5193.454723043864,
            0.0,
            84.97723540536361,
        ]
    )
    denom_sumw2 = np.array(
        [94.37919737476827, 10000.0, 6463.46795877633, 0.0, 105.90898005417333]
    )
    num_sumw2 = np.array(
        [67.2202147680005, 10000.0, 4647.983931785646, 0.0, 76.01275761253757]
    )
    ref_hi = np.array(
        [0.0514643476600107, 0.0, 0.0061403263960343, np.nan, 0.0480731185500146]
    )
    ref_lo = np.array(
        [0.0514643476600107, 0.0, 0.0061403263960343, np.nan, 0.0480731185500146]
    )

    interval = normal_interval(num, denom, num_sumw2, denom_sumw2)
    threshold = 1e-6

    lo, hi = interval

    assert len(ref_hi) == len(hi)
    assert len(ref_lo) == len(lo)

    # atol=0 keeps the "reference 0.0 must be matched exactly" rule, and
    # equal_nan requires the *computed* value to be NaN wherever the reference
    # is NaN (checking the reference against itself proves nothing).
    np.testing.assert_allclose(hi, ref_hi, rtol=threshold, atol=0, equal_nan=True)
    np.testing.assert_allclose(lo, ref_lo, rtol=threshold, atol=0, equal_nan=True)
def test_hist():
    """End-to-end check of Hist filling, slicing, summing, scaling and grouping.

    The assertions are order dependent: histograms are filled, copied, scaled
    and regrouped in sequence and later expectations build on earlier state.
    """
    # NOTE(review): ``hist`` is the module-level ``from coffea import hist``;
    # this change adds the package under ``coffea.jitters.hist`` - confirm
    # which module this test is meant to exercise.
    counts, test_eta, test_pt = dummy_jagged_eta_pt()

    # a histogram without axes has no dimensions and no values
    h_nothing = hist.Hist("empty inside")
    assert h_nothing.sparse_dim() == h_nothing.dense_dim() == 0
    assert h_nothing.values() == {}

    h_regular_bins = hist.Hist(
        "regular joe", hist.Bin("x", "x", 20, 0, 200), hist.Bin("y", "why", 20, -3, 3)
    )
    h_regular_bins.fill(x=test_pt, y=test_eta)
    nentries = np.sum(counts)
    # unweighted fill: sumw and sumw2 both equal the number of entries
    assert h_regular_bins.sum("x", "y", overflow="all").values(sumw2=True)[()] == (
        nentries,
        nentries,
    )
    # bin x=2, y=10 (when overflow removed)
    count_some_bin = np.sum(
        (test_pt >= 20.0) & (test_pt < 30.0) & (test_eta >= 0.0) & (test_eta < 0.3)
    )
    assert (
        h_regular_bins.integrate("x", slice(20, 30)).values()[()][10] == count_some_bin
    )
    assert (
        h_regular_bins.integrate("y", slice(0, 0.3)).values()[()][2] == count_some_bin
    )

    # slicing by value drops the leading bins, shifting the indices
    h_reduced = h_regular_bins[10:, -0.6:]
    # bin x=1, y=2
    assert h_reduced.integrate("x", slice(20, 30)).values()[()][2] == count_some_bin
    assert h_reduced.integrate("y", slice(0, 0.3)).values()[()][1] == count_some_bin
    h_reduced.fill(x=23, y=0.1)
    assert h_reduced.integrate("x", slice(20, 30)).values()[()][2] == count_some_bin + 1
    assert h_reduced.integrate("y", slice(0, 0.3)).values()[()][1] == count_some_bin + 1

    # purely categorical histogram with scalar and array weights
    animal = hist.Cat("animal", "type of animal")
    vocalization = hist.Cat("vocalization", "onomatopoiea is that how you spell it?")
    h_cat_bins = hist.Hist("I like cats", animal, vocalization)
    h_cat_bins.fill(animal="cat", vocalization="meow", weight=2.0)
    h_cat_bins.fill(
        animal="dog", vocalization="meow", weight=np.array([-1.0, -1.0, -5.0])
    )
    h_cat_bins.fill(animal="dog", vocalization="woof", weight=100.0)
    h_cat_bins.fill(animal="dog", vocalization="ruff")
    assert h_cat_bins.values()[("cat", "meow")] == 2.0
    assert h_cat_bins.values(sumw2=True)[("dog", "meow")] == (-7.0, 27.0)
    assert h_cat_bins.integrate("vocalization", ["woof", "ruff"]).values(sumw2=True)[
        ("dog",)
    ] == (101.0, 10001.0)

    # the same histogram built through each supported constructor signature
    height = hist.Bin("height", "height [m]", 10, 0, 5)
    h_mascots_1 = hist.Hist(
        "fermi mascot showdown",
        animal,
        vocalization,
        height,
        # weight is a reserved keyword
        hist.Bin(
            "mass", "weight (g=9.81m/s**2) [kg]", np.power(10.0, np.arange(5) - 1)
        ),
    )

    h_mascots_2 = hist.Hist(
        "fermi mascot showdown",
        axes=(
            animal,
            vocalization,
            height,
            # weight is a reserved keyword
            hist.Bin(
                "mass", "weight (g=9.81m/s**2) [kg]", np.power(10.0, np.arange(5) - 1)
            ),
        ),
    )

    h_mascots_3 = hist.Hist(
        axes=[
            animal,
            vocalization,
            height,
            # weight is a reserved keyword
            hist.Bin(
                "mass", "weight (g=9.81m/s**2) [kg]", np.power(10.0, np.arange(5) - 1)
            ),
        ],
        label="fermi mascot showdown",
    )

    # positional axes together with ``axes=`` is expected to warn
    with pytest.warns(UserWarning):
        h_mascots_4 = hist.Hist(
            "fermi mascot showdown",
            animal,
            vocalization,
            height,
            # weight is a reserved keyword
            hist.Bin(
                "mass", "weight (g=9.81m/s**2) [kg]", np.power(10.0, np.arange(5) - 1)
            ),
            axes=[
                animal,
                vocalization,
                height,
                # weight is a reserved keyword
                hist.Bin(
                    "mass",
                    "weight (g=9.81m/s**2) [kg]",
                    np.power(10.0, np.arange(5) - 1),
                ),
            ],
        )

    assert h_mascots_1._dense_shape == h_mascots_2._dense_shape
    assert h_mascots_2._dense_shape == h_mascots_3._dense_shape
    assert h_mascots_3._dense_shape == h_mascots_4._dense_shape

    assert h_mascots_1._axes == h_mascots_2._axes
    assert h_mascots_2._axes == h_mascots_3._axes
    assert h_mascots_3._axes == h_mascots_4._axes

    adult_bison_h = np.random.normal(loc=2.5, scale=0.2, size=40)
    adult_bison_w = np.random.normal(loc=700, scale=100, size=40)
    h_mascots_1.fill(
        animal="bison", vocalization="huff", height=adult_bison_h, mass=adult_bison_w
    )
    goose_h = np.random.normal(loc=0.4, scale=0.05, size=1000)
    goose_w = np.random.normal(loc=7, scale=1, size=1000)
    h_mascots_1.fill(animal="goose", vocalization="honk", height=goose_h, mass=goose_w)
    crane_h = np.random.normal(loc=1, scale=0.05, size=4)
    crane_w = np.random.normal(loc=10, scale=1, size=4)
    h_mascots_1.fill(animal="crane", vocalization="none", height=crane_h, mass=crane_w)

    # filling with unknown axis names must be rejected
    with pytest.raises(ValueError):
        h_mascots_1.fill(
            beast="crane", yelling="none", tallness=crane_h, heavitivity=crane_w
        )

    h_mascots_2 = h_mascots_1.copy()
    h_mascots_2.clear()
    baby_bison_h = np.random.normal(loc=0.5, scale=0.1, size=20)
    baby_bison_w = np.random.normal(loc=200, scale=10, size=20)
    baby_bison_cutefactor = 2.5 * np.ones_like(baby_bison_w)
    h_mascots_2.fill(
        animal="bison",
        vocalization="baa",
        height=baby_bison_h,
        mass=baby_bison_w,
        weight=baby_bison_cutefactor,
    )
    h_mascots_2.fill(animal="fox", vocalization="none", height=1.0, mass=30.0)

    h_mascots = h_mascots_1 + h_mascots_2
    # "h*" selects huff (40 bison) and honk (1000 geese)
    assert (
        h_mascots.integrate("vocalization", "h*")
        .sum("height", "mass", "animal")
        .values()[()]
        == 1040.0
    )

    species_class = hist.Cat("species_class", "where the subphylum is vertibrates")
    classes = {
        "birds": ["goose", "crane"],
        "mammals": ["bison", "fox"],
    }
    # the two scale calls cancel for ("goose", "honk"): 0.5 * 2.0
    h_mascots.scale({("goose",): 0.5}, axis=("animal",))
    h_mascots.scale({("goose", "honk"): 2.0}, axis=("animal", "vocalization"))
    h_species = h_mascots.group("animal", species_class, classes)

    assert set(h_species.integrate("vocalization").values().keys()) == {
        ("birds",),
        ("mammals",),
    }
    nbirds_bin = np.sum(
        (goose_h >= 0.5) & (goose_h < 1) & (goose_w > 10) & (goose_w < 100)
    )
    nbirds_bin += np.sum(
        (crane_h >= 0.5) & (crane_h < 1) & (crane_w > 10) & (crane_w < 100)
    )
    assert h_species.integrate("vocalization").values()[("birds",)][1, 2] == nbirds_bin
    tally = h_species.sum("mass", "height", "vocalization").values()
    assert tally[("birds",)] == 1004.0
    assert tally[("mammals",)] == 91.0

    h_species.scale({"honk": 0.1, "huff": 0.9}, axis="vocalization")
    h_species.scale(5.0)
    # axes may be given by name or as the axis objects themselves
    tally = h_species.sum("mass", height, vocalization).values(sumw2=True)
    assert tally[("birds",)] == (520.0, 350.0)
    assert tally[("mammals",)] == (435.0, 25 * (40 * (0.9**2) + 20 * (2.5**2) + 1))

    # derived histograms keep referring to the very same axis objects
    assert h_species.axis("vocalization") is vocalization
    assert h_species.axis("height") is height
    assert h_species.integrate("vocalization", "h*").axis("height") is height

    tall_class = hist.Cat("tall_class", "species class (species above 1m)")
    mapping = {
        "birds": (["goose", "crane"], slice(1.0, None)),
        "mammals": (["bison", "fox"], slice(1.0, None)),
    }
    h_tall = h_mascots.group((animal, height), tall_class, mapping)
    tall_bird_count = np.sum(goose_h >= 1.0) + np.sum(crane_h >= 1)
    assert h_tall.sum("mass", "vocalization").values()[("birds",)] == tall_bird_count
    # the trailing "+ 1" is the single fox filled at height=1.0
    tall_mammal_count = np.sum(adult_bison_h >= 1.0) + np.sum(baby_bison_h >= 1) + 1
    assert (
        h_tall.sum("mass", "vocalization").values()[("mammals",)] == tall_mammal_count
    )

    h_less = h_mascots.remove(["fox", "bison"], axis="animal")
    assert h_less.sum("vocalization", "height", "mass", "animal").values()[()] == 1004.0


def test_export1d():
    """Round-trip a 1D histogram through a ROOT file written with uproot3."""
    import os

    import uproot3

    # NOTE(review): no ``export`` module is among the files added under
    # ``coffea.jitters.hist`` by this change - confirm this import resolves.
    from coffea.hist.export import export1d

    counts, test_eta, test_pt = dummy_jagged_eta_pt()
    h_regular_bins = hist.Hist("regular_joe", hist.Bin("x", "x", 20, 0, 200))
    h_regular_bins.fill(x=test_pt)

    hout = export1d(h_regular_bins)

    filename = "test_export1d.root"

    with uproot3.create(filename) as fout:
        fout["regular_joe"] = hout
        fout.close()

    with uproot3.open(filename) as fin:
        hin = fin["regular_joe"]

    assert np.all(hin.edges == hout.edges)
    assert np.all(hin.values == hout.values)

    # drop the references to the file before deleting it from disk
    del hin
    del fin

    if os.path.exists(filename):
        os.remove(filename)
def test_hist_serdes():
    """A filled two-axis histogram survives a pickle round trip."""
    import pickle

    original = hist.Hist(
        "regular joe", hist.Bin("x", "x", 20, 0, 200), hist.Bin("y", "why", 20, -3, 3)
    )

    xs = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    ys = np.array([-2.0, 1.0, 0.0, 1.0, 2.0])
    original.fill(x=xs, y=ys)

    # touch the identifiers before and after the round trip
    original.sum("x").identifiers("y")

    restored = pickle.loads(pickle.dumps(original))

    restored.sum("x").identifiers("y")

    assert original._dense_shape == restored._dense_shape
    assert original._axes == restored._axes


def test_hist_serdes_labels():
    """Custom identifier labels on a Bin axis survive a pickle round trip."""
    import pickle

    axis = hist.Bin("asdf", "asdf", 3, 0, 3)
    axis.identifiers()[0].label = "type 1"
    original = hist.Hist("a", axis)
    original.identifiers("asdf")

    restored = pickle.loads(pickle.dumps(original))

    pairs = zip(original.identifiers("asdf"), restored.identifiers("asdf"))
    for before, after in pairs:
        assert before.label == after.label

    assert original._dense_shape == restored._dense_shape
    assert original._axes == restored._axes


@pytest.mark.skipif(
    sys.version_info < (3, 4),
    reason="requires python3.4 or higher, test file is pickle proto 4",
)
def test_hist_compat():
    """A histogram stored in the old file format loads with the expected bins."""
    from coffea.util import load

    loaded = load("tests/samples/old_hist_format.coffea")

    # -inf, then 0..1200 in steps of 20, then +inf and the trailing NaN bin
    expected_bins = np.array(
        [-np.inf]
        + [20.0 * step for step in range(61)]
        + [np.inf, np.nan]
    )
    interval_bins = loaded._axes[2]._interval_bins
    assert np.all(interval_bins[:-1] == expected_bins[:-1])
    assert np.isnan(interval_bins[-1])
640.0, + 660.0, + 680.0, + 700.0, + 720.0, + 740.0, + 760.0, + 780.0, + 800.0, + 820.0, + 840.0, + 860.0, + 880.0, + 900.0, + 920.0, + 940.0, + 960.0, + 980.0, + 1000.0, + 1020.0, + 1040.0, + 1060.0, + 1080.0, + 1100.0, + 1120.0, + 1140.0, + 1160.0, + 1180.0, + 1200.0, + np.inf, + np.nan, + ] + ) + assert np.all(test._axes[2]._interval_bins[:-1] == expected_bins[:-1]) + assert np.isnan(test._axes[2]._interval_bins[-1]) + + +def test_issue_247(): + from coffea import hist + + h = hist.Hist("stuff", hist.Bin("old", "old", 20, -1, 1)) + h.fill(old=h.axis("old").centers()) + h2 = h.rebin(h.axis("old"), hist.Bin("new", "new", 10, -1, 1)) + # check first if its even possible to have correct binning + assert np.all(h2.axis("new").edges() == h.axis("old").edges()[::2]) + # make sure the lookup works properly + assert np.all(h2.values()[()] == 2.0) + h3 = h.rebin(h.axis("old"), 2) + assert np.all(h3.values()[()] == 2.0) + + with pytest.raises(ValueError): + # invalid division + _ = h.rebin(h.axis("old"), hist.Bin("new", "new", 8, -1, 1)) + + newaxis = hist.Bin("new", "new", h.axis("old").edges()[np.cumsum([0, 2, 3, 5])]) + h.rebin("old", newaxis) + + +def test_issue_333(): + axis = hist.Bin("channel", "Channel b1", 50, 0, 2000) + temp = np.arange(0, 2000, 40, dtype=np.int16) + assert np.all(axis.index(temp) == np.arange(50) + 1) + + +def test_issue_394(): + dummy = hist.Hist( + "Dummy", + hist.Cat("sample", "sample"), + hist.Bin("dummy", "Number of events", 1, 0, 1), + ) + dummy.fill(sample="test", dummy=1, weight=0.5) + + +def test_fill_none(): + dummy = hist.Hist("Dummy", hist.Bin("x", "asdf", 1, 0, 1)) + with pytest.raises(ValueError): + # attempt to fill with none + dummy.fill(x=ak.Array([0.1, None, 0.3])) + + # allow fill when masked type but no Nones remain + dummy.fill(x=ak.Array([0.1, None, 0.3])[[True, False, True]]) + + +def test_boost_conversion(): + import boost_histogram as bh + + dummy = hist.Hist( + "Dummy", + hist.Cat("sample", "sample"), + hist.Bin("dummy", 
"Number of events", 1, 0, 1), + ) + dummy.fill(sample="test", dummy=1, weight=0.5) + dummy.fill(sample="test", dummy=0.1) + dummy.fill(sample="test2", dummy=-0.1) + dummy.fill(sample="test3", dummy=0.5, weight=0.1) + dummy.fill(sample="test3", dummy=0.5, weight=0.9) + + h = dummy.to_boost() + assert len(h.axes) == 2 + assert h[bh.loc("test"), bh.loc(1)].value == 0.5 + assert h[bh.loc("test"), bh.loc(100)].value == 0.5 + assert h[bh.loc("test"), bh.loc(1)].variance == 0.25 + assert h[0, 0].value == 1.0 + assert h[0, 0].variance == 1.0 + assert h[1, 0].value == 0.0 + assert h[bh.loc("test2"), 0].value == 0.0 + assert h[1, bh.underflow].value == 1.0 + assert h[bh.loc("test3"), bh.loc(0.5)].value == 1.0 + assert h[bh.loc("test3"), bh.loc(0.5)].variance == 0.1 * 0.1 + 0.9 * 0.9 + + dummy = hist.Hist( + "Dummy", + hist.Cat("sample", "sample"), + hist.Bin("dummy", "Number of events", 1, 0, 1), + ) + dummy.fill(sample="test", dummy=0.1) + dummy.fill(sample="test", dummy=0.2) + dummy.fill(sample="test2", dummy=0.2) + # No sumw2 -> simple bh storage + h = dummy.to_boost() + assert len(h.axes) == 2 + assert h[0, 0] == 2.0 + assert h[1, 0] == 1.0 From 5c5e46d7b065e9db8054da49100637895967be46 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Wed, 15 May 2024 09:17:46 -0500 Subject: [PATCH 02/11] ressurect tests --- tests/test_hist_plot.py | 14 ++--- tests/test_hist_tools.py | 119 +-------------------------------------- 2 files changed, 9 insertions(+), 124 deletions(-) diff --git a/tests/test_hist_plot.py b/tests/test_hist_plot.py index fe3ec2f8f..1eb7af46b 100644 --- a/tests/test_hist_plot.py +++ b/tests/test_hist_plot.py @@ -22,7 +22,7 @@ def fill_lepton_kinematics(): ak.behavior.update(candidate.behavior) # histogram creation and manipulation - from coffea import hist + from coffea.jitters import hist fin = uproot.open("HZZ.root") tree = fin["events"] @@ -67,7 +67,7 @@ def test_plot1d(): plt.switch_backend("agg") - from coffea import hist + from coffea.jitters import hist 
lepton_kinematics = fill_lepton_kinematics() @@ -96,7 +96,7 @@ def test_plot2d(): # matplotlib import matplotlib.pyplot as plt - from coffea import hist + from coffea.jitters import hist plt.switch_backend("agg") @@ -115,7 +115,7 @@ def test_plotratio(): # matplotlib import matplotlib.pyplot as plt - from coffea import hist + from coffea.jitters import hist plt.switch_backend("agg") @@ -238,7 +238,7 @@ def test_plotgrid(): # matplotlib import matplotlib.pyplot as plt - from coffea import hist + from coffea.jitters import hist plt.switch_backend("agg") @@ -268,7 +268,7 @@ def test_plotgrid(): def test_clopper_pearson_interval(): - from coffea.hist.plot import clopper_pearson_interval + from coffea.jitters.hist.plot import clopper_pearson_interval # Reference values for CL=0.6800 calculated with ROOT's TEfficiency num = np.array([1.0, 5.0, 10.0, 10.0]) @@ -293,7 +293,7 @@ def test_clopper_pearson_interval(): def test_normal_interval(): - from coffea.hist.plot import normal_interval + from coffea.jitters.hist.plot import normal_interval # Reference weighted efficiency and error from ROOTs TEfficiency diff --git a/tests/test_hist_tools.py b/tests/test_hist_tools.py index b3abb803c..05bd59721 100644 --- a/tests/test_hist_tools.py +++ b/tests/test_hist_tools.py @@ -1,11 +1,9 @@ -import sys - import awkward as ak import numpy as np import pytest from dummy_distributions import dummy_jagged_eta_pt -from coffea import hist +from coffea.jitters import hist def test_hist(): @@ -217,38 +215,6 @@ def test_hist(): assert h_less.sum("vocalization", "height", "mass", "animal").values()[()] == 1004.0 -def test_export1d(): - import os - - import uproot3 - - from coffea.hist.export import export1d - - counts, test_eta, test_pt = dummy_jagged_eta_pt() - h_regular_bins = hist.Hist("regular_joe", hist.Bin("x", "x", 20, 0, 200)) - h_regular_bins.fill(x=test_pt) - - hout = export1d(h_regular_bins) - - filename = "test_export1d.root" - - with uproot3.create(filename) as fout: - 
fout["regular_joe"] = hout - fout.close() - - with uproot3.open(filename) as fin: - hin = fin["regular_joe"] - - assert np.all(hin.edges == hout.edges) - assert np.all(hin.values == hout.values) - - del hin - del fin - - if os.path.exists(filename): - os.remove(filename) - - def test_hist_serdes(): import pickle @@ -291,89 +257,8 @@ def test_hist_serdes_labels(): assert h._axes == hnew._axes -@pytest.mark.skipif( - sys.version_info < (3, 4), - reason="requires python3.4 or higher, test file is pickle proto 4", -) -def test_hist_compat(): - from coffea.util import load - - test = load("tests/samples/old_hist_format.coffea") - - expected_bins = np.array( - [ - -np.inf, - 0.0, - 20.0, - 40.0, - 60.0, - 80.0, - 100.0, - 120.0, - 140.0, - 160.0, - 180.0, - 200.0, - 220.0, - 240.0, - 260.0, - 280.0, - 300.0, - 320.0, - 340.0, - 360.0, - 380.0, - 400.0, - 420.0, - 440.0, - 460.0, - 480.0, - 500.0, - 520.0, - 540.0, - 560.0, - 580.0, - 600.0, - 620.0, - 640.0, - 660.0, - 680.0, - 700.0, - 720.0, - 740.0, - 760.0, - 780.0, - 800.0, - 820.0, - 840.0, - 860.0, - 880.0, - 900.0, - 920.0, - 940.0, - 960.0, - 980.0, - 1000.0, - 1020.0, - 1040.0, - 1060.0, - 1080.0, - 1100.0, - 1120.0, - 1140.0, - 1160.0, - 1180.0, - 1200.0, - np.inf, - np.nan, - ] - ) - assert np.all(test._axes[2]._interval_bins[:-1] == expected_bins[:-1]) - assert np.isnan(test._axes[2]._interval_bins[-1]) - - def test_issue_247(): - from coffea import hist + from coffea.jitters import hist h = hist.Hist("stuff", hist.Bin("old", "old", 20, -1, 1)) h.fill(old=h.axis("old").centers()) From 2f567627cb4018ff1c0140aa1cea635bf70a5792 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Sun, 26 May 2024 13:58:24 +0000 Subject: [PATCH 03/11] ressurect main functionality tests, bits of cruft yet to fix, decent throughput on multi-dim histograms --- .github/workflows/ci.yml | 2 +- pyproject.toml | 3 + src/coffea/jitters/hist/hist_tools.py | 105 ++++++++++++++++---------- src/coffea/jitters/hist/plot.py | 4 +- 
tests/test_hist_tools.py | 2 + 5 files changed, 72 insertions(+), 44 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1b02d658..a5319d31d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,7 +75,7 @@ jobs: uv pip install --system xgboost uv pip install --system 'tritonclient[grpc,http]!=2.41.0' # install checked out coffea - uv pip install --system -q '.[dev,parsl,dask,spark]' --upgrade + uv pip install --system -q '.[dev,parsl,dask,spark,gpu]' --upgrade uv pip list --system java -version - name: Install dependencies (MacOS) diff --git a/pyproject.toml b/pyproject.toml index e3a0fd14a..b4eb9b5ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,9 @@ rucio = [ "rucio-clients>=32;python_version>'3.8'", "rucio-clients<32;python_version<'3.9'", ] +gpu = [ + "cupy>=13.1.0" +] dev = [ "pre-commit", "flake8", diff --git a/src/coffea/jitters/hist/hist_tools.py b/src/coffea/jitters/hist/hist_tools.py index 848ccfed8..1dca811ec 100644 --- a/src/coffea/jitters/hist/hist_tools.py +++ b/src/coffea/jitters/hist/hist_tools.py @@ -7,6 +7,7 @@ from collections import namedtuple import awkward +import cupy import numpy # Python 2 and 3 compatibility @@ -23,6 +24,19 @@ MaybeSumSlice = namedtuple("MaybeSumSlice", ["start", "stop", "sum"]) +_replace_nans = cupy.ElementwiseKernel("T v", "T x", "x = isnan(x)?v:x", "replace_nans") + +_clip_bins = cupy.ElementwiseKernel( + "T Nbins, T lo, T hi, T id", + "T idx", + """ + const T floored = floor((id - lo) * float(Nbins) / (hi - lo)) + 1; + idx = floored < 0 ? 0 : floored; + idx = floored > Nbins ? 
Nbins + 1 : floored; + """, + "clip_bins", +) + def assemble_blocks(array, ndslice, depth=0): """ @@ -481,8 +495,8 @@ def __init__(self, name, label, n_or_arr, lo=None, hi=None): self._lo = self._bins[0] self._hi = self._bins[-1] # to make searchsorted differentiate inf from nan - self._bins = numpy.append(self._bins, numpy.inf) - self._interval_bins = numpy.r_[-numpy.inf, self._bins, numpy.nan] + self._bins = cupy.append(self._bins, cupy.inf) + self._interval_bins = cupy.r_[-cupy.inf, self._bins, cupy.nan] self._bin_names = numpy.full(self._interval_bins[:-1].size, None) elif isinstance(n_or_arr, numbers.Integral): if lo is None or hi is None: @@ -493,11 +507,11 @@ def __init__(self, name, label, n_or_arr, lo=None, hi=None): self._lo = lo self._hi = hi self._bins = n_or_arr - self._interval_bins = numpy.r_[ - -numpy.inf, - numpy.linspace(self._lo, self._hi, self._bins + 1), - numpy.inf, - numpy.nan, + self._interval_bins = cupy.r_[ + -cupy.inf, + cupy.linspace(self._lo, self._hi, self._bins + 1), + cupy.inf, + cupy.nan, ] self._bin_names = numpy.full(self._interval_bins[:-1].size, None) else: @@ -528,7 +542,7 @@ def __setstate__(self, d): if "_intervals" in d: # convert old hists to new serialization format _old_intervals = d.pop("_intervals") interval_bins = [i._lo for i in _old_intervals] + [_old_intervals[-1]._hi] - d["_interval_bins"] = numpy.array(interval_bins) + d["_interval_bins"] = cupy.array(interval_bins) d["_bin_names"] = numpy.array( [interval._label for interval in _old_intervals] ) @@ -548,23 +562,28 @@ def index(self, identifier): Returns an integer corresponding to the index in the axis where the histogram would be filled. 
The integer range includes flow bins: ``0 = underflow, n+1 = overflow, n+2 = nanflow`` """ - isarray = isinstance(identifier, (awkward.Array, numpy.ndarray)) + isarray = isinstance(identifier, (awkward.Array, cupy.ndarray, numpy.ndarray)) if isarray or isinstance(identifier, numbers.Number): - if isarray: - identifier = numpy.asarray(identifier) + identifier = awkward.to_cupy(identifier) # cupy.asarray(identifier) if self._uniform: - idx = numpy.clip( - numpy.floor( - (identifier - self._lo) - * float(self._bins) - / (self._hi - self._lo) + idx = None + if isarray: + idx = cupy.zeros_like(identifier) + _clip_bins(float(self._bins), self._lo, self._hi, identifier, idx) + else: + idx = numpy.clip( + numpy.floor( + (identifier - self._lo) + * float(self._bins) + / (self._hi - self._lo) + ) + + 1, + 0, + self._bins + 1, ) - + 1, - 0, - self._bins + 1, - ) - if isinstance(idx, numpy.ndarray): - idx[numpy.isnan(idx)] = self.size - 1 + + if isinstance(idx, (cupy.ndarray, numpy.ndarray)): + _replace_nans(self.size - 1, idx) idx = idx.astype(int) elif numpy.isnan(idx): idx = self.size - 1 @@ -572,7 +591,7 @@ def index(self, identifier): idx = int(idx) return idx else: - return numpy.searchsorted(self._bins, identifier, side="right") + return cupy.searchsorted(self._bins, identifier, side="right") elif isinstance(identifier, Interval): if identifier.nan(): return self.size - 1 @@ -1095,7 +1114,9 @@ def __getitem__(self, keys): dense_idx = tuple(dense_idx) def dense_op(array): - return numpy.block(assemble_blocks(array, dense_idx)) + as_numpy = array.get() + blocked = numpy.block(assemble_blocks(as_numpy, dense_idx)) + return cupy.asarray(blocked) out = Hist(self._label, *new_dims, dtype=self._dtype) if self._sumw2 is not None: @@ -1139,10 +1160,10 @@ def fill(self, **values): """ weight = values.pop("weight", None) - if isinstance(weight, (awkward.Array, numpy.ndarray)): - weight = numpy.asarray(weight) + if isinstance(weight, (awkward.Array, cupy.ndarray, numpy.ndarray)): + 
weight = cupy.array(weight) if isinstance(weight, numbers.Number): - weight = numpy.atleast_1d(weight) + weight = cupy.atleast_1d(weight) if not all(d.name in values for d in self._axes): missing = ", ".join(d.name for d in self._axes if d.name not in values) raise ValueError( @@ -1161,44 +1182,46 @@ def fill(self, **values): sparse_key = tuple(d.index(values[d.name]) for d in self.sparse_axes()) if sparse_key not in self._sumw: - self._sumw[sparse_key] = numpy.zeros( + self._sumw[sparse_key] = cupy.zeros( shape=self._dense_shape, dtype=self._dtype ) if self._sumw2 is not None: - self._sumw2[sparse_key] = numpy.zeros( + self._sumw2[sparse_key] = cupy.zeros( shape=self._dense_shape, dtype=self._dtype ) if self.dense_dim() > 0: dense_indices = tuple( - d.index(values[d.name]) for d in self._axes if isinstance(d, DenseAxis) + cupy.asarray(d.index(values[d.name])) + for d in self._axes + if isinstance(d, DenseAxis) ) - xy = numpy.atleast_1d( - numpy.ravel_multi_index(dense_indices, self._dense_shape) + xy = cupy.atleast_1d( + cupy.ravel_multi_index(dense_indices, self._dense_shape) ) if weight is not None: - self._sumw[sparse_key][:] += numpy.bincount( + self._sumw[sparse_key][:] += cupy.bincount( xy, weights=weight, minlength=numpy.array(self._dense_shape).prod() ).reshape(self._dense_shape) - self._sumw2[sparse_key][:] += numpy.bincount( + self._sumw2[sparse_key][:] += cupy.bincount( xy, weights=weight**2, minlength=numpy.array(self._dense_shape).prod(), ).reshape(self._dense_shape) else: - self._sumw[sparse_key][:] += numpy.bincount( + self._sumw[sparse_key][:] += cupy.bincount( xy, weights=None, minlength=numpy.array(self._dense_shape).prod() ).reshape(self._dense_shape) if self._sumw2 is not None: - self._sumw2[sparse_key][:] += numpy.bincount( + self._sumw2[sparse_key][:] += cupy.bincount( xy, weights=None, minlength=numpy.array(self._dense_shape).prod(), ).reshape(self._dense_shape) else: if weight is not None: - self._sumw[sparse_key] += numpy.sum(weight) - 
self._sumw2[sparse_key] += numpy.sum(weight**2) + self._sumw[sparse_key] += cupy.sum(weight) + self._sumw2[sparse_key] += cupy.sum(weight**2) else: self._sumw[sparse_key] += 1.0 if self._sumw2 is not None: @@ -1604,14 +1627,14 @@ def expandkey(key): for sparse_key, sumw in values.items(): index = tuple(expandkey(sparse_key)) view = out.view(flow=True) - view[index] = sumw + view[index] = sumw.get() else: values = self.values(sumw2=True, overflow="all") for sparse_key, (sumw, sumw2) in values.items(): index = tuple(expandkey(sparse_key)) view = out.view(flow=True) - view[index].value = sumw - view[index].variance = sumw2 + view[index].value = sumw.get() + view[index].variance = sumw2.get() return out diff --git a/src/coffea/jitters/hist/plot.py b/src/coffea/jitters/hist/plot.py index 0d1cc10a3..d4d6be239 100644 --- a/src/coffea/jitters/hist/plot.py +++ b/src/coffea/jitters/hist/plot.py @@ -263,8 +263,8 @@ def plot1d( the_slice = (the_slice[1], the_slice[0]) sumw = sumw[the_slice] sumw2 = sumw2[the_slice] - plot_info["sumw"].append(sumw) - plot_info["sumw2"].append(sumw2) + plot_info["sumw"].append(sumw.get()) + plot_info["sumw2"].append(sumw2.get()) def w2err(sumw, sumw2): err = [] diff --git a/tests/test_hist_tools.py b/tests/test_hist_tools.py index 05bd59721..0b26fa974 100644 --- a/tests/test_hist_tools.py +++ b/tests/test_hist_tools.py @@ -3,6 +3,8 @@ import pytest from dummy_distributions import dummy_jagged_eta_pt +pytest.importorskip("cupy") + from coffea.jitters import hist From ee0ccc66175d8e43db53455e5cb062001bef1204 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Sun, 26 May 2024 09:03:13 -0500 Subject: [PATCH 04/11] use precompiled package for cupy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b4eb9b5ad..a0f84858c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,7 @@ rucio = [ "rucio-clients<32;python_version<'3.9'", ] gpu = [ - "cupy>=13.1.0" + 
"cupy-cuda12x>=13.1.0" ] dev = [ "pre-commit", From ec7bdf9098d2dda8437b7da2d20c01021687b1c0 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Sun, 26 May 2024 12:24:05 -0500 Subject: [PATCH 05/11] install cuda-toolkit with actions --- .github/workflows/ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5319d31d..368b9be22 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,6 +65,10 @@ jobs: - name: Install uv run: python -m pip install --upgrade uv + - name: Install cuda-toolkit (Linux) + if: matrix.os == 'ubuntu-latest' + uses: Jimver/cuda-toolkit@v0.2.15 + - name: Install dependencies (Linux) if: matrix.os == 'ubuntu-latest' run: | From 7f351af96b07f84c3200a42e5ca1dd10eb243d25 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Sun, 26 May 2024 12:30:34 -0500 Subject: [PATCH 06/11] cuda install is too big --- .github/workflows/ci.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 368b9be22..a5319d31d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,10 +65,6 @@ jobs: - name: Install uv run: python -m pip install --upgrade uv - - name: Install cuda-toolkit (Linux) - if: matrix.os == 'ubuntu-latest' - uses: Jimver/cuda-toolkit@v0.2.15 - - name: Install dependencies (Linux) if: matrix.os == 'ubuntu-latest' run: | From a31afcc4947983337985275e4133ebc0f9f6ab70 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Sun, 26 May 2024 12:31:07 -0500 Subject: [PATCH 07/11] cannot test gpu code in matrix as is --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5319d31d..a1b02d658 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,7 +75,7 @@ jobs: uv pip install --system xgboost uv pip install --system 'tritonclient[grpc,http]!=2.41.0' # install checked out coffea - uv pip 
install --system -q '.[dev,parsl,dask,spark,gpu]' --upgrade + uv pip install --system -q '.[dev,parsl,dask,spark]' --upgrade uv pip list --system java -version - name: Install dependencies (MacOS) From 74831fcf2687b74bcf061b64d55bcf7590fccec9 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Sun, 26 May 2024 13:05:35 -0500 Subject: [PATCH 08/11] skip hist plotting tests if no cupy --- tests/test_hist_plot.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_hist_plot.py b/tests/test_hist_plot.py index 1eb7af46b..8654fb11e 100644 --- a/tests/test_hist_plot.py +++ b/tests/test_hist_plot.py @@ -5,6 +5,8 @@ from coffea.util import numpy as np +pytest.importorskip("cupy") + url = ( "https://github.com/scikit-hep/uproot3/blob/master/tests/samples/HZZ.root?raw=true" ) From a16c81a23d6e673254edf92a36653e8d3239f9e2 Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 30 May 2024 21:26:47 +0000 Subject: [PATCH 09/11] all tests work except test_hist_plot::test_plotgrid --- src/coffea/jitters/hist/hist_tools.py | 21 +++++++++++++++------ src/coffea/jitters/hist/plot.py | 13 ++++++++----- tests/test_hist_plot.py | 4 ++-- tests/test_hist_tools.py | 2 +- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/src/coffea/jitters/hist/hist_tools.py b/src/coffea/jitters/hist/hist_tools.py index 1dca811ec..1711b0a7a 100644 --- a/src/coffea/jitters/hist/hist_tools.py +++ b/src/coffea/jitters/hist/hist_tools.py @@ -487,9 +487,9 @@ class Bin(DenseAxis): def __init__(self, name, label, n_or_arr, lo=None, hi=None): super().__init__(name, label) self._lazy_intervals = None - if isinstance(n_or_arr, (list, numpy.ndarray)): + if isinstance(n_or_arr, (list, numpy.ndarray, cupy.ndarray)): self._uniform = False - self._bins = numpy.array(n_or_arr, dtype="d") + self._bins = cupy.array(n_or_arr, dtype="d") if not all(numpy.sort(self._bins) == self._bins): raise ValueError("Binning not sorted!") self._lo = self._bins[0] @@ -583,7 +583,10 @@ def index(self, identifier): ) if 
isinstance(idx, (cupy.ndarray, numpy.ndarray)): - _replace_nans(self.size - 1, idx) + _replace_nans( + self.size - 1, + idx if idx.dtype.kind == "f" else idx.astype(cupy.float32), + ) idx = idx.astype(int) elif numpy.isnan(idx): idx = self.size - 1 @@ -596,7 +599,13 @@ def index(self, identifier): if identifier.nan(): return self.size - 1 for idx, interval in enumerate(self._intervals): - if interval._lo <= identifier._lo and interval._hi >= identifier._hi: + if ( + interval._lo <= identifier._lo + or cupy.isclose(interval._lo, identifier._lo) + ) and ( + interval._hi >= identifier._hi + or cupy.isclose(interval._hi, identifier._hi) + ): return idx raise ValueError( "Axis %r has no interval that fully contains identifier %r" @@ -759,10 +768,10 @@ def edges(self, overflow="none"): See `Hist.sum` description for the allowed values. """ if self._uniform: - out = numpy.linspace(self._lo, self._hi, self._bins + 1) + out = cupy.linspace(self._lo, self._hi, self._bins + 1) else: out = self._bins[:-1].copy() - out = numpy.r_[ + out = cupy.r_[ 2 * out[0] - out[1], out, 2 * out[-1] - out[-2], 3 * out[-1] - 2 * out[-2] ] return out[overflow_behavior(overflow)] diff --git a/src/coffea/jitters/hist/plot.py b/src/coffea/jitters/hist/plot.py index d4d6be239..7f5c6ab30 100644 --- a/src/coffea/jitters/hist/plot.py +++ b/src/coffea/jitters/hist/plot.py @@ -231,7 +231,7 @@ def plot1d( elif isinstance(axis, DenseAxis): ax.set_xlabel(axis.label) ax.set_ylabel(hist.label) - edges = axis.edges(overflow=overflow) + edges = axis.edges(overflow=overflow).get() if order is None: identifiers = ( hist.identifiers(overlay, overflow=overlay_overflow) @@ -417,12 +417,14 @@ def plotratio( elif isinstance(axis, DenseAxis): ax.set_xlabel(axis.label) ax.set_ylabel(num.label) - edges = axis.edges(overflow=overflow) - centers = axis.centers(overflow=overflow) + edges = axis.edges(overflow=overflow).get() + centers = axis.centers(overflow=overflow).get() ranges = (edges[1:] - edges[:-1]) / 2 if xerr 
else None sumw_num, sumw2_num = num.values(sumw2=True, overflow=overflow)[()] sumw_denom, sumw2_denom = denom.values(sumw2=True, overflow=overflow)[()] + sumw_num, sumw2_num = sumw_num.get(), sumw2_num.get() + sumw_denom, sumw2_denom = sumw_denom.get(), sumw2_denom.get() rsumw = sumw_num / sumw_denom if unc == "clopper-pearson": @@ -557,9 +559,10 @@ def plot2d( if isinstance(xaxis, SparseAxis) or isinstance(yaxis, SparseAxis): raise NotImplementedError("Plot a sparse axis (e.g. bar chart or labeled bins)") else: - xedges = xaxis.edges(overflow=xoverflow) - yedges = yaxis.edges(overflow=yoverflow) + xedges = xaxis.edges(overflow=xoverflow).get() + yedges = yaxis.edges(overflow=yoverflow).get() sumw, sumw2 = hist.values(sumw2=True, overflow="allnan")[()] + sumw, sumw2 = sumw.get(), sumw2.get() if transpose: sumw = sumw.T sumw2 = sumw2.T diff --git a/tests/test_hist_plot.py b/tests/test_hist_plot.py index 8654fb11e..7ed8f19a1 100644 --- a/tests/test_hist_plot.py +++ b/tests/test_hist_plot.py @@ -125,8 +125,8 @@ def test_plotratio(): # Add some pseudodata to a pt histogram so we can make a nice data/mc plot pthist = lepton_kinematics.sum("eta") - bin_values = pthist.axis("pt").centers() - poisson_means = pthist.sum("flavor").values()[()] + bin_values = pthist.axis("pt").centers().get() + poisson_means = pthist.sum("flavor").values()[()].get() values = np.repeat(bin_values, np.random.poisson(poisson_means)) pthist.fill(flavor="pseudodata", pt=values) diff --git a/tests/test_hist_tools.py b/tests/test_hist_tools.py index 0b26fa974..460770f23 100644 --- a/tests/test_hist_tools.py +++ b/tests/test_hist_tools.py @@ -283,7 +283,7 @@ def test_issue_247(): def test_issue_333(): axis = hist.Bin("channel", "Channel b1", 50, 0, 2000) temp = np.arange(0, 2000, 40, dtype=np.int16) - assert np.all(axis.index(temp) == np.arange(50) + 1) + assert np.all(axis.index(temp).get() == np.arange(50) + 1) def test_issue_394(): From 73855e1ec39bf1bfc735e16d133b912a06ecbd70 Mon Sep 17 00:00:00 
2001 From: Lindsey Gray Date: Thu, 1 Aug 2024 16:59:15 -0500 Subject: [PATCH 10/11] add numba-cuda as gpu dependency --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a0f84858c..a0f31e77a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,7 +94,8 @@ rucio = [ "rucio-clients<32;python_version<'3.9'", ] gpu = [ - "cupy-cuda12x>=13.1.0" + "cupy-cuda12x>=13.1.0", + "numba-cuda>=0.0.13" ] dev = [ "pre-commit", From cb7e49458207440103d5c14a47db703611d4686a Mon Sep 17 00:00:00 2001 From: Lindsey Gray Date: Thu, 1 Aug 2024 17:00:04 -0500 Subject: [PATCH 11/11] lint --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a0f31e77a..559447e44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ rucio = [ ] gpu = [ "cupy-cuda12x>=13.1.0", - "numba-cuda>=0.0.13" + "numba-cuda>=0.0.13", ] dev = [ "pre-commit",