Merge pull request #67 from ggmarshall/main
Update validity management code to new format
gipert authored Nov 26, 2024
2 parents 03da06e + b460921 commit c7cb940
Showing 14 changed files with 160 additions and 57 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-hooks.yaml
@@ -2,10 +2,10 @@
name: check LEGEND channel maps format
entry: validate-legend-chmaps
language: python
types: [json]
types: [yaml]

- id: validate-legend-detdb
name: check LEGEND detector database format
entry: validate-legend-detdb
language: python
types: [json]
types: [yaml]
43 changes: 35 additions & 8 deletions docs/source/tutorial.rst
@@ -38,7 +38,7 @@ Let's consider the following database:
│   └── file1.json
├── file2.json
├── file3.yaml
└── validity.jsonl
└── validity.yaml
With:

@@ -80,20 +80,47 @@ Metadata validity
-----------------

Mappings of metadata to time periods, data taking systems etc. are specified
through JSONL files (`specification
through YAML files (`specification
<https://legend-exp.github.io/legend-data-format-specs/dev/metadata>`_).
If a ``.jsonl`` file is present in a directory, ``TextDB``
If a ``validity.yaml`` file is present in a directory, ``TextDB``
exposes the :meth:`~.textdb.textdb.on` interface to perform a query.

Let's assume the ``legend-metadata`` directory from the example above contains
the following file:

.. code-block::
.. code-block:: yaml
:linenos:
:caption: ``validity.jsonl``
{"valid_from": "20220628T000000Z", "select": "all", "apply": ["file2.json"]}
{"valid_from": "20220629T000000Z", "select": "all", "apply": ["file3.yaml"]}
:caption: ``validity.yaml``
- valid_from: 20230101T000000Z
category: all
apply:
- file3.yaml
- valid_from: 20230102T000000Z
category: all
mode: append
apply:
- file2.yaml
- valid_from: 20230103T000000Z
category: all
mode: remove
apply:
- file2.yaml
- valid_from: 20230104T000000Z
category: all
mode: reset
apply:
- file2.yaml
- valid_from: 20230105T000000Z
category: all
mode: replace
apply:
- file2.yaml
- file3.yaml
Each entry may ``append`` files to, ``remove`` files from, ``reset``, or ``replace`` files in the list of metadata valid from its timestamp onwards. From code, it's possible to obtain the metadata valid at a certain point in time.
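A minimal sketch of such a query, assuming the ``legend-metadata`` layout above (the import path and the ``pattern``/``system`` keywords follow the sources in this PR; timestamps and file names are illustrative):

.. code-block:: python

   from legendmeta.textdb import TextDB

   db = TextDB("legend-metadata")

   # metadata valid on 2 January 2023: file3.yaml plus the appended file2.yaml
   meta = db.on("20230102T120000Z")

   # keep only files matching a regular expression, for a given data taking system
   meta = db.on("20230102T120000Z", pattern="^file2.*", system="all")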

48 changes: 41 additions & 7 deletions src/legendmeta/catalog.py
@@ -17,13 +17,14 @@
import bisect
import collections
import copy
import json
import types
from collections import namedtuple
from datetime import datetime
from pathlib import Path
from string import Template

import yaml

from . import utils


@@ -33,6 +34,7 @@ def to_datetime(value):


def unix_time(value):
"""Convert a LEGEND timestamp or datetime object to Unix time value"""
if isinstance(value, str):
return datetime.timestamp(datetime.strptime(value, "%Y%m%dT%H%M%SZ"))

@@ -44,6 +46,8 @@ def unix_time(value):


class PropsStream:
"""Simple class to control loading of validity.yaml files"""

@staticmethod
def get(value):
if isinstance(value, str):
@@ -57,13 +61,14 @@ def get(value):

@staticmethod
def read_from(file_name):
with Path(file_name).open() as file:
for json_str in file:
yield json.loads(json_str)
with Path(file_name).open() as r:
file = yaml.safe_load(r)
file = sorted(file, key=lambda item: unix_time(item["valid_from"]))
yield from file


class Catalog(namedtuple("Catalog", ["entries"])):
"""Implementation of the `JSONL metadata validity specification <https://legend-exp.github.io/legend-data-format-specs/dev/metadata/#Specifying-metadata-validity-in-time-(and-system)>`_."""
"""Implementation of the `YAML metadata validity specification <https://legend-exp.github.io/legend-data-format-specs/dev/metadata/#Specifying-metadata-validity-in-time-(and-system)>`_."""

__slots__ = ()

@@ -83,15 +88,40 @@ def get(value):

@staticmethod
def read_from(file_name):
"""Read from a valdiity YAML file and build a Catalog object"""
entries = {}

for props in PropsStream.get(file_name):
timestamp = props["valid_from"]
system = "all" if props.get("category") is None else props["category"]
file_key = props["apply"]
if system not in entries:
entries[system] = []
entries[system].append(Catalog.Entry(unix_time(timestamp), file_key))
mode = "append" if props.get("mode") is None else props["mode"]
mode = "reset" if len(entries[system]) == 0 else mode
if mode == "reset":
new = file_key
elif mode == "append":
new = entries[system][-1].file.copy() + file_key
elif mode == "remove":
new = entries[system][-1].file.copy()
for file in file_key:
new.remove(file)
elif mode == "replace":
new = entries[system][-1].file.copy()
if len(file_key) != 2:
msg = f"Invalid number of elements in replace mode: {len(file_key)}"
raise ValueError(msg)
new.remove(file_key[0])
new += [file_key[1]]

else:
msg = f"Unknown mode for {timestamp}"
raise ValueError(msg)

if timestamp in [entry.valid_from for entry in entries[system]]:
msg = f"Duplicate timestamp: {timestamp}, use reset mode instead with a single entry"
raise ValueError(msg)
entries[system].append(Catalog.Entry(unix_time(timestamp), new))

for system in entries:
entries[system] = sorted(
@@ -100,6 +130,7 @@ def read_from(file_name):
return Catalog(entries)

def valid_for(self, timestamp, system="all", allow_none=False):
"""Get the valid entries for a given timestamp and system"""
if system in self.entries:
valid_from = [entry.valid_from for entry in self.entries[system]]
pos = bisect.bisect_right(valid_from, unix_time(timestamp))
@@ -126,11 +157,14 @@ def valid_for(self, timestamp, system="all", allow_none=False):

@staticmethod
def get_files(catalog_file, timestamp, category="all"):
"""Helper function to get the files for a given timestamp and category"""
catalog = Catalog.read_from(catalog_file)
return Catalog.valid_for(catalog, timestamp, category)


class Props:
"""Class to handle overwriting of dictionaries in cascade order"""

@staticmethod
def read_from(sources, subst_pathvar=False, trim_null=False):
def read_impl(sources):
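For reference, a minimal sketch of driving the updated ``Catalog`` API directly (the path is illustrative; ``read_from``, ``valid_for`` and ``get_files`` are the entry points touched by this diff):

from legendmeta.catalog import Catalog

# build the catalog from a validity.yaml file and resolve the files valid at a timestamp
catalog = Catalog.read_from("legend-metadata/validity.yaml")
files = catalog.valid_for("20230102T120000Z", system="all")

# one-shot helper doing the same
files = Catalog.get_files("legend-metadata/validity.yaml", "20230102T120000Z", category="all")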
12 changes: 7 additions & 5 deletions src/legendmeta/police.py
@@ -16,12 +16,13 @@
from __future__ import annotations

import argparse
import json
import re
import sys
from importlib import resources
from pathlib import Path

import yaml

from . import utils
from .textdb import TextDB

@@ -96,10 +97,11 @@ def validate_legend_channel_map() -> bool:
db = TextDB(d)
valid = True

with Path(f"{d}/validity.jsonl").open() as f:
for line in f.readlines():
ts = json.loads(line)["valid_from"]
sy = json.loads(line)["select"]
with Path(f"{d}/validity.yaml").open() as f:
validity = yaml.safe_load(f)
for line in validity:
ts = line["valid_from"]
sy = line.get("category", "all")
chmap = db.on(ts, system=sy)

for k, v in chmap.items():
15 changes: 9 additions & 6 deletions src/legendmeta/textdb.py
@@ -381,11 +381,11 @@ def on(
) -> AttrsDict | list:
"""Query database in `time[, file pattern, system]`.
A (only one) valid ``validity.jsonl`` file must exist in the directory
A (only one) valid ``validity.yaml`` file must exist in the directory
to specify a validity mapping. This functionality relies on the
:class:`.catalog.Catalog` class.
The JSONL specification is documented at `this link
The YAML specification is documented at `this link
<https://legend-exp.github.io/legend-data-format-specs/dev/metadata/#Specifying-metadata-validity-in-time-(and-system)>`_.
The special ``$_`` string is expanded to the directory containing the
@@ -401,12 +401,15 @@
system: 'all', 'phy', 'cal', 'lar', ...
query only a data taking "system".
"""
jsonl = self.__path__ / "validity.jsonl"
if not jsonl.is_file():
msg = f"no validity.jsonl file found in {self.__path__!s}"
for ext in utils.__file_extensions__["yaml"]:
yml = self.__path__ / f"validity{ext}"
if yml.is_file():
break
if not yml.is_file():
msg = f"no validity.yaml / validity.yml file found in {self.__path__!s}"
raise RuntimeError(msg)

file_list = Catalog.get_files(str(jsonl), timestamp, system)
file_list = Catalog.get_files(str(yml), timestamp, system)
# select only files matching pattern if specified
if pattern is not None:
c = re.compile(pattern)
53 changes: 33 additions & 20 deletions tests/test_jsondb.py
@@ -19,7 +19,7 @@ def test_props():
# test subst_vars
Props.subst_vars(test_dict, var_values={"_": str(Path(__file__).parent / "testdb")})
assert test_dict["filepath"] == str(
Path(__file__).parent / "testdb/dir1/file3.json"
Path(__file__).parent / "testdb/dir1/file3.yaml"
)

test_dict2 = Props.read_from(str(Path(__file__).parent / "testdb/file3.json"))
@@ -43,7 +43,7 @@ def test_props():
)
assert test_dict["data"] == 3
assert test_dict["filepath"] == str(
Path(__file__).parent / "testdb/dir1/file3.json"
Path(__file__).parent / "testdb/dir1/file3.yaml"
)
with pytest.raises(KeyError):
test_dict["null_key"]
@@ -55,12 +55,12 @@ def test_access():
assert isinstance(jdb["file2.yaml"], AttrsDict)
assert isinstance(jdb["file1"], AttrsDict)
assert isinstance(jdb["dir1"], TextDB)
assert isinstance(jdb["dir1"]["file3.json"], AttrsDict)
assert isinstance(jdb["dir1"]["file3.yaml"], AttrsDict)
assert isinstance(jdb["dir1"]["file3"], AttrsDict)
assert isinstance(jdb["dir1/file3.json"], AttrsDict)
assert isinstance(jdb["dir1/file3.yaml"], AttrsDict)
assert isinstance(jdb["dir1"]["dir2"], TextDB)
assert isinstance(jdb["dir1"]["dir2"]["file4.json"], AttrsDict)
assert isinstance(jdb["dir1/dir2/file4.json"], AttrsDict)
assert isinstance(jdb["dir1"]["dir2"]["file4.yaml"], AttrsDict)
assert isinstance(jdb["dir1/dir2/file4.yaml"], AttrsDict)
assert jdb["file1.json"]["data"] == 1
assert isinstance(jdb["file1"]["group"], AttrsDict)

@@ -82,7 +82,7 @@ def test_access():
assert jdb.arrays[1].array[0] == 1
assert jdb.arrays[1].array[1].data == 2

assert jdb.file2.filepath == str(Path(__file__).parent / "testdb/dir1/file3.json")
assert jdb.file2.filepath == str(Path(__file__).parent / "testdb/dir1/file3.yaml")

with pytest.raises(ValueError):
TextDB("non-existent-db")
@@ -98,7 +98,7 @@ def test_access():
def test_keys():
jdb = TextDB(testdb, lazy=False)
assert sorted(jdb.keys()) == ["arrays", "dir1", "dir2", "file1", "file2", "file3"]
assert sorted(jdb.dir1.keys()) == ["dir2", "file3", "file5"]
assert sorted(jdb.dir1.keys()) == ["dir2", "file3", "file5", "file6", "validity"]

assert "arrays" in jdb

@@ -162,28 +162,33 @@ def test_scan():

def test_time_validity():
jdb = TextDB(testdb)
assert isinstance(jdb["dir1"].on("20220628T221955Z"), AttrsDict)
assert isinstance(jdb["dir1"].on("20230101T000001Z"), AttrsDict)

assert jdb["dir1"].on("20220628T221955Z")["data"] == 1
assert jdb.dir1.on("20220629T221955Z").data == 2
assert jdb["dir1"].on("20230101T000000Z")["data"] == 1
assert jdb.dir1.on("20230102T000000Z").data == 2
# time point in between
assert jdb["dir1"].on("20220628T233500Z")["data"] == 1
assert jdb["dir1"].on("20230101T120000Z")["data"] == 1
# time point after
assert jdb["dir1"].on("20220630T233500Z")["data"] == 2
assert jdb["dir1"].on("20230102T120000Z")["data"] == 2
# time point before
with pytest.raises(RuntimeError):
jdb["dir1"].on("20220627T233500Z")["data"]

# directory with no .jsonl
jdb["dir1"].on("20210101T000000Z")["data"]
# test remove functionality
assert jdb["dir1"].on("20230103T120000Z")["data"] == 1
# test reset functionality
assert jdb["dir1"].on("20230104T120000Z")["data"] == 3
# test replace functionality
assert jdb["dir1"].on("20230105T120000Z")["data"] == 1
# directory with no .yml
with pytest.raises(RuntimeError):
jdb["dir1"]["dir2"].on("20220627T233500Z")
jdb["dir1"]["dir2"].on("20230101T000001Z")

# invalid timestamp
with pytest.raises(ValueError):
jdb.dir1.on("20220627T2335002Z")
jdb.dir1.on("20230627T2335002Z")

# test usage of datetime object
tstamp = datetime(2022, 6, 28, 23, 35, 00, tzinfo=timezone.utc)
tstamp = datetime(2023, 6, 28, 23, 35, 00, tzinfo=timezone.utc)
assert jdb.dir1.on(tstamp).data == 1
assert jdb.dir1.on(tstamp, r"^file3.*", "all").data == 1

@@ -241,7 +246,15 @@ def test_merging():
jdb = TextDB(testdb, lazy=False)
j = jdb.dir1 | jdb.dir2
assert isinstance(j, AttrsDict)
assert sorted(j.keys()) == ["dir2", "file3", "file5", "file7", "file8"]
assert sorted(j.keys()) == [
"dir2",
"file3",
"file5",
"file6",
"file7",
"file8",
"validity",
]
assert hasattr(j, "dir2")
assert hasattr(j, "file8")

3 changes: 0 additions & 3 deletions tests/testdb/dir1/file3.json

This file was deleted.

1 change: 1 addition & 0 deletions tests/testdb/dir1/file3.yaml
@@ -0,0 +1 @@
data: 1
3 changes: 0 additions & 3 deletions tests/testdb/dir1/file5.json

This file was deleted.

1 change: 1 addition & 0 deletions tests/testdb/dir1/file5.yaml
@@ -0,0 +1 @@
data: 2
1 change: 1 addition & 0 deletions tests/testdb/dir1/file6.yaml
@@ -0,0 +1 @@
data: 3
2 changes: 0 additions & 2 deletions tests/testdb/dir1/validity.jsonl

This file was deleted.
