From 98a3a9614de3fa7e553d7c134764ac7abf7e5c73 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 9 Oct 2024 12:00:41 +0100 Subject: [PATCH] Add resources to provenance --- docs/provenance.md | 25 ++++++++++- python/CHANGELOG.rst | 4 ++ python/tests/test_provenance.py | 67 ++++++++++++++++++++++++++++- python/tskit/__init__.py | 4 ++ python/tskit/provenance.py | 35 ++++++++++++++- python/tskit/provenance.schema.json | 22 ++++++++++ python/tskit/util.py | 3 +- 7 files changed, 155 insertions(+), 5 deletions(-) diff --git a/docs/provenance.md b/docs/provenance.md index d2eed1fe92..76a6bd9ad0 100644 --- a/docs/provenance.md +++ b/docs/provenance.md @@ -80,13 +80,19 @@ To make things more concrete, let's consider an example: "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", "machine": "x86_64" } + }, + "resources": { + "elapsed_time": 12.34, + "user_time": 10.56, + "sys_time": 1.78, + "max_mem": 1048576 } } ``` This information records the provenance for a very simple msprime simulation. The record is a JSON -object with three mandatory fields ("software", "parameters" and "environment") -which we discuss separately in the following sections. +object with three mandatory fields ("software", "parameters" and "environment") and one optional +("resources") which we discuss separately in the following sections. (sec_provenance_software)= @@ -221,6 +227,21 @@ The `libraries` section captures information about important libraries that the primary software links against. There is no required structure. +(sec_provenance_resources)= + +## Resources + +The resources section captures details about the computational resources used during the execution of the software. {meth}`~tskit.provenance.get_resources()` can be used to populate this section, if called after useful work is complete. 
This section is optional and has the following fields, each of which is optional and may not be populated, depending on OS support: + + +- `elapsed_time`: The total elapsed time in seconds. +- `user_time`: The total user CPU time in seconds. +- `sys_time`: The total system CPU time in seconds. +- `max_mem`: The maximum memory usage in bytes. + +Including this information makes it easy for users of tree-sequence producing software to +account for resource usage across pipelines of tools. + (sec_provenance_schema)= ## Full schema diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 9815792123..79ecbfe81d 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -64,6 +64,10 @@ ``pack_untracked_polytomies`` allows large polytomies involving untracked samples to be summarised as a dotted line (:user:`hyanwong`, :issue:`3011` :pr:`3010`, :pr:`3012`) +- Add ``resources`` section to provenance schema, along with ``tskit.provenance.get_resources`` to + populate it. (:user:`benjeffery`, :pr:`3016`) + + -------------------- [0.5.8] - 2024-06-27 -------------------- diff --git a/python/tests/test_provenance.py b/python/tests/test_provenance.py index 0f7c662523..d5b26bb120 100644 --- a/python/tests/test_provenance.py +++ b/python/tests/test_provenance.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2020 Tskit Developers +# Copyright (c) 2018-2024 Tskit Developers # Copyright (C) 2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -26,6 +26,13 @@ import json import os import platform +import sys + +try: + import resource +except ImportError: + resource = None # resource.getrusage absent on windows + import msprime import pytest @@ -121,6 +128,37 @@ def test_extra_stuff(self): } tskit.validate_provenance(extra) + def test_resources(self): + resources = { + "schema_version": "1", + "software": {"name": "x", "version": "y"}, + "environment": {}, + "parameters": {}, + "resources": { + "elapsed_time": 1, 
"user_time": 2, + "sys_time": 3, + "max_memory": 4, + }, + } + tskit.validate_provenance(resources) + + def test_resources_error(self): + resources = { + "schema_version": "1", + "software": {"name": "x", "version": "y"}, + "environment": {}, + "parameters": {}, + "resources": { + "elapsed_time": "1", + "user_time": 2, + "sys_time": 3, + "max_memory": 4, + }, + } + with pytest.raises(tskit.ProvenanceValidationError): + tskit.validate_provenance(resources) + class TestOutputProvenance: """ @@ -178,6 +216,33 @@ def test_libraries(self): assert libs == env["libraries"] +class TestGetResources: + def test_used_resources_keys(self): + resources = provenance.get_resources() + assert "elapsed_time" in resources + assert "user_time" in resources + assert "sys_time" in resources + if resource is not None: + assert "max_mem" in resources + + def test_used_resources_values(self): + resources = provenance.get_resources() + assert isinstance(resources["elapsed_time"], float) + assert isinstance(resources["user_time"], float) + assert isinstance(resources["sys_time"], float) + assert resources["elapsed_time"] > 0.0001 + assert resources["user_time"] > 0.0001 + assert resources["sys_time"] > 0.0001 + if resource is not None: + assert isinstance(resources["max_mem"], int) + assert resources["max_mem"] > 1024 + + def test_used_resources_platform(self): + resources = provenance.get_resources() + if sys.platform != "darwin" and resource is not None: + assert resources["max_mem"] % 1024 == 0 + + class TestGetSchema: """ Ensure we return the correct JSON schema. diff --git a/python/tskit/__init__.py b/python/tskit/__init__.py index df920f9b45..7928b249d6 100644 --- a/python/tskit/__init__.py +++ b/python/tskit/__init__.py @@ -19,6 +19,8 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import time + import _tskit #: Special reserved value representing a null ID. 
@@ -91,3 +93,5 @@ from tskit.metadata import * # NOQA from tskit.text_formats import * # NOQA from tskit.intervals import RateMap # NOQA + +_START_TIME = time.time() diff --git a/python/tskit/provenance.py b/python/tskit/provenance.py index bc88e29f1a..8c233a0f7d 100644 --- a/python/tskit/provenance.py +++ b/python/tskit/provenance.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2023 Tskit Developers +# Copyright (c) 2018-2024 Tskit Developers # Copyright (c) 2016-2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -27,10 +27,18 @@ import json import os.path import platform +import sys +import time + +try: + import resource +except ImportError: + resource = None # resource.getrusage absent on windows import jsonschema import _tskit +import tskit import tskit.exceptions as exceptions from . import _version @@ -72,6 +80,31 @@ def get_environment(extra_libs=None, include_tskit=True): return env +def get_resources(): + """ + Returns a dict describing the resources used by the current process, suitable for + inclusion in the resources section of provenance records, see + :ref:`sec_provenance_resources`. This function should be run after the process has + completed its useful work, when provenance is being written. Note that this + returns data for the whole process, so it shouldn't be called in a library API that + could be called as part of a larger process. + """ + times = os.times() + ret = { + "elapsed_time": time.time() - tskit._START_TIME, + "user_time": times.user + times.children_user, + "sys_time": times.system + times.children_system, + } + if resource is not None: + # Don't report max memory on Windows. 
We could do this using the psutil lib, via + # psutil.Process(os.getpid()).get_ext_memory_info().peak_wset if demand exists + ret["max_mem"] = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + if sys.platform != "darwin": + ret["max_mem"] *= 1024 # Linux, FreeBSD et al. report in KB, not bytes + + return ret + + def get_provenance_dict(parameters=None): """ Returns a dictionary encoding an execution of tskit conforming to the diff --git a/python/tskit/provenance.schema.json b/python/tskit/provenance.schema.json index fd683fff9e..7134a16729 100644 --- a/python/tskit/provenance.schema.json +++ b/python/tskit/provenance.schema.json @@ -45,6 +45,28 @@ "type": "object" } } + }, + "resources": { + "description": "Resources used by this operation.", + "type": "object", + "properties": { + "elapsed_time": { + "description": "Wall clock time used in seconds.", + "type": "number" + }, + "user_time": { + "description": "User time used in seconds.", + "type": "number" + }, + "sys_time": { + "description": "System time used in seconds.", + "type": "number" + }, + "max_mem": { + "description": "Maximum memory used in bytes.", + "type": "number" + } + } } } } diff --git a/python/tskit/util.py b/python/tskit/util.py index 7f7a358fec..82436e22b6 100644 --- a/python/tskit/util.py +++ b/python/tskit/util.py @@ -216,7 +216,7 @@ def pack_arrays(list_of_lists, dtype=np.float64): """ Packs the specified list of numeric lists into a flattened numpy array of the specified dtype with corresponding offsets. See - :ref:`sec_encoding_ragged_columns` for details of this encoding of columns + :ref:`sec_encoding_ragged_columns` for details of this encoding of columns of variable length data. :param list[list] list_of_lists: The list of numeric lists to encode.