diff --git a/.github/workflows/fair-software.yml b/.github/workflows/fair-software.yml deleted file mode 100644 index dbeab05..0000000 --- a/.github/workflows/fair-software.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: fair-software - -on: - push: - branches: [main] - -jobs: - verify: - name: "fair-software" - runs-on: ubuntu-latest - steps: - - uses: fair-software/howfairis-github-action@0.2.1 - name: Measure compliance with fair-software.eu recommendations - env: - PYCHARM_HOSTED: "Trick colorama into displaying colored output" - with: - MY_REPO_URL: "https://github.com/${{ github.repository }}" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ba2784a..f736dd3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/commitizen-tools/commitizen - rev: 3.5.3 + rev: 3.6.0 hooks: - id: commitizen stages: [commit-msg] @@ -12,7 +12,7 @@ repos: # - id: poetry-export # args: ["-f", "requirements.txt", "-o", "requirements.txt"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.4.1 # Use the sha / tag you want to point at + rev: v1.5.1 # Use the sha / tag you want to point at hooks: - id: mypy additional_dependencies: [types-all] @@ -21,6 +21,9 @@ repos: hooks: - id: pydocstyle additional_dependencies: [tomli] + files: ^perun/ + args: + - --config=pyproject.toml - repo: https://github.com/asottile/seed-isort-config rev: v2.2.0 hooks: @@ -42,6 +45,6 @@ repos: - id: check-yaml - id: check-added-large-files - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 6.1.0 hooks: - id: flake8 diff --git a/README.md b/README.md index bdf6c34..9b8bb85 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,9 @@ Check out the [docs](https://perun.readthedocs.io/en/latest/)! 
- Measures energy consumption of Python scripts using Intel RAPL, Nvidia-NVML, and psutil - Handles MPI applications efficiently - Gathers data from hundreds of nodes and accumulates it efficiently - - Can be used as a command-line tool or as a function decorator in Python scripts + - Monitors individual functions using decorators + - Tracks energy usage of the application over multiple executions + - Easy to benchmark applications and functions ## Installation @@ -50,222 +52,64 @@ To use perun as a command-line tool, run the monitor subcommand followed by the $ perun monitor path/to/your/script.py [args] ``` -perun will output a file containing runtime, energy, and other information gathered while your script runs: +perun will output two files: an HDF5 file containing all the raw data that was gathered, and a text file with a summary of the results. -```console +```text PERUN REPORT -App name: script -Run ID: 2023-03-23T12:10:48.627214 -RUNTIME: 0:00:06.333626 s -CPU_UTIL: 64.825 % -MEM_UTIL: 0.563 % -NET_READ: 1.401 kB -NET_WRITE: 1.292 kB -DISK_READ: 174.633 MB -DISK_WRITE: 88.000 kB +App name: finetune_qa_accelerate +First run: 2023-08-15T18:56:11.202060 +Last run: 2023-08-17T13:29:29.969779 + + +RUN ID: 2023-08-17T13:29:29.969779 +| Round # | Host | RUNTIME | ENERGY | CPU_POWER | CPU_UTIL | GPU_POWER | GPU_MEM | DRAM_POWER | MEM_UTIL | +|----------:|:--------------------|:----------|:-----------|:------------|:-----------|:------------|:-----------|:-------------|:-----------| +| 0 | hkn0432.localdomain | 995.967 s | 960.506 kJ | 231.819 W | 3.240 % | 702.327 W | 55.258 GB | 29.315 W | 0.062 % | +| 0 | hkn0436.localdomain | 994.847 s | 960.469 kJ | 235.162 W | 3.239 % | 701.588 W | 56.934 GB | 27.830 W | 0.061 % | +| 0 | All | 995.967 s | 1.921 MJ | 466.981 W | 3.240 % | 1.404 kW | 112.192 GB | 57.145 W | 0.061 % | + +The application has run been run 7 times. 
Throught its runtime, it has used 3.128 kWh, released a total of 1.307 kgCO2e into the atmosphere, and you paid 1.02 € in electricity for it. ``` -### Function Decorator +Perun will keep track of the energy usage of your application over multiple runs. + +### Function Monitoring -To use perun as a function decorator in your Python script, import the monitor decorator and add it to the function you want to monitor: +Using a function decorator, information can be calculated about the runtime, power draw and component utilization while the function is executing. ```python import time -from perun.decorator import monitor +from perun import monitor @monitor() -def your_function(n: int): +def main(n: int): time.sleep(n) ``` -When you run your script, perun will output a report from the function: +After running the script with ```perun monitor```, the text report will add information about the monitored functions. -```console -python path/to/your/script.py -``` +```text +Monitored Functions -> :exclamation: Each time the function is run, perun will output a new report from the function. +| Round # | Function | Avg Calls / Rank | Avg Runtime | Avg Power | Avg CPU Util | Avg GPU Mem Util | +|----------:|:----------------------------|-------------------:|:----------------|:-----------------|:---------------|:-------------------| +| 0 | main | 1 | 993.323±0.587 s | 964.732±0.499 W | 3.244±0.003 % | 35.091±0.526 % | +| 0 | prepare_train_features | 88 | 0.383±0.048 s | 262.305±19.251 W | 4.541±0.320 % | 3.937±0.013 % | +| 0 | prepare_validation_features | 11 | 0.372±0.079 s | 272.161±19.404 W | 4.524±0.225 % | 4.490±0.907 % | +``` ### MPI -If your python application uses mpi4py, you don't need to change anything. Perun is able to handle MPI applications, and will gather statistics in all the utilized nodes. +Perun is compatible with MPI applications that make use of ```mpi4py```, and requires no changes in the code or in the perun configuration. 
Simply replace the ```python``` command with ```perun monitor```. ```console mpirun -n 8 perun monitor path/to/your/script.py ``` -or - -``` -mpirun -n 8 python path/to/your/script.py -``` - -## Usage - -### Subcommands - -Perun subcommands have some shared options that are typed before the subcommands. - -```console -Usage: perun [OPTIONS] COMMAND [ARGS]... - - Perun: Energy measuring and reporting tool. - -Options: - --version Show the version and exit. - -c, --configuration FILE Path to configuration file - -n, --app_name TEXT Name of the monitored application. The name - is used to distinguish between multiple - applications in the same directory. If left - empty, the filename will be used. - -i, --run_id TEXT Unique id of the latest run of the - application. If left empty, perun will use - the SLURM job id, or the current date. - --format [text|json|hdf5|pickle|csv|bench] - Report format. - --data_out DIRECTORY Where to save the output files, defaults to - the current working directory. - --raw Use the flag '--raw' if you need access to - all the raw data collected by perun. The - output will be saved on an hdf5 file on the - perun data output location. - --sampling_rate FLOAT Sampling rate in seconds. - --pue FLOAT Data center Power Usage Efficiency. - --emissions_factor FLOAT Emissions factor at compute resource - location. - --price_factor FLOAT Electricity price factor at compute resource - location. - --bench Activate benchmarking mode. - --bench_rounds INTEGER Number of rounds per function/app. - --bench_warmup_rounds INTEGER Number of warmup rounds per function/app. - -l, --log_lvl [DEBUG|INFO|WARN|ERROR|CRITICAL] - Loggging level - --help Show this message and exit. - -Commands: - export Export existing perun output file to another format. - monitor Gather power consumption from hardware devices while SCRIPT... - sensors Print sensors assigned to each rank by perun. - showconf Print current perun configuration in INI format. 
-``` - -### monitor - -Monitor energy usage of a python script. - -```console -Usage: perun monitor [OPTIONS] SCRIPT [SCRIPT_ARGS]... - - Gather power consumption from hardware devices while SCRIPT [SCRIPT_ARGS] is - running. - - SCRIPT is a path to the python script to monitor, run with arguments - SCRIPT_ARGS. - -Options: - --help Show this message and exit. -``` - -### sensors - -Print available monitoring backends and each available sensors for each MPI rank. - -```console -Usage: perun sensors [OPTIONS] - - Print sensors assigned to each rank by perun. - -Options: - --help Show this message and exit. -``` - -### export - -Export an existing perun output file to another format. - -```console -Usage: perun export [OPTIONS] INPUT_FILE OUTPUT_PATH - {text|json|hdf5|pickle|csv|bench} - - Export existing perun output file to another format. - -Options: - --help Show this message and exit. -``` - -### showconf - -Prints the current option configurations based on the global, local configurations files and command line options. - -```console -Usage: perun showconf [OPTIONS] - - Print current perun configuration in INI format. - -Options: - --default Print default configuration - --help Show this message and exit. -``` - -## Configuration - -There are multiple ways to configure perun, with a different level of priorities. - -- CMD Line options and Env Variables - - The highest priority is given to command line options and environmental variables. The options are shown in the command line section. The options can also be passed as environmental variables by adding the prefix 'PERUN' to them. Ex. "--format txt" -> PERUN_FORMAT=txt - -- Local INI file +## Docs - Perun will look into the cwd for ".perun.ini" file, where options can be fixed for the directory. 
- - Example: - - ```ini - [post-processing] - pue = 1.58 - emissions_factor = 0.262 - price_factor = 34.6 - - [monitor] - sampling_rate = 1 - - [output] - app_name - run_id - format = text - data_out = ./perun_results - depth - raw = False - - [benchmarking] - bench_enable = False - bench_rounds = 10 - bench_warmup_rounds = 1 - - [debug] - log_lvl = ERROR - ``` - - The location of the file can be changed using the option "-c" or "PERUN_CONFIGURATION". - -- Global INI file - - If the file ~/.config/perun.ini is found, perun will override the default configuration with the contents of the file. - -### Priority - -CMD LINE and ENV > Local INI > Global INI > Default options - - -## Data Output Structure - -When exporting data to machine readable formats like json, pickle, and hdf5, perun stores the data in a hierarchical format, with the application and individual runs at the root of data tree, and individual sensors and raw data a in the leafs. When processing, the data is propagated from the leafs (sensors), all the way to the root, where a aggregated statistics about the application are gatherd. - -
- -
+To get more information, check out our [docs page](https://perun.readthedocs.io/en/latest/). diff --git a/docs/configuration.rst b/docs/configuration.rst index bbb5f74..4171fd0 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -16,14 +16,13 @@ Options "pue", 1.58, "Power Usage Effectiveness: A measure of a data centers efficiency, calculated as PUE = Total facilitty energy / IT equipment energy" "emissions_factor", 417.80, "Average carbon intensity of electricity (gCO2e/kWh). Source: https://ourworldindata.org/grapher/carbon-intensity-electricity" - "price_factor", 32.51, "Power to Euros conversion factor (Cent/kWh). Source : https://www.stromauskunft.de/strompreise/" + "price_factor", 32.51, "Power to Currency conversion factor (Cent/kWh). Source : https://www.stromauskunft.de/strompreise/" + "price_unit", €, "Currency Icon" "sampling_rate", 1, "Seconds between measurements" - "app_name", None, "Name to identify the app. If **None**, name will be based on the file or function name. If **SLURM**, perun will look for the environmental variable **SLURM_JOB_NAME** and use that." + "app_name", None, "Name to identify the app. If **None**, name will be based on the file or function name." "run_id", None, "ID of the current run. If **None**, the current date and time will be used. If **SLURM**, perun will look for the environmental variable **SLURM_JOB_ID** and use that." "format", "text", "Output report format [text, pickle, csv, hdf5, json, bench]" "data_out", "./perun_results", "perun output location" - "raw", False, "If output file should include raw data" - "bench_enable", False, "Enable benchmarking mode. See :ref:`benchmarking`." - "bench_round", 10, "Number of times a benchmark is run" - "bench_warmup_rounds", 1, "Number of warmup rounds to run before starting the benchmarks." 
- "log_lvl", "ERROR", "Change logging output [DEBUG, INFO, WARNING, ERROR, CRITICAL]" + "rounds", 5, "Number of times the application is run" + "warmup_rounds", 1, "Number of warmup rounds to run before starting the benchmarks." + "log_lvl", "WARNING", "Change logging output [DEBUG, INFO, WARNING, ERROR, CRITICAL]" diff --git a/docs/install.rst b/docs/install.rst index c86619c..613ae80 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -15,6 +15,12 @@ or if you need the latests updates, you can install from the main branch of gith $ pip install https://github.com/Helmholtz-AI-Energy/perun +If you are going to work with MPI, you can install it as an extra dependency. + +.. code-block:: console + + $ pip install perun[mpi] + If you want to get the source code and modify it, you can clone the source code using git. .. code-block:: console @@ -60,7 +66,7 @@ GPU Supported backends: - - GPU power draw: `NVIDIA NVML `_ through pynvml. + - GPU power draw: `NVIDIA NVML `_ through nvidia-ml-py. 
DRAM ~~~~ diff --git a/docs/quickstart.rst b/docs/quickstart.rst index ebfa079..a2ca770 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -21,49 +21,66 @@ Once your code finishes running, you will find a new directory called ``perun_re PERUN REPORT - App name: script - Run ID: 2023-03-23T12:10:48.627214 - RUNTIME: 0:00:06.333626 s - CPU_UTIL: 64.825 % - MEM_UTIL: 0.563 % - NET_READ: 1.401 kB - NET_WRITE: 1.292 kB - DISK_READ: 174.633 MB - DISK_WRITE: 88.000 kB + App name: finetune_qa_accelerate + First run: 2023-08-15T18:56:11.202060 + Last run: 2023-08-17T13:29:29.969779 + + + RUN ID: 2023-08-17T13:29:29.969779 + + | Round # | Host | RUNTIME | ENERGY | CPU_POWER | CPU_UTIL | GPU_POWER | GPU_MEM | DRAM_POWER | MEM_UTIL | + |----------:|:--------------------|:----------|:-----------|:------------|:-----------|:------------|:-----------|:-------------|:-----------| + | 0 | hkn0432.localdomain | 995.967 s | 960.506 kJ | 231.819 W | 3.240 % | 702.327 W | 55.258 GB | 29.315 W | 0.062 % | + | 0 | hkn0436.localdomain | 994.847 s | 960.469 kJ | 235.162 W | 3.239 % | 701.588 W | 56.934 GB | 27.830 W | 0.061 % | + | 0 | All | 995.967 s | 1.921 MJ | 466.981 W | 3.240 % | 1.404 kW | 112.192 GB | 57.145 W | 0.061 % | + + The application has run been run 7 times. Throught its runtime, it has used 3.128 kWh, released a total of 1.307 kgCO2e into the atmosphere, and you paid 1.02 € in electricity for it. + .. note:: Depending on the hardware you are running and the available interfaces, the output might look different than the one listed here. For more details on the support data sources used by perun, check the :ref:`dependencies` section -perun can also be used as a function decorator to target specific code regions. +The text report summarizes the data gathered throughout the application run for each individual host, and averages the power consumption over the full runtime. 
Perun also makes all the raw data gathered from the hardware available in an HDF5 file that is located in the same results folder. To explore the data manually, we recommend the Visual Studio Code extension `H5Web `_, to process it with python using `h5py `_, or to export using the :code:`perun export` subcommand (see :ref:`usage`). + +The hdf5 file collects information over multiple runs of the application, adding a new section every time the application is executed using perun. This simplifies studying the behaviour of the application over time, and makes the last line in the summary report possible. + +Function Monitoring +------------------- + +To get information about the power consumption, runtime and hardware utilization of individual functions while the application is being monitored, perun includes a function decorator. .. code-block:: python import time - from perun.decorator import monitor + from perun import monitor @monitor() - def your_sleep_function(n: int): + def main(n: int): time.sleep(n) -If you need more detailed output or access to the raw data that perun gathered, you can configure python both in the command line or as decorator to get the data you want. +This will add a new section to the text report and to the hdf5 file with the individual function profiles. +.. code-block:: -.. 
code-block:: console + Monitored Functions - $ perun --format json --raw monitor you_script.py + | Round # | Function | Avg Calls / Rank | Avg Runtime | Avg Power | Avg CPU Util | Avg GPU Mem Util | + |----------:|:----------------------------|-------------------:|:----------------|:-----------------|:---------------|:-------------------| + | 0 | main | 1 | 993.323±0.587 s | 964.732±0.499 W | 3.244±0.003 % | 35.091±0.526 % | + | 0 | prepare_train_features | 88 | 0.383±0.048 s | 262.305±19.251 W | 4.541±0.320 % | 3.937±0.013 % | + | 0 | prepare_validation_features | 11 | 0.372±0.079 s | 272.161±19.404 W | 4.524±0.225 % | 4.490±0.907 % | -The same options can be given to the function decorator. -.. code-block:: python +MPI Compatibility +----------------- - import time - from perun.decorator import monitor +Perun is capable of handling applications that make use of MPI using the `mpi4py `_ library without any need to reconfigure or modify the existing code. - @monitor(format="hdf5", sampling_rate=2) - def your_sleep_function(n: int): - time.sleep(n) +.. code-block:: console + + mpirun -n 4 perun monitor mpi_app.py -perun has more subcommand and configuration, to accomodate various use cases and workflows. For more information, check out the :ref:`usage` and :ref:`configuration` sections of the documentation. +Perun has multiple subcommands and configuration options to accomodate various use cases and workflows. For more information, check out the :ref:`usage` and :ref:`configuration` sections of the documentation, or use the help flag :code:`-h` in the command line. diff --git a/docs/usage.rst b/docs/usage.rst index fe2c461..1aa371c 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -3,15 +3,25 @@ Usage ===== -.. _cmd-line: +The `perun` command contains other subcommands that provide extra functionality. The command itself supports a couple of options, like setting a custom configuration or changing the logging level of the application. 
The information can be reviewed using the `--help` command flag. + +.. code-block:: console + + Usage: perun [OPTIONS] COMMAND [ARGS]... + + Perun: Energy measuring and reporting tool. + + Options: + --version Show the version and exit. + -c, --configuration FILE Path to configuration file + -l, --log_lvl [DEBUG|INFO|WARN|ERROR|CRITICAL] + Loggging level + --help Show this message and exit. -Command line ------------- -perun includes a command line tool which, besides the ``monitor`` subcommand, includes extra utilities that facilitates configuring perun and the data it collects. To get help regarding the perun command and any of its subcommands, the option ``--help`` is available for all of them. -monitor -~~~~~~~ +Monitor +----------- To start monitoring your python applications, simply use @@ -25,26 +35,88 @@ This also applies MPI applications $ mpirun -N 4 perun monitor your_app.py -perun will not disrupt your applications usage of MPI, and will quietly collect hardware data from all the different nodes being used. At the end, you will get a single report based on the data from all the nodes and their hardware components. +perun will not disrupt your application's usage of MPI, and will collect hardware data from all the different nodes being used. At the end, you will get a single report based on the data from all the nodes and their hardware components. -To modify the peruns behaviour, the command accepts options right after the perun command, like this: +To modify perun's behaviour, the subcommand accepts many configuration options that alter the monitoring or post-processing behaviour. .. code-block:: console - $ perun --format json --raw --sampling_rate 5 monitor your_app.py + $ perun monitor --format json --sampling_rate 5 your_app.py -Or if environmental variables are prefered: +The options can also be set as environmental variables. .. code-block:: console $ PERUN_FORMAT=json perun monitor your_app.py + A combination of both also works. 
If you have a set of options that works for your workflow, you can save them on '.perun.ini' file, and perun will use them automatically. For more info on the configuration options and file, check the :ref:`configuration` section. +The monitor command will by default output two files. The first one is an HDF5 file, named after the monitored python script, which contains all the information gathered by perun over multiple runs. The data has a tree structure, where the root node contains a summary of all the application runs. Subsequent nodes contain information about individual `perun monitor` runs. The second one is a text file with a summary of the results. + +Application Name and Run ID +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each time you use perun with a python script, perun assigns it an application name and a run id. By default, the application name will be the name of the python script ('train.py' will have the name 'train'). The ID is an identifier of the particular execution of the application with perun, and it is by default the current date and time in ISO format. Both the application name and the run id will be used to name output files and internally by perun, and can be configured using command line arguments (`-n`, `--app_name` for the name, `-i`, `--run_id` for the id) or using the :ref:`configuration` file. + +Multiple Rounds +~~~~~~~~~~~~~~~ + +perun has a special option `--rounds` which will run the application for multiple rounds using a single command, and generate statistics about the runtime, power consumption and device utilization. All the rounds will be associated with the same run id. + +.. code-block:: console + + $ perun monitor --rounds 5 your_app.py + + +Additionally, there is a `--warmup_rounds` option if you want the application to execute without monitoring before the *real* rounds. + +Monitoring Functions +~~~~~~~~~~~~~~~~~~~~ + +Perun includes a function decorator that provides additional information about individual functions in the application. 
The decorator itself only stores timestamps at the start and the end of the function execution. Those timestamps are then used to extract the relevant information out of the sampled hardware data. + +.. code-block:: python + + from perun import monitor + + @monitor() + def main(): + your_code_goes.here() + +Internally, perun will use the function name to identify it, and the internal id can be set using the option `region-name`. + +.. _format: + +Format +~~~~~~ + +In addition to HDF5 and text, perun supports *json*, *pickle*, *csv* and a *bench* format. It can be set up from the start using the `--format` option in the monitor subcommand, or it can be generated later using the :ref:`export` command. + +**hdf5** + +This is the default format, and the data structures on the file mirror the perun :py:class:`perun.data_model.data.DataNode` objects one-to-one. It includes the raw data and metadata of each device, computational node, execution round, monitored function, run id and application. + +**json** + +Similar to hdf5, it keeps the internal perun data structure, but written as a json file. + +**pickle** + +Pickle is part of the python standard library and it stores python objects in a binary format, meaning you can save the :py:class:`perun.data_model.data.DataNode` tree that is used by perun. + +**csv** + +This will create a csv table with all the raw data of an individual run. This will not include any device metadata. + +**bench** + +This format generates a json file that is meant to be used together with `Continuous Benchmark Github Action `_. It saves *customLessIsBetter* data points from the latest run, including monitored functions within the run, which can be used by the action to alert developers of performance degradations and create plots. + sensors -~~~~~~~ +------- -To get a quikc overview of which interfaces and information perun has access to, you can use the ``sensors`` subcommand. 
+To get a quick overview of which interfaces and information perun has access to, you can use the ``sensors`` subcommand. .. code-block:: console @@ -66,29 +138,34 @@ To get a quikc overview of which interfaces and information perun has access to, perun will print an overview of the interfaces and individual "sensors" available on each mpi rank, and to which host node the mpi ranks belong to. + export -~~~~~~ +------ + +.. _export: + -perun supports multiple ouput formats, and can transform its data from one format to another (with some exceptions). +You can export existing perun output files to other formats using the export command. .. code-block:: console - $ perun export perun_results/forward_22149666.hdf5 exported_results/ csv + $ perun export perun_results/forward_22149666.hdf5 csv -The command takes as a first argument one of the output files of perun, as a second argument where the location where the new file will be saved, and the format it will be exported to. The input file needs to be a ``json``, ``hdf5`` or ``pickle`` formated file, as perun can easily reconstruct the original data structures from those files. The output format can be ``text``, ``json``, ``hdf5``, ``pickle``, ``csv`` and ``bench`` +The command takes as a first argument one of the output files of perun, and as a second argument the format it will be exported to. The input file needs to be a ``json``, ``hdf5`` or ``pickle`` formated file, as the :py:class:`perun.data_model.data.DataNode` tree can only be reconstructed from those formats. The output format can be ``text``, ``json``, ``hdf5``, ``pickle``, ``csv`` and ``bench``. showconf -~~~~~~~~ +-------- To get a quick overview of the current configuration that perun is using, use the ``showconf`` subcommand. .. 
code-block:: console - $ perun showconf + $ perun showconf [post-processing] pue = 1.58 - emissions_factor = 0.262 - price_factor = 34.6 + emissions_factor = 417.8 + price_factor = 32.51 + price_unit = € [monitor] sampling_rate = 1 @@ -98,17 +175,13 @@ To get a quick overview of the current configuration that perun is using, use th run_id format = text data_out = ./perun_results - depth - raw = False [benchmarking] - bench_enable = False - bench_rounds = 10 - bench_warmup_rounds = 1 + rounds = 1 + warmup_rounds = 0 [debug] - log_lvl = ERROR - + log_lvl = WARNING The command will print the current perun configuration in ``.ini`` format, which can be used as a starting point for your own ``.perun.ini`` file. @@ -116,74 +189,60 @@ The command will print the current perun configuration in ``.ini`` format, which $ perun showconf > .perun.ini - To get the default configuration, simply add the ``--default`` flag. .. code-block:: console $ perun showconf --default - - - -Decorator ---------- - -To monitor a particular part of your code, you can use the ``@monitor`` decorator on the desired function. - - -.. code-block:: python - - import time - from perun.decorator import monitor - - @monitor() - def your_sleep_function(n: int): - time.sleep(n) - - -The decorator accepts the same options as the configuration file or the command line. - -.. code-block:: python - - import time - from perun.decorator import monitor - - @monitor(format="csv", pue=1.05) - def your_sleep_function(n: int): - time.sleep(n) - -.. attention:: - - perun will generate an output file each time an the function is called. - - - -.. _benchmarking: - -Benchmarking ------------- - -*Benchmarking mode* can be enabled by using the special flag ``--bench`` or using the argument ``bench=True`` on the decorator. Instead of running the code a single time, perun will instead run a your code a configurable number of times and will collect statistics about it. - -.. 
code-block:: console - - $ perun --bench monitor your_script.py - -This will change the what information gets retured by perun, with most of the output formats having to adjust for the extra information. This is the text output generated by perun when using benchmarking mode: - -.. code-block:: console - - App name: main - Run ID: 2023-03-22T17:10:31.825947 - +-----------------+---------+---------+----------+---------+ - | Name | mean | std | max | min | - +-----------------+---------+---------+----------+---------+ - | RUNTIME [s] | 10.009 | 0.001 | 10.010 | 10.008 | - | POWER [W] | 6.945 | 0.177 | 7.288 | 6.708 | - | CPU_UTIL [%] | 13.509 | 3.944 | 20.291 | 9.645 | - | MEM_UTIL [%] | 0.500 | 0.001 | 0.501 | 0.498 | - | DISK_READ [kB] | 1.200 | 1.833 | 4.000 | 0.000 | - | DISK_WRITE [kB] | 767.200 | 590.400 | 1964.000 | 60.000 | - | ENERGY [J] | 69.576 | 1.849 | 73.203 | 67.112 | - +-----------------+---------+---------+----------+---------+ +metadata +-------- + +Similar to the `sensors` command, metadata will print a json object with some information about the system. It can be usefull to keep track of software dependencies, changes in the OS or the python version. + +.. 
code-block:: json + + { + "juan-20w000p2ge": { + "libc_ver": "glibc 2.38", + "_node": "juan-20w000p2ge", + "architecture": "64bit ELF", + "system": "Linux", + "node": "juan-20w000p2ge", + "release": "6.1.44-1-MANJARO", + "version": "#1 SMP PREEMPT_DYNAMIC Wed Aug 9 09:02:26 UTC 2023", + "machine": "x86_64", + "_sys_version": "CPython 3.8.16 default Mar 3 2023 09:25:30 GCC 12.2.1 20230201", + "python_implementation": "CPython", + "python_version": "3.8.16", + "python_version_tuple": "3 8 16", + "python_build": "default Mar 3 2023 09:25:30", + "python_compiler": "GCC 12.2.1 20230201", + "platform": "Linux-6.1.44-1-MANJARO-x86_64-with-glibc2.34", + "backends": { + "Intel RAPL": {}, + "PSUTIL": { + "DISK_READ_BYTES": { + "source": "psutil 5.9.5" + }, + "RAM_USAGE": { + "total": "16481222656", + "available": "7718731776", + "source": "psutil 5.9.5" + }, + "CPU_USAGE": { + "source": "psutil 5.9.5" + }, + "NET_WRITE_BYTES": { + "source": "psutil 5.9.5" + }, + "DISK_WRITE_BYTES": { + "source": "psutil 5.9.5" + }, + "NET_READ_BYTES": { + "source": "psutil 5.9.5" + } + } + } + } + } diff --git a/example.perun.ini b/example.perun.ini index f9ad7cc..d296ea6 100644 --- a/example.perun.ini +++ b/example.perun.ini @@ -1,7 +1,7 @@ [post-processing] pue = 1.58 -emissions-factor = 0.355 -price-factor = 41.59 +emissions_factor = 417.8 +price_factor = 32.51 [monitor] sampling_rate = 1 @@ -9,12 +9,10 @@ sampling_rate = 1 [output] format = text data_out = ./perun_results -raw = True [benchmarking] -bench_enabled = False -bench_rounds = 10 -bench_warmup-rounds = 5 +rounds = 1 +warmup_rounds = 0 [debug] log_lvl = ERROR diff --git a/perun/__init__.py b/perun/__init__.py index 13b7e93..2fc7901 100644 --- a/perun/__init__.py +++ b/perun/__init__.py @@ -6,12 +6,4 @@ log = init_logging(config.get("debug", "log_lvl")) -import os - -os.environ["OMPI_MCA_mpi_warn_on_fork"] = "0" -os.environ["IBV_FORK_SAFE"] = "1" -os.environ["RDMAV_FORK_SAFE"] = "1" - -from perun.comm import Comm - -COMM_WORLD = 
Comm() +from perun.api.decorator import monitor diff --git a/perun/__main__.py b/perun/__main__.py index 48e6abc..1d11694 100644 --- a/perun/__main__.py +++ b/perun/__main__.py @@ -1,5 +1,5 @@ """perun module.""" if __name__ == "__main__": - from perun.cli import cli + from perun.api.cli import cli cli() diff --git a/perun/api/__init__.py b/perun/api/__init__.py new file mode 100644 index 0000000..24b043b --- /dev/null +++ b/perun/api/__init__.py @@ -0,0 +1 @@ +"""API module.""" diff --git a/perun/cli.py b/perun/api/cli.py similarity index 61% rename from perun/cli.py rename to perun/api/cli.py index 4754d73..2d0d968 100644 --- a/perun/cli.py +++ b/perun/api/cli.py @@ -2,16 +2,18 @@ Uses click https://click.palletsprojects.com/en/8.1.x/ to manage complex cmdline configurations. """ +import json +import sys from pathlib import Path -from typing import Dict, List, Optional +from typing import Optional import click import perun from perun import log from perun.configuration import config, read_custom_config, save_to_config_callback -from perun.io.io import IOFormat, exportTo, importFrom -from perun.util import getHostMetadata +from perun.io.io import IOFormat +from perun.perun import Perun @click.group() @@ -26,6 +28,107 @@ callback=read_custom_config, expose_value=False, ) +@click.option( + "-l", + "--log_lvl", + type=click.Choice(["DEBUG", "INFO", "WARN", "ERROR", "CRITICAL"]), + help="Loggging level", + callback=save_to_config_callback, + expose_value=False, +) +def cli(): + """Perun: Energy measuring and reporting tool.""" + log.setLevel(config.get("debug", "log_lvl")) + + +@cli.command() +@click.option( + "--default", + is_flag=True, + show_default=True, + default=False, + help="Print default configuration", +) +def showconf(default: bool): + """Print current perun configuration in INI format.""" + import sys + + from perun.configuration import _default_config + + if default: + config.read_dict(_default_config) + config.write(sys.stdout) + else: + 
config.write(sys.stdout) + + +@cli.command() +def sensors(): + """Print sensors assigned to each rank by perun.""" + perun = Perun(config) + if perun.comm.Get_rank() == 0: + for rank, bes in enumerate(perun.sensors_config): + click.echo(f"Rank: {rank}") + for key, items in bes.items(): + if len(items) > 0: + click.echo(f" {key}:") + for device in items: + click.echo(f" {device}") + click.echo("") + + click.echo("Hostnames: ") + for host, ranks in perun.host_rank.items(): + click.echo(f" {host}: {ranks}") + + +@cli.command() +def metadata(): + """Print global metadata dictionaries in json format.""" + perun = Perun(config) + + hostMD = perun.l_host_metadata + hostMD["backends"] = perun.l_backend_metadata + allHostsMD = perun.comm.gather(hostMD, root=0) + + if perun.comm.Get_rank() == 0 and allHostsMD: + metadataDict = {} + for host, assignedRanks in perun.host_rank.items(): + metadataDict[host] = allHostsMD[assignedRanks[0]] + + json.dump(metadataDict, sys.stdout, indent=4) + + +@cli.command() +@click.argument("input_file", type=click.Path(exists=True)) +@click.argument( + "output_format", + type=click.Choice([format.value for format in IOFormat]), +) +@click.option( + "-i", + "--id", + "mr_id", + type=str, + default=None, +) +def export(input_file: str, output_format: str, mr_id: Optional[str]): + """Export existing perun output file to another format.""" + in_file = Path(input_file) + if not in_file.exists(): + click.echo("File does not exist.", err=True) + return + + perun = Perun(config) + + out_path = in_file.parent + inputFormat = IOFormat.fromSuffix(in_file.suffix) + out_format = IOFormat(output_format) + + dataNode = perun.import_from(in_file, inputFormat) + perun.export_to(out_path, dataNode, out_format, mr_id) + + +@cli.command(context_settings={"ignore_unknown_options": True}) # Output option @click.option( "-n", @@ -42,6 +145,7 @@ expose_value=False, ) @click.option( + "-f", "--format", type=click.Choice([format.value for format in IOFormat]), 
help="Report format.", @@ -55,14 +159,6 @@ callback=save_to_config_callback, expose_value=False, ) -@click.option( - "--raw", - default=True, - help="Use the flag '--raw' if you need access to all the raw data collected by perun. The output will be saved on an hdf5 file on the perun data output location.", - is_flag=True, - callback=save_to_config_callback, - expose_value=False, -) # Sampling Options @click.option( "--sampling_rate", @@ -95,146 +191,19 @@ ) # Benchmarking @click.option( - "--bench", - "bench_enable", - is_flag=True, - help="Activate benchmarking mode.", - callback=save_to_config_callback, - expose_value=False, -) -@click.option( - "--bench_rounds", + "--rounds", type=int, - help="Number of rounds per function/app.", + help="Number of rounds to run the app.", callback=save_to_config_callback, expose_value=False, ) @click.option( - "--bench_warmup_rounds", + "--warmup_rounds", type=int, - help="Number of warmup rounds per function/app.", - callback=save_to_config_callback, - expose_value=False, -) -# @click.option( -# "--bench_metrics", -# multiple=True, -# help="Metrics to output. 
Only relevant with bench_minimal_format enabled", -# callback=save_to_config_callback, -# expose_value=False, -# ) -# Debug Options -@click.option( - "-l", - "--log_lvl", - type=click.Choice(["DEBUG", "INFO", "WARN", "ERROR", "CRITICAL"]), - help="Loggging level", + help="Number of warmup rounds to run the app.", callback=save_to_config_callback, expose_value=False, ) -def cli(): - """Perun: Energy measuring and reporting tool.""" - log.setLevel(config.get("debug", "log_lvl")) - - -@cli.command() -@click.option( - "--default", - is_flag=True, - show_default=True, - default=False, - help="Print default configuration", -) -def showconf(default: bool): - """Print current perun configuration in INI format.""" - import sys - - from perun.configuration import _default_config - - if default: - config.read_dict(_default_config) - config.write(sys.stdout) - else: - config.write(sys.stdout) - - -@cli.command() -@click.option( - "-v", - "--verbose", - is_flag=True, - default=False, - help="Print all the available system information in json format.", -) -def sensors(verbose: bool): - """Print sensors assigned to each rank by perun.""" - from perun import COMM_WORLD - from perun.backend import backends - from perun.coordination import getGlobalSensorRankConfiguration - - globalHostRank, globalSensorConfig = getGlobalSensorRankConfiguration( - COMM_WORLD, backends - ) - - if verbose: - import json - import sys - - hostMD = getHostMetadata(globalSensorConfig[0]) - allHostsMD: Optional[List[Dict]] = COMM_WORLD.gather(hostMD, root=0) - - if COMM_WORLD.Get_rank() == 0 and allHostsMD: - metadataDict = {} - for host, assignedRanks in globalHostRank.items(): - metadataDict[host] = allHostsMD[assignedRanks[0]] - - json.dump(metadataDict, sys.stdout, indent=4) - - else: - if COMM_WORLD.Get_rank() == 0: - for rank, bes in enumerate(globalSensorConfig): - click.echo(f"Rank: {rank}") - for key, items in bes.items(): - if len(items) > 0: - click.echo(f" {key}:") - for device in items: - 
click.echo(f" {device}") - click.echo("") - - click.echo("Hostnames: ") - for host, ranks in globalHostRank.items(): - click.echo(f" {host}: {ranks}") - - for b in backends: - b.close() - - -@cli.command() -@click.argument("input_file", type=click.Path(exists=True)) -@click.argument("output_path", type=click.Path(exists=True)) -@click.argument( - "output_format", - type=click.Choice([format.value for format in IOFormat]), -) -def export(input_file: str, output_path: str, output_format: str): - """Export existing perun output file to another format.""" - in_file = Path(input_file) - if not in_file.exists(): - click.echo("File does not exist.", err=True) - return - - out_path = Path(output_path) - if not out_path.parent.exists(): - click.echo("Output path does not exist", err=True) - return - - inputFormat = IOFormat.fromSuffix(in_file.suffix) - out_format = IOFormat(output_format) - dataNode = importFrom(in_file, inputFormat) - exportTo(out_path, dataNode, out_format, rawData=True) - - -@cli.command(context_settings={"ignore_unknown_options": True}) @click.argument("script", type=click.Path(exists=True)) @click.argument("script_args", nargs=-1) def monitor( @@ -247,9 +216,10 @@ def monitor( SCRIPT is a path to the python script to monitor, run with arguments SCRIPT_ARGS. 
""" # Setup script arguments + import sys - from perun.perun import monitor_application + perun = Perun(config) filePath: Path = Path(script) log.debug(f"Script path: {filePath}") @@ -259,7 +229,7 @@ def monitor( sys.path.insert(0, str(filePath.parent.absolute())) - monitor_application(filePath) + perun.monitor_application(filePath) def main(): diff --git a/perun/api/decorator.py b/perun/api/decorator.py new file mode 100644 index 0000000..79df36f --- /dev/null +++ b/perun/api/decorator.py @@ -0,0 +1,30 @@ +"""Decorator module.""" +import functools +from typing import Optional + +from perun import log +from perun.perun import Perun + + +def monitor(region_name: Optional[str] = None): + """Decorate function to monitor its energy usage.""" + + def inner_function(func): + @functools.wraps(func) + def func_wrapper(*args, **kwargs): + # Get custom config and kwargs + region_id = region_name if region_name else func.__name__ + + perun = Perun() # type: ignore + + log.info(f"Rank {perun.comm.Get_rank()}: Entering '{region_id}'") + perun.local_regions.addEvent(region_id) # type: ignore + func_result = func(*args, **kwargs) + perun.local_regions.addEvent(region_id) # type: ignore + log.info(f"Rank {perun.comm.Get_rank()}: Leaving '{region_id}'") + + return func_result + + return func_wrapper + + return inner_function diff --git a/perun/backend/__init__.py b/perun/backend/__init__.py index 5e91144..bbc4de9 100644 --- a/perun/backend/__init__.py +++ b/perun/backend/__init__.py @@ -1,6 +1 @@ """Backend module.""" -# flake8: noqa -from .backend import Backend, backend, backends -from .intel_rapl import IntelRAPLBackend -from .nvml import NVMLBackend -from .psutil import PSUTILBackend diff --git a/perun/backend/backend.py b/perun/backend/backend.py index abda2cb..c7309b4 100644 --- a/perun/backend/backend.py +++ b/perun/backend/backend.py @@ -1,5 +1,4 @@ """Backend module.""" -import functools from abc import ABC, abstractmethod from typing import Dict, List, Set @@ -11,6 +10,7 
@@ class Backend(ABC): """Abstract backend class.""" + id: str = "abstract_backend" name: str = "Abstract backend class" description: str = "Abstract backend class description" @@ -19,6 +19,7 @@ def __init__(self) -> None: super().__init__() self.devices: Dict = {} self.setup() + log.info(f"Initialized {self.name} backend") @abstractmethod def visibleSensors(self) -> Set[str]: @@ -27,14 +28,12 @@ def visibleSensors(self) -> Set[str]: @abstractmethod def getSensors(self, deviceList: Set[str]) -> List[Sensor]: - """ - Return device objects based on the provided list of device ids. - - Args: - deviceList (Set[str]): List with wanted device ids + """Return device objects based on the provided list of device ids. - Returns: - List[Device]: List of device objects + :param deviceList: List with wanted device ids + :type deviceList: Set[str] + :return: List of device objects + :rtype: List[Sensor] """ pass @@ -47,30 +46,3 @@ def close(self): def setup(self): """Perform backend setup.""" pass - - -backends: List[Backend] = [] - - -def backend(cls): - """Backend class decorator. - - Marks a class a singleton, and if setup succeeds, - gets added to the backends list. 
- """ - - @functools.wraps(cls) - def backend_wrapper(*args, **kwargs): - if not backend_wrapper.instance: - try: - backend_wrapper.instance = cls(*args, **kwargs) - backends.append(backend_wrapper.instance) - except ImportError as ie: - log.warning(f"Missing dependencies for backend {cls.__name__}") - log.warning(ie) - except Exception as e: - log.warning(f"Unknown error loading dependecy {cls.__name__}") - log.warning(e) - - backend_wrapper.instance = None - return backend_wrapper diff --git a/perun/backend/intel_rapl.py b/perun/backend/intel_rapl.py index 3481893..3abcceb 100644 --- a/perun/backend/intel_rapl.py +++ b/perun/backend/intel_rapl.py @@ -1,5 +1,6 @@ """Defines Intel RAPL related classes.""" import os +import pprint as pp import re from io import IOBase from pathlib import Path @@ -9,10 +10,10 @@ import numpy as np from perun import log +from perun.backend.backend import Backend from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit - -from ..data_model.sensor import DeviceType, Sensor -from .backend import Backend, backend +from perun.data_model.sensor import DeviceType, Sensor +from perun.util import singleton RAPL_PATH = "/sys/class/powercap/" @@ -20,29 +21,26 @@ SUBDIR_RGX = r"intel-rapl:\d:\d$" -@backend +@singleton class IntelRAPLBackend(Backend): """Intel RAPL as a source of cpu and memory devices. 
Uses pyRAPL to gather device information and creates metrics for each available device """ + id = "intel_rapl" name = "Intel RAPL" description = "Reads energy usage from CPUs and DRAM using Intel RAPL" - def __init__(self) -> None: - """Init IntelRAPLBackend.""" - super().__init__() - log.info("Initialized Intel RAPL") - def setup(self): """Check Intel RAPL access.""" cpuInfo = cpuinfo.get_cpu_info() self.metadata = {} for key, value in cpuInfo.items(): - self.metadata[key] = value + if value is not None and value != "": + self.metadata[key] = str(value) - log.debug(f"CPU info metadata: {self.metadata}") + log.debug(f"CPU info metadata: {pp.pformat(self.metadata)}") raplPath = Path(RAPL_PATH) @@ -85,7 +83,8 @@ def func() -> np.number: if devType != DeviceType.OTHER: with open(child / "max_energy_range_uj", "r") as file: - max_energy = np.uint64(file.readline().strip()) + line = file.readline().strip() + max_energy = np.uint64(line) dataType = MetricMetaData( Unit.JOULE, Magnitude.MICRO, @@ -130,9 +129,10 @@ def func() -> np.number: if devType != DeviceType.OTHER: with open( - child / "max_energy_range_uj", "r" + grandchild / "max_energy_range_uj", "r" ) as file: - max_energy = np.uint64(file.readline().strip()) + line = file.readline().strip() + max_energy = np.uint64(line) dataType = MetricMetaData( Unit.JOULE, @@ -166,37 +166,39 @@ def func() -> np.number: file.close() del self.devices[pkg.id] - log.debug(f"IntelRapl devices {self.devices}") + log.debug( + f"IntelRapl devices {pp.pformat([deviceId for deviceId in self.devices])}" + ) def close(self) -> None: """Backend shutdown code (does nothing for intel rapl).""" - log.info("Closing files") + log.debug("Closing files") for file in self._files: - log.info(f"Closing file: {file}") + log.debug(f"Closing file: {file}") file.close() return def visibleSensors(self) -> Set[str]: - """ - Return string ids of visible devices. + """Return string id set of visible devices. 
- Returns: - Set[str]: Set with device string ids + Returns + ------- + Set[str] + Set with visible device ids. """ - log.debug(self.devices) - return {id for id, device in self.devices.items()} + return {id for id in self.devices.keys()} def getSensors(self, deviceList: Set[str]) -> List[Sensor]: - """ - Gather devive objects based on a set of device ids. + """Gather device objects based on a set of device ids. - Args: - deviceList (Set[str]): Set containing devices ids + Parameters + ---------- + deviceList : Set[str] + Set of device ids. - Returns: - List[Device]: Device objects + Returns + ------- + List[Sensor] + Device objects. """ return [self.devices[deviceId] for deviceId in deviceList] - - -IntelRAPLBackend() diff --git a/perun/backend/nvml.py b/perun/backend/nvml.py index fd62f73..219ada7 100644 --- a/perun/backend/nvml.py +++ b/perun/backend/nvml.py @@ -1,39 +1,35 @@ """Nvidia Mangement Library Source definition.""" -from typing import Callable, Set +from typing import Callable, List, Set import numpy as np import pynvml from pynvml import NVMLError from perun import log +from perun.backend.backend import Backend from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit +from perun.data_model.sensor import DeviceType, Sensor +from perun.util import singleton -from ..data_model.sensor import DeviceType, Sensor -from .backend import Backend, backend - -@backend +@singleton class NVMLBackend(Backend): """NVMLSource class. 
Setups connection to NVML and creates relevant devices """ + id = "nvlm" name = "NVIDIA ML" description: str = "Access GPU information from NVML python bindings" - def __init__(self) -> None: - """Init NVIDIA ML Backend.""" - super().__init__() - log.info("Initialized NVML Backend") - def setup(self): """Init pynvml and gather number of devices.""" pynvml.nvmlInit() deviceCount = pynvml.nvmlDeviceGetCount() self.metadata = { - "cuda_version": pynvml.nvmlSystemGetCudaDriverVersion(), - "driver_version": pynvml.nvmlSystemGetDriverVersion(), + "cuda_version": str(pynvml.nvmlSystemGetCudaDriverVersion()), + "driver_version": str(pynvml.nvmlSystemGetDriverVersion()), "source": "Nvidia Managment Library", } @@ -44,11 +40,10 @@ def close(self): pynvml.nvmlShutdown() def visibleSensors(self) -> Set[str]: - """ - Return string ids of visible devices. + """Return string ids of visible devices. - Returns: - Set[str]: Set with device string ids + :return: Set with string ids. + :rtype: Set[str] """ devices = set() for i in range(pynvml.nvmlDeviceGetCount()): @@ -56,16 +51,15 @@ def visibleSensors(self) -> Set[str]: devices.add(pynvml.nvmlDeviceGetUUID(handle)) return devices - def getSensors(self, deviceList: Set[str]): - """ - Gather device objects based on a set of device ids. - - Args: - deviceList (Set[str]): Set containing devices ids + def getSensors(self, deviceList: Set[str]) -> List[Sensor]: + """Gather sensor objects based on a set of device ids. - Returns: - List[Device]: Device objects + :param deviceList: Set containing device ids. 
+ :type deviceList: Set[str] + :return: List with sensor objects + :rtype: List[Sensor] """ + pynvml.nvmlInit() def getCallback(handle) -> Callable[[], np.number]: def func() -> np.number: @@ -73,44 +67,71 @@ def func() -> np.number: return func + def getUsedMemCallback(handle) -> Callable[[], np.number]: + def func() -> np.number: + return np.uint64(pynvml.nvmlDeviceGetMemoryInfo(handle).used) + + return func + devices = [] + for deviceId in deviceList: try: + log.debug(f"Getting handle from '{deviceId}'") handle = pynvml.nvmlDeviceGetHandleByUUID(deviceId) index = pynvml.nvmlDeviceGetIndex(handle) + log.debug(f"Index: {index} - Handle : {handle}") name = f"CUDA:{index}" device_type = DeviceType.GPU device_metadata = { "uuid": deviceId, - "name": pynvml.nvmlDeviceGetName(handle), + "name": str(pynvml.nvmlDeviceGetName(handle)), **self.metadata, } - max_power = np.uint32(pynvml.nvmlDeviceGetPowerManagementLimit(handle)) + max_power = np.uint32( + pynvml.nvmlDeviceGetPowerManagementDefaultLimit(handle) + ) + log.debug(f"Device {deviceId} Max Power : {max_power}") data_type = MetricMetaData( Unit.WATT, Magnitude.MILI, np.dtype("uint32"), np.uint32(0), - np.uint32(max_power), + max_power, np.uint32(0), ) devices.append( Sensor( - name, + name + "_POWER", device_type, device_metadata, data_type, getCallback(handle), ) ) + max_memory = np.uint64(pynvml.nvmlDeviceGetMemoryInfo(handle).total) + data_type = MetricMetaData( + Unit.BYTE, + Magnitude.ONE, + np.dtype("uint64"), + np.uint64(0), + max_memory, + np.uint64(0), + ) + devices.append( + Sensor( + name + "_MEM", + device_type, + device_metadata, + data_type, + getUsedMemCallback(handle), + ) + ) + except NVMLError as e: - print(e) log.warning(f"Could not find device {deviceId}") log.warning(e) return devices - - -NVMLBackend() diff --git a/perun/backend/psutil.py b/perun/backend/psutil.py index f07702b..f82cf31 100644 --- a/perun/backend/psutil.py +++ b/perun/backend/psutil.py @@ -4,23 +4,23 @@ import numpy as np 
import psutil -from perun import log -from perun.backend.backend import Backend, backend +from perun.backend.backend import Backend from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit from perun.data_model.sensor import DeviceType, Sensor +from perun.util import singleton -@backend +@singleton class PSUTILBackend(Backend): """PSUTIL Backend class.""" + id: str = "psutil" name: str = "PSUTIL" description: str = "Obtain hardware data from psutil" def __init__(self) -> None: """Create psutil backend.""" super().__init__() - log.info("Init PSUTIL") def setup(self): """Configure psutil backend.""" @@ -34,7 +34,11 @@ def setup(self): self.devices[deviceName] = Sensor( deviceName, DeviceType.RAM, - {"total": mem.total, "available": mem.available, **self.metadata}, + { + "total": str(mem.total), + "available": str(mem.available), + **self.metadata, + }, MetricMetaData( Unit.PERCENT, Magnitude.ONE, @@ -82,14 +86,16 @@ def close(self): def visibleSensors(self) -> Set[str]: """Return list of visible devices.""" - return { + sensors = { "RAM_USAGE", "CPU_USAGE", - "DISK_READ_BYTES", - "DISK_WRITE_BYTES", "NET_WRITE_BYTES", "NET_READ_BYTES", } + if psutil.disk_io_counters(nowrap=True) is not None: + sensors.add("DISK_READ_BYTES") + sensors.add("DISK_WRITE_BYTES") + return sensors def _getCallback(self, device: str) -> Callable[[], np.number]: """Return measuring function for each device.""" @@ -106,12 +112,12 @@ def func() -> np.number: elif device == "DISK_READ_BYTES": def func() -> np.number: - return np.uint32(psutil.disk_io_counters(nowrap=True).read_bytes) + return np.uint32(psutil.disk_io_counters(nowrap=True).read_bytes) # type: ignore elif device == "DISK_WRITE_BYTES": def func() -> np.number: - return np.uint32(psutil.disk_io_counters(nowrap=True).write_bytes) + return np.uint32(psutil.disk_io_counters(nowrap=True).write_bytes) # type: ignore elif device == "NET_WRITE_BYTES": @@ -131,6 +137,3 @@ def func() -> np.number: def getSensors(self, 
deviceList: Set[str]) -> List[Sensor]: """Return desired device objects.""" return [self.devices[deviceName] for deviceName in deviceList] - - -PSUTILBackend() diff --git a/perun/backend/util.py b/perun/backend/util.py new file mode 100644 index 0000000..c3a1bf9 --- /dev/null +++ b/perun/backend/util.py @@ -0,0 +1,58 @@ +"""Backend util.""" +import platform +from typing import Any, Dict, Set + +from perun import log +from perun.backend.backend import Backend + + +def getHostMetadata() -> Dict[str, Any]: + """Return dictionary with the platform related metadata. + + Returns + ------- + Dict[str, Any] + Dictionary with host metadata. + """ + metadata = {} + for name, method in platform.__dict__.items(): + if callable(method): + try: + value = method() + if isinstance(value, tuple): + value = " ".join(value) + value = value.strip() + if value != "": + metadata[name] = value + except Exception as e: + log.debug(f"platform method {name} did not work") + log.debug(e) + + return metadata + + +def getBackendMetadata( + backends: Dict[str, Backend], backendConfig: Dict[str, Set[str]] +) -> Dict[str, Any]: + """Get backend related metadata dictionary based on the current sensor configuration. + + Parameters + ---------- + backends : Dict[str, Backend] + Dictionary with available backends. + backendConfig : Dict[str, Set[str]] + Sensor backend configuration to include in the metadata object. + + Returns + ------- + Dict[str, Any] + Backend metadata dictionary. 
+ """ + backend_metadata: Dict[str, Any] = {} + for backend in backends.values(): + if backend.name in backendConfig: + backend_metadata[backend.name] = {} + sensors = backend.getSensors(backendConfig[backend.name]) + for sensor in sensors: + backend_metadata[backend.name][sensor.id] = sensor.metadata + return backend_metadata diff --git a/perun/comm.py b/perun/comm.py index e4dbc50..85e0b54 100644 --- a/perun/comm.py +++ b/perun/comm.py @@ -24,31 +24,27 @@ def __init__(self): log.warning(e) def Get_rank(self) -> int: - """Return MPI rank. + """Get local MPI rank. - Returns: - int: MPI Rank + Returns + ------- + int + MPI Rank """ return self._comm.Get_rank() if self._enabled else self._rank def Get_size(self) -> int: - """Return MPI world size. + """MPI World size. - Returns: - int: MPI world size + Returns + ------- + int + World Size """ return self._comm.Get_size() if self._enabled else self._size def gather(self, obj: Any, root: int = 0) -> Optional[List[Any]]: - """MPI gather operation at selected rank. - - Args: - obj (Any): Object to be gathererd. - root (int, optional): Rank to gather information at. Defaults to 0. - - Returns: - Optional[List[Any]]: List with objects from all the ranks. 
+ """MPI Gather operation.""" return self._comm.gather(obj, root=root) if self._enabled else [obj] def allgather(self, obj: Any) -> List[Any]: diff --git a/perun/configuration.py b/perun/configuration.py index 705b6ae..c309ecf 100644 --- a/perun/configuration.py +++ b/perun/configuration.py @@ -13,25 +13,23 @@ "pue": 1.58, "emissions_factor": 417.80, # gCO2eq/kWh "price_factor": 32.51, # Cent/kWh + "price_unit": "€", }, "monitor": { - "sampling_rate": 5, + "sampling_rate": 1, }, "output": { "app_name": None, "run_id": None, - "format": "pickle", + "format": "text", "data_out": "./perun_results", - "depth": None, - "raw": True, }, "benchmarking": { - "bench_enable": False, - "bench_rounds": 10, - "bench_warmup_rounds": 1, + "rounds": 1, + "warmup_rounds": 0, # "bench_metrics": ["ENERGY", "RUNTIME"], }, - "debug": {"log_lvl": "ERROR"}, + "debug": {"log_lvl": "WARNING"}, # "horeka": {"enabled": False, "url": "", "token": "", "org": ""}, } @@ -42,19 +40,24 @@ if globalConfigPath.exists() and globalConfigPath.is_file(): config.read(globalConfigPath) +localConfigPath = Path.cwd() / ".perun.ini" +if localConfigPath.exists() and localConfigPath.is_file(): + config.read(localConfigPath) + def read_custom_config( ctx: Optional[click.Context], param: Optional[Union[click.Option, click.Parameter]], pathStr: str, ) -> None: - """ - Read an INI configuration file and overrides the values from the default and global configuration. + """Read an INI configuration file and overrides the values from the default and global configuration. 
- Args: - ctx (click.Context): Commandline context object (irrelevant) - param (Union[click.Option, click.Parameter]): Click cli object (irrelevant) - pathStr (str): String to configuration file + :param ctx: Command line context object (ignore) + :type ctx: Optional[click.Context] + :param param: Click CLI object (ignore) + :type param: Optional[Union[click.Option, click.Parameter]] + :param pathStr: String to configuration file (don't ignore) + :type pathStr: str """ configPath: Path = Path(pathStr) if configPath.exists() and configPath.is_file(): @@ -64,13 +67,14 @@ def read_custom_config( def save_to_config_callback( ctx: click.Context, param: Union[click.Option, click.Parameter], value: Any ): - """ - Override configuration with click cli options. + """Override configuration with click cli options. - Args: - ctx (click.Context): Click context - param (Union[click.Option, click.Parameter]): Click option/param object - value (Any): New configuration value + :param ctx: Click context (ignore) + :type ctx: click.Context + :param param: Click parameters/options + :type param: Union[click.Option, click.Parameter] + :param value: New configuration value + :type value: Any """ if value and isinstance(param, click.Option): key: Optional[str] = param.name @@ -79,12 +83,12 @@ def save_to_config_callback( def save_to_config(key: str, value: Any): - """ - Override indivial configuration values. + """Override individual configuration values. 
- Args: - key (str): Option name - value (Any): New option value + :param key: Option name + :type key: str + :param value: Option value + :type value: Any """ for section in config.sections(): if config.has_option(section, key): diff --git a/perun/coordination.py b/perun/coordination.py index cb5534a..6a49324 100644 --- a/perun/coordination.py +++ b/perun/coordination.py @@ -1,23 +1,27 @@ """Coordination module.""" -import platform -from typing import Dict, List, Optional, Set, Tuple +import pprint as pp +from typing import Dict, List, Set, Tuple -from perun import Comm, log -from perun.backend import Backend +from perun import log +from perun.backend.backend import Backend +from perun.comm import Comm -_cached_sensor_config: Optional[Tuple[Dict, List]] = None - -def getHostRankDict(comm: Comm) -> Dict[str, List[int]]: +def getHostRankDict(comm: Comm, hostname: str) -> Dict[str, List[int]]: """Return a dictionary with all the host names with each MPI rank in them. - Args: - comm (Comm): MPI Communicator - - Returns: - Dict[str, List[int]]: Dictionary with key hostnames and mpi ranks as values. + Parameters + ---------- + comm : Comm + MPI Communicator + hostname : str + Local rank Hostname + + Returns + ------- + Dict[str, List[int]] + Global host and mpi ranks dictionary. """ - hostname = platform.node() rank = comm.Get_rank() gHostRank: List[Tuple[str, int]] = comm.allgather((hostname, rank)) @@ -32,66 +36,51 @@ def getHostRankDict(comm: Comm) -> Dict[str, List[int]]: def getGlobalSensorRankConfiguration( - comm: Comm, backends: List[Backend] -) -> Tuple[Dict[str, List[int]], List[Dict[str, Set[str]]]]: + comm: Comm, backends: Dict[str, Backend], globalHostRanks: Dict[str, List[int]] +) -> List[Dict[str, Set[str]]]: """Gather available sensor information from every MPI rank and assign/unassign sensors to each rank to avoid over sampling. - Args: - comm (Comm): MPI Communicator - backends (List[Backend]): List of available backends in the current rank. 
- - Returns: - Tuple[Dict[str, List[int]], List[Dict[str, Set[str]]]]: Global rank and sensor configuration objects. + + Parameters + ---------- + comm : Comm + MPI Communicator + backends : Dict[str, Backend] + Backend dictionary + globalHostRanks : Dict[str, List[int]] + Mapping from host to MPI ranks + + Returns + ------- + List[Dict[str, Set[str]]] + List with appointed backend and sensors for each MPI rank. """ visibleSensorsByBackend: Dict[str, Set[str]] = { - backend.name: backend.visibleSensors() for backend in backends + backend.name: backend.visibleSensors() for backend in backends.values() } - log.debug(f"Rank {comm.Get_rank()} : Visible devices {visibleSensorsByBackend}") + log.debug( + f"Rank {comm.Get_rank()} : Visible devices = {pp.pformat(visibleSensorsByBackend)}" + ) globalVisibleSensorsByBackend = comm.allgather(visibleSensorsByBackend) - globalHostRanks = getHostRankDict(comm) - globalSensorConfig = assignSensors(globalVisibleSensorsByBackend, globalHostRanks) - return ( - globalHostRanks, - globalSensorConfig, - ) - - -def getLocalSensorRankConfiguration( - comm: Comm, backends: List[Backend] -) -> Tuple[List[int], Dict[str, Set[str]]]: - """Obtain local sensor configuration. - - Args: - comm (Comm): MPI Communicator - backends (List[Backend]): List of availbale backends in the current ranks. 
- - Returns: - Tuple[List[int], Dict[str, Set[str]]]: Local rank and sensor configuration - """ - global _cached_sensor_config - if _cached_sensor_config is None: - globalHostRanks, globalSensorConfig = getGlobalSensorRankConfiguration( - comm, backends - ) - _cached_sensor_config = (globalHostRanks, globalSensorConfig) - else: - globalHostRanks, globalSensorConfig = _cached_sensor_config - - return globalHostRanks[platform.node()], globalSensorConfig[comm.Get_rank()] + return globalSensorConfig def assignSensors( hostBackends: List[Dict[str, Set[str]]], hostNames: Dict[str, List[int]] ) -> List[Dict[str, Set[str]]]: - """Assign found devices to the lowest rank in each host. - - Args: - hostSensors (List[Set[str]]): List with lenght of the mpi world size, with each index containing the devices of each rank. - hostNames (List[str]): Hostname of the mpi rank at the index. - - Returns: - List[Set[str]]: New list with the devices assiged to each rank. + """Assign sensors to each MPI rank based on available backends and host to rank mapping. + + Parameters + ---------- + hostBackends : List[Dict[str, Set[str]]] + List with global backends + hostNames : Dict[str, List[int]] + Host to MPI Rank mapping + + Returns + ------- + List[Dict[str, Set[str]]] + List with appointed backend and sensors for each MPI rank. 
""" for host, ranks in hostNames.items(): firstRank = ranks[0] diff --git a/perun/data_model/data.py b/perun/data_model/data.py index 21d12f3..a8bcf21 100644 --- a/perun/data_model/data.py +++ b/perun/data_model/data.py @@ -1,6 +1,8 @@ """Storage Module.""" import dataclasses import enum +import time +from dataclasses import asdict from typing import Any, Dict, List, Optional, Union import numpy as np @@ -13,6 +15,7 @@ class NodeType(enum.Enum): """DataNode type enum.""" + APP = enum.auto() MULTI_RUN = enum.auto() RUN = enum.auto() NODE = enum.auto() @@ -25,14 +28,25 @@ class MetricType(str, enum.Enum): RUNTIME = "runtime" POWER = "power" + CPU_POWER = "cpu_power" + GPU_POWER = "gpu_power" + DRAM_POWER = "dram_power" + OTHER_POWER = "other_power" CPU_UTIL = "cpu_util" GPU_UTIL = "gpu_util" MEM_UTIL = "mem_util" + GPU_MEM = "gpu_mem" NET_READ = "net_read" NET_WRITE = "net_write" DISK_READ = "disk_read" DISK_WRITE = "disk_write" ENERGY = "energy" + CPU_ENERGY = "cpu_energy" + GPU_ENERGY = "gpu_energy" + DRAM_ENERGY = "dram_energy" + OTHER_ENERGY = "other_energy" + OTHER_MEM = "other_memory" + N_RUNS = "n_runs" class AggregateType(str, enum.Enum): @@ -63,6 +77,21 @@ def fromDict(cls, metricDict: Dict): AggregateType(metricDict["agg"]), ) + def copy(self): + """Create copy metric object. + + Returns + ------- + _type_ + Copy of object. + """ + return Metric( + MetricType(self.type.value), + self.value.copy(), + self.metric_md.copy(), + AggregateType(self.agg.value), + ) + @dataclasses.dataclass class Stats: @@ -70,6 +99,7 @@ class Stats: type: MetricType metric_md: MetricMetaData + sum: np.number mean: np.number std: np.number max: np.number @@ -77,7 +107,23 @@ class Stats: @classmethod def fromMetrics(cls, metrics: List[Metric]): - """Create a stats object based on the metric's values.""" + """Create stats object from list of metrics with the same type. + + Parameters + ---------- + metrics : List[Metric] + List of metrics with the same type. 
+ + Returns + ------- + _type_ + Stats object. + + Raises + ------ + Exception + If metrics are not from the same type. + """ type = metrics[0].type metric_md = metrics[0].metric_md @@ -87,18 +133,23 @@ def fromMetrics(cls, metrics: List[Metric]): raise Exception("Metrics type don't match. Invalid Stats") values = np.array([metric.value for metric in metrics]) + sum = values.sum() mean = values.mean() std = values.std() max = values.max() min = values.min() - return cls(type, metric_md, mean, std, max, min) + return cls(type, metric_md, sum, mean, std, max, min) @property def value(self): - """ - Value property. + """Value property (mean). For compatibility with Metric dataclass. + + Returns + ------- + _type_ + Return the mean value of the stats object. """ return self.mean @@ -108,6 +159,7 @@ def fromDict(cls, statsDict: Dict): return cls( MetricType(statsDict["type"]), MetricMetaData.fromDict(statsDict["metric_md"]), + statsDict["min"], statsDict["mean"], statsDict["std"], statsDict["max"], @@ -126,7 +178,18 @@ class RawData: @classmethod def fromDict(cls, rawDataDict: Dict): - """Create RawData object from a dictionary.""" + """Create RawData object from a dictionary. + + Parameters + ---------- + rawDataDict : Dict + Dictionary with same keys as RawData object. + + Returns + ------- + _type_ + RawData object. + """ t_md = MetricMetaData.fromDict(rawDataDict["t_md"]) v_md = MetricMetaData.fromDict(rawDataDict["v_md"]) return cls( @@ -137,6 +200,95 @@ def fromDict(cls, rawDataDict: Dict): ) +class LocalRegions: + """Stores local region data while an application is being monitored.""" + + def __init__(self) -> None: + self._regions: Dict[str, List[int]] = {} + + def addEvent(self, region_name: str) -> None: + """Mark a new event for the named region. + + Parameters + ---------- + region_name : str + Region to mark the event from. 
+ """ + if region_name not in self._regions: + self._regions[region_name] = [] + + self._regions[region_name].append(time.time_ns()) + + +@dataclasses.dataclass +class Region: + """Stores region data from all MPI ranks. + + For each marked region (decorated function), an numpy array with timestamps indicating function starts and ends. + """ + + id: str = "" + world_size: int = 0 + raw_data: Dict[int, np.ndarray] = dataclasses.field(default_factory=dict) + runs_per_rank: Optional[Stats] = None + runtime: Optional[Stats] = None + power: Optional[Stats] = None + cpu_util: Optional[Stats] = None + gpu_util: Optional[Stats] = None + processed: bool = False + + def toDict(self) -> Dict[str, Any]: + """Convert regions to a python dictionary. + + Returns + ------- + Dict[str, Dict[int, np.ndarray]] + Dictionary with region data. + """ + result = { + "id": self.id, + "world_size": self.world_size, + "raw_data": self.raw_data, + } + + result["runs_per_rank"] = ( + asdict(self.runs_per_rank) if self.runs_per_rank else None + ) + result["runtime"] = asdict(self.runtime) if self.runtime else None + result["power"] = asdict(self.power) if self.power else None + result["cpu_util"] = asdict(self.cpu_util) if self.cpu_util else None + result["gpu_util"] = asdict(self.gpu_util) if self.gpu_util else None + + return result + + @classmethod + def fromDict(cls, regionDictionary: Dict[str, Any]): + """Create Regions object from a dictionary. + + Parameters + ---------- + regions : Dict[str, Dict[int, np.ndarray]] + Regions dictionary. + + Returns + ------- + Regions + Regions object. 
+ """ + regionObj = Region() + regionObj.id = regionDictionary["id"] + regionObj.world_size = regionDictionary["world_size"] + regionObj.raw_data = regionDictionary["raw_data"] + regionObj.processed = regionDictionary["processed"] + if regionObj.processed: + regionObj.runs_per_rank = Stats.fromDict(regionDictionary["runs_per_rank"]) + regionObj.runtime = Stats.fromDict(regionDictionary["runtime"]) + regionObj.power = Stats.fromDict(regionDictionary["power"]) + regionObj.cpu_util = Stats.fromDict(regionDictionary["cpu_util"]) + regionObj.gpu_util = Stats.fromDict(regionDictionary["gpu_util"]) + return regionObj + + class DataNode: """Recursive data structure that contains all the information of a monitored application.""" @@ -149,16 +301,29 @@ def __init__( metrics: Optional[Dict[MetricType, Union[Metric, Stats]]] = None, deviceType: Optional[DeviceType] = None, raw_data: Optional[RawData] = None, + regions: Optional[Dict[str, Region]] = None, processed: bool = False, ) -> None: - """DataNode. - - Args: - id (str): String identifier - type (NodeType): Type of Node - metadata (Dict): Metadata - nodes (Dict[str, Self], optional): Child DataNodes. Defaults to {}. - raw_data (Optional[RawData], optional): If sensor, contains raw sensor values. Defaults to None. + """Perun DataNode. + + Parameters + ---------- + id : str + Node id. + type : NodeType + Node type. + metadata : Dict + Node metadata. 
+ nodes : Optional[Dict[str, Any]], optional + Children nodes, by default None + metrics : Optional[Dict[MetricType, Union[Metric, Stats]]], optional + Node metrics, by default None + deviceType : Optional[DeviceType], optional + Node device type, only relevant for leaf nodes, by default None + raw_data : Optional[RawData], optional + Raw data object, only relevant for leaf nodes, by default None + processed : bool, optional + Marks if the node has been processed, by default False """ self.id = id self.type = type @@ -169,11 +334,36 @@ def __init__( ) self.deviceType: Optional[DeviceType] = deviceType self.raw_data: Optional[RawData] = raw_data + self.regions: Optional[Dict[str, Region]] = regions self.processed = processed - def toDict( - self, depth: Optional[int] = None, include_raw_data: bool = False - ) -> Dict: + def addRegionData(self, localRegions: List[LocalRegions], start_time: int): + """Add region information to the data node. + + Parameters + ---------- + localRegions : List[LocalRegions] + Gathered local regions from all MPI ranks + start_time : int + 'Official' start time of the run. 
+ """ + self.regions = {} + world_size = len(localRegions) + for rank, l_region in enumerate(localRegions): + for region_name, data in l_region._regions.items(): + if region_name not in self.regions: + r = Region() + r.id = region_name + r.world_size = world_size + self.regions[region_name] = r + + t_s = np.array(data) + t_s -= start_time + t_s = t_s.astype("float32") + t_s *= 1e-9 + self.regions[region_name].raw_data[rank] = t_s + + def toDict(self, include_raw_data: bool = True) -> Dict: """Transform object to dictionary.""" resultsDict = { "id": self.id, @@ -183,23 +373,18 @@ def toDict( type.value: dataclasses.asdict(metric) for type, metric in self.metrics.items() }, + "regions": { + region_name: region.toDict() + for region_name, region in self.regions.items() + } + if self.regions + else None, "deviceType": self.deviceType, "processed": self.processed, } - if depth is None: - resultsDict["nodes"] = ( - { - key: value.toDict(depth, include_raw_data) - for key, value in self.nodes.items() - }, - ) - elif depth > 1: - resultsDict["nodes"] = ( - { - key: value.toDict(depth - 1, include_raw_data) - for key, value in self.nodes.items() - }, - ) + resultsDict["nodes"] = ( + {key: value.toDict(include_raw_data) for key, value in self.nodes.items()}, + ) if include_raw_data and self.raw_data: resultsDict["raw_data"] = dataclasses.asdict(self.raw_data) @@ -208,7 +393,18 @@ def toDict( @classmethod def fromDict(cls, resultsDict: Dict): - """Build object from dictionary.""" + """Create dataNode from python dictionary. + + Parameters + ---------- + resultsDict : Dict + Dictionary with data node attributes. + + Returns + ------- + _type_ + DataNode object. 
+ """ type = NodeType(resultsDict["type"]) newResults = cls( id=resultsDict["id"], @@ -237,4 +433,10 @@ def fromDict(cls, resultsDict: Dict): if "raw_data" in resultsDict: newResults.raw_data = RawData.fromDict(resultsDict["raw_data"]) + if "regions" in resultsDict: + newResults.regions = { + region_name: Region.fromDict(region_dict) + for region_name, region_dict in resultsDict["regions"].items() + } + return newResults diff --git a/perun/data_model/measurement_type.py b/perun/data_model/measurement_type.py index 2925cd1..36c5898 100644 --- a/perun/data_model/measurement_type.py +++ b/perun/data_model/measurement_type.py @@ -14,13 +14,16 @@ class Unit(str, enum.Enum): BYTE = "B" SECOND = "s" PERCENT = "%" + SCALAR = "" @property def symbol(self) -> str: """Symbol associated with Unit. - Returns: - str: String with unit symbol. + Returns + ------- + str + Unit symbol string. """ return self.value @@ -46,8 +49,10 @@ class Magnitude(float, enum.Enum): def symbol(self) -> str: """Symbol associated with magnitude prefix. - Returns: - str: String with magnitude symbol. + Returns + ------- + str + String symbol """ _symbols: Dict = { "PICO": "p", @@ -90,3 +95,20 @@ def fromDict(cls, mdDict: Dict): dtype.type(mdDict["max"], dtype=dtype), dtype.type(mdDict["fill"], dtype=dtype), ) + + def copy(self): + """Copy MetricMetaData object. + + Returns + ------- + _type_ + Copy of object. 
+ """ + return MetricMetaData( + Unit(self.unit.value), + Magnitude(self.mag.value), + self.dtype, + self.min.copy(), + self.max.copy(), + self.fill.copy(), + ) diff --git a/perun/decorator.py b/perun/decorator.py deleted file mode 100644 index 523c5a2..0000000 --- a/perun/decorator.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Decorator module.""" - -import functools - -from perun import config, log -from perun.configuration import read_custom_config, read_environ, save_to_config -from perun.perun import monitor_application - - -def monitor( - configuration: str = "./.perun.ini", - **conf_kwargs, -): - """Decorate function to monitor its energy usage.""" - - def inner_function(func): - @functools.wraps(func) - def func_wrapper(*args, **kwargs): - # Get custom config and kwargs - read_custom_config(None, None, configuration) - for key, value in conf_kwargs.items(): - save_to_config(key, value) - - read_environ() - log.setLevel(f"{config.get('debug', 'log_lvl')}") - - func_result = monitor_application(func, args, kwargs) - - return func_result - - return func_wrapper - - return inner_function diff --git a/perun/extras/__init__.py b/perun/extras/__init__.py deleted file mode 100644 index 917943a..0000000 --- a/perun/extras/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Extras module.""" diff --git a/perun/extras/horeka.py b/perun/extras/horeka.py deleted file mode 100644 index 00cf9ad..0000000 --- a/perun/extras/horeka.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Connection to HoreKa hardware measurements.""" -import platform -from datetime import datetime -from pathlib import Path -from typing import List, Union - -import pandas as pd -from influxdb_client import InfluxDBClient - -from perun import config -from perun.comm import Comm - -query = """from(bucket: "hk-collector") -|> range(start: _start, stop: _stop) -|> filter(fn: (r) => r["hostname"] == _node) -|> filter(fn: (r) => r["_measurement"] == "consumed_watts" or r["_measurement"] == "nv_power_usage") -|> pivot(rowKey: 
["_time"], columnKey: ["_measurement"], valueColumn: "_value")""" - - -def get_horeka_measurements( - comm: Comm, outdir: Path, expName: str, expId: int, start: datetime, stop: datetime -): - """ - Read hardware data from an Influx Database. - - Args: - comm (Comm): MPI Communication Object - outdir (Path): Result location - expName (str): Experiment config.get("horeka", "org") - expId (int): Experiment Id - start (datetime): Experiment start time - stop (datetime): Experiment end time - """ - URL = config.get("horeka", "url") - TOKEN = config.get("horeka", "token") - ORG = config.get("horeka", "org") - nodename = platform.node().replace(".localdomain", "") - - idb = InfluxDBClient(url=URL, token=TOKEN, org=ORG) - - now = datetime.now() - p = {"_start": start - now, "_stop": stop - now, "_node": nodename} - - outpath = outdir / "horeka" / expName / str(expId) - - if comm.Get_rank() == 0: - if not outpath.exists(): - outpath.mkdir(parents=True) - - dfList: Union[List[pd.DataFrame], pd.DataFrame] = idb.query_api().query_data_frame( - query=query, params=p - ) - - if isinstance(dfList, List): - for index, df in enumerate(dfList): - df.to_csv(outpath / f"{nodename}_{index}.csv") - else: - dfList.to_csv(outpath / f"{nodename}.csv") diff --git a/perun/io/bench.py b/perun/io/bench.py index 86d5916..110cd91 100644 --- a/perun/io/bench.py +++ b/perun/io/bench.py @@ -1,33 +1,44 @@ """Bench io module.""" import json +import numpy as np + +from perun import log from perun.data_model.data import DataNode, MetricType, Stats -from perun.util import value2str +from perun.io.util import getTFactorMag lessIsBetterMetrics = [MetricType.RUNTIME, MetricType.ENERGY] -def exportBench(dataNode: DataNode) -> str: +def exportBench(dataNode: DataNode, mr_id: str) -> str: """Export data node to json format based on the github continuous benchmark action. 
https://github.com/benchmark-action/github-action-benchmark - Args: - dataNode (DataNode): Perun results + Parameters + ---------- + dataNode : DataNode + Data Node + mr_id : str + MULTI_RUN node to get data from. - Returns: - str: Output string + Returns + ------- + str + Json string with benchmark data. """ metricDict = [] - for metricType, metric in dataNode.metrics.items(): + mrNode = dataNode.nodes[mr_id] + + for metricType, metric in mrNode.metrics.items(): if metricType in lessIsBetterMetrics: metric_md = metric.metric_md - value, tfactor, mag = value2str(metric.value, metric_md) + tfactor, mag = getTFactorMag(metric.value, metric_md) if isinstance(metric, Stats): metricDict.append( { - "name": f"{dataNode.metadata['app_name']}_{dataNode.id} - {metricType.name}", + "name": f"{dataNode.id}_{mrNode.id} - {metricType.name}", "unit": f"{mag.symbol}{metric_md.unit.symbol}", "value": metric.mean / tfactor, "range": metric.std / tfactor, @@ -36,10 +47,81 @@ def exportBench(dataNode: DataNode) -> str: else: metricDict.append( { - "name": f"{dataNode.metadata['app_name']}_{dataNode.id} - {metricType.name}", + "name": f"{dataNode.id}_{mrNode.id} - {metricType.name}", "unit": f"{mag.symbol}{metric_md.unit.symbol}", "value": metric.value / tfactor, } ) + region_data = {} + if len(mrNode.nodes) > 1: + log.warning( + "When generating benchmarks for regions, it is preferable to if each function only runs a single time." 
+ ) + + for runNode in mrNode.nodes.values(): + if runNode.regions: + for region_name, region in runNode.regions.items(): + if region_name not in region_data: + region_data[region_name] = { + MetricType.RUNTIME.name: ( + [region.runtime.mean], + region.runtime.metric_md, + ), + MetricType.POWER.name: ( + [region.power.mean], + region.power.metric_md, + ), + MetricType.CPU_UTIL.name: ( + [region.cpu_util.mean], + region.cpu_util.metric_md, + ), + MetricType.GPU_UTIL.name: ( + [region.gpu_util.mean], + region.gpu_util.metric_md, + ), + } + else: + region_data[region_name][MetricType.RUNTIME.name][0].append( + region.runtime.mean + ) + region_data[region_name][MetricType.POWER.name][0].append( + region.power.mean + ) + region_data[region_name][MetricType.CPU_UTIL.name][0].append( + region.cpu_util.mean + ) + region_data[region_name][MetricType.GPU_UTIL.name][0].append( + region.gpu_util.mean + ) + + for region_name, region in region_data.items(): + for metric_name, data in region.items(): + values = data[0] + metadata = data[1] + if len(values) > 1: + values = data[0] + metadata = data[1] + mean = np.mean(values) + std = np.std(values) + tfactor, mag = getTFactorMag(mean, metadata) + metricDict.append( + { + "name": f"{region_name}_{mr_id} - {metric_name}", + "unit": f"{mag.symbol}{metadata.unit.symbol}", + "value": mean / tfactor, + "range": std / tfactor, + } + ) + else: + value = values[0] + tfactor, mag = getTFactorMag(value, metadata) + metricDict.append( + { + "name": f"{region_name}_{mr_id} - {metric_name}", + "unit": f"{mag.symbol}{metadata.unit.symbol}", + "value": value / tfactor, + } + ) + return json.dumps(metricDict, indent=4) diff --git a/perun/io/hdf5.py b/perun/io/hdf5.py index 8a1b978..dc44063 100644 --- a/perun/io/hdf5.py +++ b/perun/io/hdf5.py @@ -1,6 +1,6 @@ """HDF5 IO module.""" from pathlib import Path -from typing import Union +from typing import Dict, Union import h5py import numpy as np @@ -13,6 +13,7 @@ MetricType, NodeType, RawData, + 
Region, Stats, ) from perun.data_model.measurement_type import Magnitude, Unit @@ -22,26 +23,35 @@ def exportHDF5(filePath: Path, dataNode: DataNode): """Export perun data nodes to an HDF5 file. - Args: - filePath (Path): Output path. - dataNode (DataNode): DataNode. + Parameters + ---------- + filePath : Path + Output path + dataNode : DataNode + Root of data node tree. """ - h5_file = h5py.File(filePath, "a") + h5_file = h5py.File(filePath, "w") _addNode(h5_file, dataNode) h5_file.close() def importHDF5(filePath: Path) -> DataNode: - """Import HDF5 created by perun. - - Args: - filePath (Path): File path. - - Raises: - ValueError: Incompatible hdf5 file. - - Returns: - DataNode: Recoverd perun DataNode + """Import DataNode from HDF5 format. + + Parameters + ---------- + filePath : Path + HDF5 file path. + + Returns + ------- + DataNode + Perun data node. + + Raises + ------ + ValueError + Incompatible HDF5 file. """ h5_file = h5py.File(filePath, "r") rootEntries = list(h5_file.keys()) @@ -60,6 +70,7 @@ def _addNode(h5group: h5py.Group, dataNode: DataNode): """Write node into hdf5 file.""" group = h5group.create_group(dataNode.id) group.attrs["type"] = dataNode.type.value + for key, value in dataNode.metadata.items(): group.attrs[key] = value @@ -67,16 +78,19 @@ def _addNode(h5group: h5py.Group, dataNode: DataNode): group.attrs["device_type"] = dataNode.deviceType.value metricGroup = group.create_group("metrics") - for metricId, metric in dataNode.metrics.items(): + for metric in dataNode.metrics.values(): _addMetric(metricGroup, metric) nodesGroup = group.create_group("nodes") - for nodeId, node in dataNode.nodes.items(): + for node in dataNode.nodes.values(): _addNode(nodesGroup, node) if dataNode.raw_data is not None: _addRawData(group, dataNode.raw_data) + if dataNode.regions is not None: + _addRegions(group, dataNode.regions) + def _readNode(group: h5py.Group) -> DataNode: """Read node from hdf5 file.""" @@ -101,6 +115,7 @@ def _readNode(group: h5py.Group) -> 
DataNode: metrics[metric.type] = metric raw_data = _readRawData(group["raw_data"]) if "raw_data" in group else None # type: ignore + regions = _readRegions(group["regions"]) if "regions" in group else None # type: ignore return DataNode( id=id, @@ -110,6 +125,7 @@ def _readNode(group: h5py.Group) -> DataNode: metrics=metrics, deviceType=device_type, raw_data=raw_data, + regions=regions, processed=True, ) @@ -126,6 +142,7 @@ def _addMetric(h5Group: h5py.Group, metric: Union[Metric, Stats]): metricGroup.attrs["agg_type"] = metric.agg.value metricGroup.attrs.create("value", metric.value, dtype=metadata.dtype) else: + metricGroup.attrs.create("sum", metric.sum, dtype=metadata.dtype) metricGroup.attrs.create("mean", metric.mean, dtype=metadata.dtype) metricGroup.attrs.create("min", metric.min, dtype=metadata.dtype) metricGroup.attrs.create("max", metric.max, dtype=metadata.dtype) @@ -146,6 +163,7 @@ def _readMetric(group: h5py.Group) -> Union[Metric, Stats]: return Stats( type=MetricType(group.attrs["type"]), metric_md=metric_md, + sum=group.attrs["sum"], # type: ignore mean=group.attrs["mean"], # type: ignore std=group.attrs["std"], # type: ignore min=group.attrs["min"], # type: ignore @@ -201,3 +219,51 @@ def _readRawData(group: h5py.Group) -> RawData: t_md=t_md, v_md=v_md, ) + + +def _addRegions(h5Group: h5py.Group, regions: Dict[str, Region]): + regions_group: h5py.Group = h5Group.create_group("regions") + for region in regions.values(): + _addRegion(regions_group, region) + + +def _addRegion(h5Group: h5py.Group, region: Region): + region_group = h5Group.create_group(region.id) + _addMetric(region_group, region.cpu_util) # type: ignore + _addMetric(region_group, region.gpu_util) # type: ignore + _addMetric(region_group, region.power) # type: ignore + _addMetric(region_group, region.runs_per_rank) # type: ignore + _addMetric(region_group, region.runtime) # type: ignore + region_group.attrs["id"] = region.id + region_group.attrs["processed"] = region.processed + 
region_group.attrs["world_size"] = region.world_size + raw_data_group = region_group.create_group("raw_data") + for rank, data in region.raw_data.items(): + raw_data_group.create_dataset(str(rank), data=data) + + +def _readRegions(group: h5py.Group) -> Dict[str, Region]: + regionsDict: Dict[str, Region] = {} + for key, region_group in group.items(): + regionsDict[key] = _readRegion(region_group) + return regionsDict + + +def _readRegion(group: h5py.Group) -> Region: + regionObj = Region() + regionObj.id = group.attrs["id"] + regionObj.processed = group.attrs["processed"] + regionObj.world_size = group.attrs["world_size"] + + regionObj.cpu_util = _readMetric(group["CPU_UTIL"]) # type: ignore + regionObj.gpu_util = _readMetric(group["GPU_UTIL"]) # type: ignore + regionObj.power = _readMetric(group["POWER"]) # type: ignore + regionObj.runtime = _readMetric(group["RUNTIME"]) # type: ignore + regionObj.runs_per_rank = _readMetric(group["N_RUNS"]) # type: ignore + + raw_data_group = group["raw_data"] + regionObj.raw_data = {} + for key, data in raw_data_group.items(): + regionObj.raw_data[int(key)] = data[:] + + return regionObj diff --git a/perun/io/io.py b/perun/io/io.py index 2e5a6b6..af9ae11 100644 --- a/perun/io/io.py +++ b/perun/io/io.py @@ -1,11 +1,12 @@ """IO Module.""" import enum +from datetime import datetime from pathlib import Path from typing import Optional, Union from perun import log -from perun.data_model.data import DataNode, NodeType +from perun.data_model.data import DataNode from perun.io.bench import exportBench from perun.io.hdf5 import exportHDF5, importHDF5 from perun.io.json import exportJson, importJson @@ -48,82 +49,106 @@ def fromSuffix(cls, suffix: str): def exportTo( - data_out: Path, - dataNode: DataNode, - format: IOFormat, - rawData: bool = False, - depth: Optional[int] = None, + output_path: Path, dataNode: DataNode, format: IOFormat, mr_id: Optional[str] = None ): """Export DataNode structure to the selected format. 
- Args: - dataNode (DataNode): DataNode tree with processed metrics. - format (IOFormat, optional): Output format. Defaults to IOFormat.TEXT. - rawData (bool, optional): If raw data should be included. Limits available formats. Defaults to False. + :param data_out: Output path + :type data_out: Path + :param dataNode: DataNode tree with processed metrics + :type dataNode: DataNode + :param format: Output format. + :type format: IOFormat """ if not dataNode.processed: log.warning("Data has not been processed before import. Proceed with caution.") raise Exception("DataNode needs to be processed before it can be exported.") - if not data_out.exists(): - log.info(f"{data_out} does not exists. So lets make it.") - data_out.mkdir() - - if format == IOFormat.BENCH and dataNode.type != NodeType.MULTI_RUN: - log.warning( - "BENCH format can only be used with 'bench' mode enabled. Using pickle instead." - ) - format = IOFormat.PICKLE - - filename = f"{dataNode.metadata['app_name']}_{dataNode.id}" - output_path: Path = data_out / filename - - existing_files = [path for path in output_path.parent.glob(f"{filename}*")] - if len(existing_files) > 0: - log.warning(f"File {output_path} already exists and will.") - idx = len(existing_files) - filename += f"_{idx}" - dataNode.id += f"_{idx}" + if not output_path.exists(): + log.info(f"{output_path.parent} does not exists. 
So lets make it.") + output_path.mkdir() + + if not mr_id and ( + format == IOFormat.BENCH or format == IOFormat.TEXT or format == IOFormat.CSV + ): + log.warning("No run ID provided, using last executed run to generate output") + last_dt = datetime.min + for node in dataNode.nodes.values(): + exec_dt = datetime.fromisoformat(node.metadata["execution_dt"]) + if exec_dt > last_dt: + last_dt = exec_dt + mr_id = node.id reportStr: Union[str, bytes] if format == IOFormat.JSON: - filename += ".json" fileType = "w" - reportStr = exportJson(dataNode, depth, rawData) - with open(data_out / filename, fileType) as file: + output_path = output_path / f"{dataNode.id}.{format.suffix}" + + if output_path.exists() and output_path.is_file(): + log.warn(f"Overwriting existing file {output_path}") + + reportStr = exportJson(dataNode) + with open(output_path, fileType) as file: file.write(reportStr) + elif format == IOFormat.HDF5: - filename += ".hdf5" - exportHDF5(data_out / filename, dataNode) + output_path = output_path / f"{dataNode.id}.{format.suffix}" + if output_path.exists() and output_path.is_file(): + log.warn(f"Overwriting existing file {output_path}") + + exportHDF5(output_path, dataNode) + elif format == IOFormat.PICKLE: - filename += ".pkl" fileType = "wb" + + output_path = output_path / f"{dataNode.id}.{format.suffix}" + + if output_path.exists() and output_path.is_file(): + log.warn(f"Overwriting existing file {output_path}") + reportStr = exportPickle(dataNode) - with open(data_out / filename, fileType) as file: + with open(output_path, fileType) as file: file.write(reportStr) + elif format == IOFormat.CSV: - filename += ".cvs" - exportCSV(data_out / filename, dataNode) + output_path = output_path / f"{dataNode.id}_{mr_id}.{format.suffix}" + + if output_path.exists() and output_path.is_file(): + log.warn(f"Overwriting existing file {output_path}") + + exportCSV(output_path, dataNode, mr_id) # type: ignore elif format == IOFormat.BENCH: - filename += ".json" fileType 
= "w" - reportStr = exportBench(dataNode) - with open(data_out / filename, fileType) as file: + output_path = output_path / f"{dataNode.id}_{mr_id}.{format.suffix}" + + if output_path.exists() and output_path.is_file(): + log.warn(f"Overwriting existing file {output_path}") + + reportStr = exportBench(dataNode, mr_id) # type: ignore + with open(output_path, fileType) as file: file.write(reportStr) - else: - filename += ".txt" + + elif format == IOFormat.TEXT: fileType = "w" - reportStr = textReport(dataNode) - with open(data_out / filename, fileType) as file: + output_path = output_path / f"{dataNode.id}_{mr_id}.{format.suffix}" + + if output_path.exists() and output_path.is_file(): + log.warn(f"Overwriting existing file {output_path}") + + reportStr = textReport(dataNode, mr_id) # type: ignore + with open(output_path, fileType) as file: file.write(reportStr) def importFrom(filePath: Path, format: IOFormat) -> DataNode: - """Import DataNode structure from path. If no format is given, it is inferred form path sufix. - - Args: - filePath (Path): Path to file - format (Optional[IOFormat], optional): File format. Defaults to None. + """Import DataNode structure from path. If no format is given, it is inferred from the file suffix. + + :param filePath: Path to file + :type filePath: Path + :param format: File format + :type format: IOFormat + :return: Perun DataNode structure + :rtype: DataNode """ if format == IOFormat.JSON: with open(filePath, "r") as file: diff --git a/perun/io/json.py b/perun/io/json.py index e030cc9..6792001 100644 --- a/perun/io/json.py +++ b/perun/io/json.py @@ -1,6 +1,5 @@ """IO Json module.""" import json -from typing import Optional import numpy as np @@ -11,10 +10,12 @@ class NumpyEncoder(json.JSONEncoder): """Json Numpy object encoder.""" def default(self, obj): - """Transform numpy objects. + """Encode obj to json or to a supported format. - Args: - obj (_type_): Numpy object + :param obj: Object to encode. 
+ :type obj: _type_ + :return: Encoded obj. + :rtype: _type_ """ if isinstance(obj, np.integer): return int(obj) @@ -28,21 +29,16 @@ def default(self, obj): return super(NumpyEncoder, self).default(obj) -def exportJson( - dataNode: DataNode, depth: Optional[int] = None, include_raw: bool = False -) -> str: +def exportJson(dataNode: DataNode) -> str: """Export DataNode to json. - Args: - dataNode (DataNode): DataNode - depth (Optional[int], optional): If specified, export only the first 'depth' levels of the DataNode tree. Defaults to None. - include_raw (bool, optional): If raw data should be included. Defaults to False. - - Returns: - str: json string from dataNode + :param dataNode: DataNode + :type dataNode: DataNode + :return: Json string of data node. + :rtype: str """ - dataDict = dataNode.toDict(depth, include_raw) - return json.dumps(dataDict, indent=4, cls=NumpyEncoder) + dataDict = dataNode.toDict(True) + return json.dumps(dataDict, cls=NumpyEncoder) def importJson(jsonString: str) -> DataNode: diff --git a/perun/io/pandas.py b/perun/io/pandas.py index 461130e..08f0d32 100644 --- a/perun/io/pandas.py +++ b/perun/io/pandas.py @@ -4,32 +4,23 @@ import pandas as pd -from perun.data_model.data import DataNode, NodeType +from perun.data_model.data import DataNode -def exportCSV(outputPath: Path, dataNode: DataNode): - """Export all raw data collected to a csv file using pandas. +def exportCSV(outputPath: Path, dataNode: DataNode, mr_id: str): + """Export data node to csv format. - Args: - outputPath (Path): Output path - dataNode (DataNode): Perun data node with raw data. - """ - df = _dataNode2Pandas(dataNode) - df.to_csv(outputPath) - - -def _dataNode2Pandas(dataNode: DataNode) -> pd.DataFrame: - """Create a pandas dataframe for Data Node raw data. - - Args: - dataNode (DataNode): Perun Data Node - - Returns: - pd.DataFrame: DataFrame + Parameters + ---------- + outputPath : Path + Path to export data to. + dataNode : DataNode + Perun data node. 
+ mr_id : str + Id of Multi_run node to get data from """ columns = [ - "app_name", - "run_id", + "run id", "hostname", "device_group", "sensor", @@ -40,30 +31,28 @@ def _dataNode2Pandas(dataNode: DataNode) -> pd.DataFrame: ] rows = [] - exp_name = dataNode.metadata["app_name"] - if dataNode.type == NodeType.MULTI_RUN: - for id, node in dataNode.nodes.items(): - rows.extend(_rowsFromRunNode(node, exp_name, id)) + mrNode = dataNode.nodes[mr_id] + for run_n, runNode in mrNode.nodes.items(): + rows.extend(_rowsFromRunNode(runNode, run_n)) - else: - id = dataNode.id - rows = _rowsFromRunNode(dataNode, exp_name, id) - - return pd.DataFrame(rows, columns=columns) + df = pd.DataFrame(rows, columns=columns) + df.to_csv(outputPath) -def _rowsFromRunNode( - runNode: DataNode, app_name: str, run_id: str -) -> List[Tuple[Any, ...]]: - """Read raw data from data nodes and arrange it as rows. +def _rowsFromRunNode(runNode: DataNode, run_n: int) -> List[Tuple[Any, ...]]: + """Create table rows from a RUN type data node. - Args: - runNode (DataNode): Data node. - app_name (str): Application name. - run_id (str): Run id. + Parameters + ---------- + runNode : DataNode + RUN type node + run_n : int + Id number of data node - Returns: - List[Tuple[Any]]: Rows with raw data. + Returns + ------- + List[Tuple[Any, ...]] + List of tuples with table entries. """ rows: List[Tuple[Any, ...]] = [] for hostId, hostNode in runNode.nodes.items(): @@ -76,8 +65,7 @@ def _rowsFromRunNode( for i in range(len(rawData.timesteps)): rows.append( ( - app_name, - run_id, + run_n, hostId, deviceGroupId, sensorId, diff --git a/perun/io/pickle.py b/perun/io/pickle.py index 3949ea2..c79add1 100644 --- a/perun/io/pickle.py +++ b/perun/io/pickle.py @@ -5,24 +5,32 @@ def exportPickle(dataNode: DataNode) -> bytes: - """Pickle data node. + """Export data node to pickle file. - Args: - dataNode (DataNode): DataNode to be pickled. 
+ Parameters + ---------- + dataNode : DataNode + Data Node - Returns: - bytes: Pickled DataNode + Returns + ------- + bytes + Binary data to write to file. """ return pickle.dumps(dataNode) def importPickle(pickleData: bytes) -> DataNode: - """Unpickle DataNode. + """Import DataNode from pickled data file. - Args: - pickleData (bytes): Pickled DataNode + Parameters + ---------- + pickleData : bytes + Binary Data - Returns: - DataNode: Unpickled DataNode + Returns + ------- + DataNode + DataNode """ return pickle.loads(pickleData) diff --git a/perun/io/text_report.py b/perun/io/text_report.py index ad39fe4..b29474a 100644 --- a/perun/io/text_report.py +++ b/perun/io/text_report.py @@ -1,101 +1,118 @@ """Text report module.""" - +import pandas as pd from perun import config, log -from perun.data_model.data import DataNode, MetricType, NodeType, Stats -from perun.util import value2str +from perun.data_model.data import DataNode, MetricType +from perun.io.util import value2MeanStdStr, value2ValueUnitStr +tableMetrics = [ + MetricType.RUNTIME, + MetricType.ENERGY, + MetricType.CPU_POWER, + MetricType.CPU_UTIL, + MetricType.GPU_POWER, + MetricType.GPU_MEM, + MetricType.DRAM_POWER, + MetricType.MEM_UTIL, +] -def textReport(dataNode: DataNode) -> str: - """ - Create text report from based on root node of the DataNode tree. - Args: - dataNode: Root of DataNode tree. +def textReport(dataNode: DataNode, mr_id: str) -> str: + """Create text report from selected MULTI_RUN node. - Returns: - str: txt report - """ - if dataNode.type != NodeType.MULTI_RUN and dataNode.type != NodeType.RUN: - log.error( - "Text reports are meant for only to get a brief overview of individual runs or of benchmark results over multiple runs." 
- ) - raise Exception("Invalid dataNode type.") + Parameters + ---------- + dataNode : DataNode + Application data node + mr_id : str + Multirun id + Returns + ------- + str + Report string + """ if not dataNode.processed: log.error("Data has not been processed, unable to create report.") raise Exception("Cannot generate report from unprocessed data node.") - reportStr = ( - "------------------------------------------\n" + if mr_id not in dataNode.nodes: + log.error("Non existent run id") + raise Exception("Cannot generate report with non existent id.") + + # Report header + report_header = ( "PERUN REPORT\n" "\n" - f"App name: {dataNode.metadata['app_name']}\n" - f"Run ID: {dataNode.id}\n" + f"App name: {dataNode.id}\n" + f"First run: {dataNode.metadata['creation_dt']}\n" + f"Last run: {dataNode.metadata['last_execution_dt']}\n" + "\n\n" ) - if dataNode.type == NodeType.MULTI_RUN: - columns = ["Name", "Unit", "mean", "std", "max", "min"] - column_widths = [4, 4, 8, 8, 8, 8] - entries = {} - for metricType in MetricType: - if metricType in dataNode.metrics: - metric = dataNode.metrics[metricType] - if isinstance(metric, Stats): - value, tfactor, mag = value2str(metric.value, metric.metric_md) - unit_str = f"{mag.symbol}{metric.metric_md.unit.value}" - column_widths[0] = ( - column_widths[0] - if column_widths[0] > len(metricType.name) - else len(metricType.name) - ) - column_widths[1] = ( - column_widths[1] - if column_widths[1] > len(unit_str) - else len(unit_str) + # Host and device table + host_device_rows = [] + region_rows = [] + mr_node: DataNode = dataNode.nodes[mr_id] + + for run_number, run_node in mr_node.nodes.items(): + if run_node.regions: + for region_name, region in run_node.regions.items(): + if region.processed: + region_rows.append( + { + "Round #": run_node.id, + "Function": region_name, + "Avg Calls / Rank": region.runs_per_rank.mean, + "Avg Runtime": value2MeanStdStr(region.runtime), + "Avg Power": value2MeanStdStr(region.power), + "Avg CPU 
Util": value2MeanStdStr(region.cpu_util), + "Avg GPU Mem Util": value2MeanStdStr(region.gpu_util), + } ) - entries[metricType.name] = [ - unit_str, - f"{metric.mean/tfactor:.3f}", - f"{metric.std/tfactor:.3f}", - f"{metric.max/tfactor:.3f}", - f"{metric.min/tfactor:.3f}", - ] - - reportStr += "\n" - header_row: str = "|" + "".join( - [ - " {0:^{width}} |".format(column, width=width) - for column, width in zip(columns, column_widths) - ] + for host_name, host_node in run_node.nodes.items(): + entry = { + "Round #": run_number, + "Host": host_name, + } + for metric_type in tableMetrics: + if metric_type in host_node.metrics: + m = host_node.metrics[metric_type] + entry[metric_type.name] = value2ValueUnitStr(m.value, m.metric_md) + + host_device_rows.append(entry) + entry = {"Round #": run_number, "Host": "All"} + for metric_type in tableMetrics: + if metric_type in run_node.metrics: + m = run_node.metrics[metric_type] + entry[metric_type.name] = value2ValueUnitStr(m.value, m.metric_md) + + host_device_rows.append(entry) + + mr_table = pd.DataFrame.from_records(host_device_rows) + mr_report_str = f"RUN ID: {mr_id}\n\n" + mr_table.to_markdown(index=False) + "\n\n" + + # Regions + if len(region_rows) > 0: + region_table = pd.DataFrame.from_records(region_rows) + region_report_str = ( + "Monitored Functions\n\n" + region_table.to_markdown(index=False) + "\n\n" ) - table_width = len(header_row) - reportStr += header_row + "\n" - reportStr += ("-" * table_width) + "\n" - for key, values in entries.items(): - name = "| {0:<{width}} |".format(key, width=column_widths[0]) - values_str = "".join( - [ - " {0:>{w}} |".format(v, w=w) - for v, w in zip(values, column_widths[1:]) - ] - ) - reportStr += name + values_str + "\n" + else: + region_report_str = "" + + # Summary + n_runs = len(dataNode.nodes) + if MetricType.ENERGY in dataNode.metrics: + total_energy = dataNode.metrics[MetricType.ENERGY].sum # type: ignore + e_pue = total_energy * config.getfloat("post-processing", 
"pue") + e_kWh = e_pue / (3600 * 1e3) + kgCO2 = e_kWh * config.getfloat("post-processing", "emissions_factor") / 1e3 + money = e_kWh * config.getfloat("post-processing", "price_factor") / 1e2 + money_icon = config.get("post-processing", "price_unit") + summary_str = f"The application has run been run {n_runs} times. Throught its runtime, it has used {e_kWh:.3f} kWh, released a total of {kgCO2:.3f} kgCO2e into the atmosphere, and you paid {money:.2f} {money_icon} in electricity for it.\n" else: - for metricType in MetricType: - if metricType in dataNode.metrics: - metric = dataNode.metrics[metricType] - value, _, mag = value2str(metric.value, metric.metric_md) - reportStr += f"{metricType.name}: {value} {mag.symbol}{metric.metric_md.unit.value}\n" - - if MetricType.ENERGY in dataNode.metrics: - e_metric = dataNode.metrics[MetricType.ENERGY] - e_pue = e_metric.value * config.getfloat("post-processing", "pue") - e_kWh = e_pue / (3600 * 1e3) - kgCO2 = e_kWh * config.getfloat("post-processing", "emissions_factor") / 1e3 - euro = e_kWh * config.getfloat("post-processing", "price_factor") / 1e2 - - reportStr += f"\nThe application used a total of {e_kWh:.3f} kWh, released a total of {kgCO2:.3f} kgCO2eq into the atmosphere, and you paid {euro:.2f} € for it.\n" - - return reportStr + summary_str = f"The application has run been run {n_runs} times." + + return report_header + mr_report_str + region_report_str + summary_str diff --git a/perun/io/util.py b/perun/io/util.py new file mode 100644 index 0000000..c1577c8 --- /dev/null +++ b/perun/io/util.py @@ -0,0 +1,93 @@ +"""IO Util.""" +from typing import Tuple + +import numpy as np + +from perun.data_model.data import Stats +from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit + + +def getTFactorMag( + value: np.number, metric_md: MetricMetaData +) -> Tuple[float, Magnitude]: + """Get transformation factor and magnitude to improve string formating. 
+ + Parameters + ---------- + value : np.number + Reference value + metric_md : MetricMetaData + Value description + + Returns + ------- + Tuple[float, Magnitude] + Scaling factor and Magnitude Enum + """ + if metric_md.unit == Unit.WATT or metric_md.unit == Unit.JOULE: + transformFactor = 1 + for mag in reversed(Magnitude): + if value > mag.value: + transformFactor = mag.value + break + + newMag = Magnitude(metric_md.mag.value * transformFactor) + return transformFactor, newMag + + elif metric_md.unit == Unit.PERCENT: + return 1.0, metric_md.mag + elif metric_md.unit == Unit.SECOND: + return 1.0, Magnitude.ONE + elif metric_md.unit == Unit.BYTE: + transformFactor = 1 + newMag = Magnitude.ONE + for magFactor, m in zip( + [1024**3, 1024**2, 1024**1], + [Magnitude.GIGA, Magnitude.MEGA, Magnitude.KILO], + ): + if value > magFactor: + transformFactor = magFactor + newMag = m + break + + return transformFactor, newMag + else: + return 1.0, metric_md.mag + + +def value2ValueUnitStr(value: np.number, metric_md: MetricMetaData) -> str: + """Return a printable representation as [Value:.3f][mag][unit] (e.g. 3.05mV) of the value based on its metric metadata. + + Parameters + ---------- + value : np.number + Value to apply formating to. + metric_md : MetricMetaData + Value metadata. + + Returns + ------- + str + String represenation + """ + tfactor, new_mag = getTFactorMag(value, metric_md) + return f"{value/tfactor:.3f} {new_mag.symbol}{metric_md.unit.value}" + + +def value2MeanStdStr(stats: Stats) -> str: + """Return a printable representation as [Value:.3f]±[std:.3f][mag][unit] (e.g. 3.05±0.1mV) of the value based on its metric metadata. + + Parameters + ---------- + stats : Stats obj + Stats to apply formating to. + metric_md : MetricMetaData + Value metadata. 
+ + Returns + ------- + str + String represenation + """ + tfactor, new_mag = getTFactorMag(stats.mean, stats.metric_md) + return f"{stats.mean/tfactor:.3f}±{stats.std/tfactor:.3f} {new_mag.symbol}{stats.metric_md.unit.value}" diff --git a/perun/logging.py b/perun/logging.py index ab6461f..37d4818 100644 --- a/perun/logging.py +++ b/perun/logging.py @@ -4,11 +4,18 @@ import logging.config -def init_logging(level: str): - """Initialize default stdout logger. +def init_logging(level: str) -> logging.Logger: + """Initialize logging object. - Args: - level (str, optional): Desired log level. Defaults to "DEBUG". + Parameters + ---------- + level : str + Logging level + + Returns + ------- + Logger + Logger object """ logConfig = { "version": 1, diff --git a/perun/perun.py b/perun/perun.py index ed1bef0..59e6298 100644 --- a/perun/perun.py +++ b/perun/perun.py @@ -1,219 +1,377 @@ """Core perun functionality.""" +import os import platform +import pprint as pp +import time # import sys -import time -import types +from configparser import ConfigParser from datetime import datetime from multiprocessing import Event, Process, Queue from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union - -import numpy as np - -from perun import COMM_WORLD, __version__, log -from perun.configuration import config -from perun.coordination import getLocalSensorRankConfiguration -from perun.data_model.data import DataNode, NodeType, RawData -from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit -from perun.data_model.sensor import DeviceType, Sensor -from perun.io.io import IOFormat, exportTo -from perun.processing import processDataNode, processSensorData -from perun.util import getRunId, getRunName - - -def monitor_application( - app: Union[Path, Callable], app_args: tuple = tuple(), app_kwargs: dict = dict() -) -> Optional[Any]: - """Execute coordination, monitoring, post-processing, and reporting steps, in that order. 
- - Raises: - executionError: From cmd line execution, raises any error found on the monitored app. - functionError: From decorator execution, raises any error found on the monitored method. - - Returns: - Optional[Any]: In decorator mode, return the output of the decorated method. - """ - # outPath: Path = Path(config.get("output", "data_out")) - - # Get node devices - from perun.backend import backends - - log.debug(f"Rank {COMM_WORLD.Get_rank()} Backends: {backends}") - - starttime = datetime.utcnow() - app_results: Optional[Any] = None - data_out = Path(config.get("output", "data_out")) - format = IOFormat(config.get("output", "format")) - includeRawData = config.getboolean("output", "raw") - depthStr = config.get("output", "depth") - if depthStr: - depth = int(depthStr) - else: - depth = None - - if not config.getboolean("benchmarking", "bench_enable"): - app_results, dataNode = _run_application( - backends, app, app_args, app_kwargs, record=True - ) - if dataNode: - # Only on first rank, export data - exportTo(data_out, dataNode, format, includeRawData, depth) - else: - # Start with warmup rounds - log.info(f"Rank {COMM_WORLD.Get_rank()} : Started warmup rounds") - for i in range(config.getint("benchmarking", "bench_warmup_rounds")): - log.info(f"Warmup run: {i}") - app_results, _ = _run_application( - backends, app, app_args, app_kwargs, record=False +from typing import Any, Dict, List, Optional, Set, Type + +from perun import __version__, log +from perun.backend.backend import Backend +from perun.backend.intel_rapl import IntelRAPLBackend +from perun.backend.nvml import NVMLBackend +from perun.backend.psutil import PSUTILBackend +from perun.backend.util import getBackendMetadata, getHostMetadata +from perun.comm import Comm +from perun.coordination import getGlobalSensorRankConfiguration, getHostRankDict +from perun.data_model.data import DataNode, LocalRegions, NodeType +from perun.io.io import IOFormat, exportTo, importFrom +from perun.processing import 
processDataNode +from perun.subprocess import perunSubprocess +from perun.util import getRunId, getRunName, increaseIdCounter, singleton + + +@singleton +class Perun: + """Perun object.""" + + def __init__(self, config: ConfigParser) -> None: + """Init perun with configuration. + + Parameters + ---------- + config : ConfigParser + Global configuration object + """ + self.config = config + self._comm: Optional[Comm] = None + self._backends: Optional[Dict[str, Backend]] = None + self._sensors_config: Optional[List[Dict[str, Set[str]]]] = None + self._l_sensor_config: Optional[Dict[str, Set[str]]] = None + self._hostname: Optional[str] = None + self._host_rank: Optional[Dict[str, List[int]]] = None + self._l_host_metadata: Optional[Dict[str, Any]] = None + self._l_backend_metadata: Optional[Dict[str, Any]] = None + self.local_regions: Optional[LocalRegions] = None + + def __del__(self): + """Perun object destructor.""" + self._close_backends() + + @property + def comm(self) -> Comm: + """Lazy initialization of mpi communication object.""" + if not self._comm: + os.environ["OMPI_MCA_mpi_warn_on_fork"] = "0" + os.environ["IBV_FORK_SAFE"] = "1" + os.environ["RDMAV_FORK_SAFE"] = "1" + + self._comm = Comm() + + return self._comm + + @property + def hostname(self) -> str: + """Lazy initialization of hostname. + + Returns + ------- + str + Local rank hostname. + """ + if not self._hostname: + self._hostname = platform.node() + return self._hostname + + @property + def backends(self) -> Dict[str, Backend]: + """Lazy initialization of backends dictionary. + + Returns + ------- + Dict[str, Backend] + Dictionary of available backends. 
+ """ + if not self._backends: + self._backends = {} + classList: List[Type[Backend]] = [ + IntelRAPLBackend, + NVMLBackend, + PSUTILBackend, + ] + for backend in classList: + try: + backend_instance = backend() + self._backends[backend_instance.id] = backend_instance + except ImportError as ie: + log.warning(f"Missing dependencies for backend {backend.__name__}") + log.warning(ie) + except Exception as e: + log.warning(f"Unknown error loading dependecy {backend.__name__}") + log.warning(e) + + return self._backends + + def _close_backends(self): + """Close available backends.""" + for backend in self.backends.values(): + backend.close() + + @property + def host_rank(self) -> Dict[str, List[int]]: + """Lazy initialization of host_rank dictionary. + + Returns + ------- + Dict[str, List[int]] + Dictionary with key (hostname) and values (list of ranks in host) + """ + if not self._host_rank: + self._host_rank = getHostRankDict(self.comm, self.hostname) + + return self._host_rank + + @property + def sensors_config(self) -> List[Dict[str, Set[str]]]: + """Lazy initialization of global sensor configuration. + + Returns + ------- + List[Dict[str, Set[str]]] + Global sensor configuration. + """ + if not self._sensors_config: + self._sensors_config = getGlobalSensorRankConfiguration( + self.comm, self.backends, self.host_rank ) - - log.info(f"Rank {COMM_WORLD.Get_rank()} : Started bench rounds") - runNodes: List[DataNode] = [] - for i in range(config.getint("benchmarking", "bench_rounds")): - log.info(f"Rank {COMM_WORLD.Get_rank()} : Bench run: {i}") - app_results, runNode = _run_application( - backends, app, app_args, app_kwargs, record=True, run_id=str(i) + return self._sensors_config + + @property + def l_sensors_config(self) -> Dict[str, Set[str]]: + """Lazy initialization of local sensor configuration. + + Returns + ------- + Dict[str, Set[str]] + Local sensor configuration. 
+ """ + if not self._l_sensor_config: + self._l_sensor_config = self.sensors_config[self.comm.Get_rank()] + + return self._l_sensor_config + + @property + def l_host_metadata(self) -> Dict[str, Any]: + """Lazy initialization of local metadata dictionary. + + Returns + ------- + Dict[str, Any] + Metadata dictionary + """ + if not self._l_host_metadata: + self._l_host_metadata = getHostMetadata() + return self._l_host_metadata + + @property + def l_backend_metadata(self) -> Dict[str, Any]: + """Lazy initialization of local metadata dictionary. + + Returns + ------- + Dict[str, Any] + Metadata dictionary + """ + if not self._l_backend_metadata: + self._l_backend_metadata = getBackendMetadata( + self.backends, self.l_sensors_config ) - if runNode: - runNodes.append(runNode) - - if len(runNodes) > 0: - benchNode = DataNode( - id=getRunId(starttime), - type=NodeType.MULTI_RUN, + return self._l_backend_metadata + + def monitor_application( + self, + app: Path, + ): + """Execute coordination, monitoring, post-processing, and reporting steps, in that order. 
+ + Parameters + ---------- + app : Path + App script file path + """ + log.debug(f"Rank {self.comm.Get_rank()} Backends: {pp.pformat(self.backends)}") + + data_out = Path(self.config.get("output", "data_out")) + out_format = IOFormat(self.config.get("output", "format")) + starttime = datetime.now() + app_name = getRunName(app) + multirun_id = getRunId(starttime) + + log.info(f"App: {app_name}, MR_ID: {multirun_id}") + + if self.config.getint("benchmarking", "warmup_rounds"): + log.info(f"Rank {self.comm.Get_rank()} : Started warmup rounds") + for i in range(self.config.getint("benchmarking", "warmup_rounds")): + log.info(f"Warmup run: {i}") + _ = self._run_application(app, str(i), record=False) + + log.info(f"Rank {self.comm.Get_rank()}: Monitoring start") + multirun_nodes: Dict[str, DataNode] = {} + for i in range(self.config.getint("benchmarking", "rounds")): + runNode: Optional[DataNode] = self._run_application( + app, str(i), record=True + ) + if self.comm.Get_rank() == 0 and runNode: + multirun_nodes[str(i)] = runNode + + # Get app node data if it exists + if self.comm.Get_rank() == 0 and len(multirun_nodes) > 0: + # Multi_run data processing + multirun_node = DataNode( + multirun_id, + NodeType.MULTI_RUN, metadata={ "app_name": getRunName(app), - "startime": starttime.isoformat(), - # "perun_version": __version__, + "perun_version": __version__, + "execution_dt": starttime.isoformat(), + "n_runs": str(len(multirun_nodes)), + **{ + option: value + for section_name in self.config.sections() + for option, value in self.config.items(section_name) + }, }, - nodes={node.id: node for node in runNodes}, + nodes=multirun_nodes, + processed=False, ) - benchNode = processDataNode(benchNode) - - exportTo(data_out, benchNode, format, includeRawData, depth) - - return app_results - - -def _run_application( - backends: List, - app: Union[Path, Callable], - app_args: tuple = tuple(), - app_kwargs: dict = dict(), - record: bool = True, - run_id: Optional[str] = None, -) -> 
Tuple[Optional[Any], Optional[DataNode]]: - app_result: Optional[Any] = None - - log.info(f"Rank {COMM_WORLD.Get_rank()}: _run_application") - if record: - # 1) Get sensor configuration - mpiRanks, localBackends = getLocalSensorRankConfiguration(COMM_WORLD, backends) - - start_event = Event() - stop_event = Event() - - queue: Optional[Queue] = None - perunSP: Optional[Process] = None - - # 2) If assigned devices, create subprocess - if len(localBackends.keys()) > 0: - log.debug( - f"Rank {COMM_WORLD.Get_rank()} - Local Backendens : {localBackends}" - ) - queue = Queue() - perunSP = Process( - target=perunSubprocess, - args=[ - queue, - start_event, - stop_event, - localBackends, - config.getfloat("monitor", "sampling_rate"), - ], - ) - perunSP.start() - else: - start_event.set() - - # 3) Start application + multirun_node = processDataNode(multirun_node) + + app_data_file = data_out / f"{app_name}.{IOFormat.HDF5.suffix}" + app_data = None + if app_data_file.exists() and app_data_file.is_file(): + app_data = self.import_from(app_data_file, IOFormat.HDF5) + app_data.metadata["last_execution_dt"] = starttime.isoformat() + previous_run_ids = list(app_data.nodes.keys()) + multirun_id = increaseIdCounter(previous_run_ids, multirun_id) + multirun_node.id = multirun_id + app_data.nodes[multirun_node.id] = multirun_node + app_data.processed = False + + else: + app_data = DataNode( + app_name, + NodeType.APP, + metadata={ + "creation_dt": starttime.isoformat(), + "last_execution_dt": starttime.isoformat(), + }, + nodes={multirun_id: multirun_node}, + processed=False, + ) + app_data = processDataNode(app_data) + + self.export_to(data_out, app_data, IOFormat.HDF5) + if out_format != IOFormat.HDF5: + self.export_to(data_out, app_data, out_format, multirun_id) + + def _run_application( + self, + app: Path, + run_id: str, + record: bool = True, + ) -> Optional[DataNode]: + log.info(f"Rank {self.comm.Get_rank()}: _run_application") + if record: + # 1) Get sensor configuration + 
sp_ready_event = Event() + start_event = Event() + stop_event = Event() + + queue: Optional[Queue] = None + perunSP: Optional[Process] = None + + # 2) If assigned devices, create subprocess + if len(self.l_sensors_config.keys()) > 0: + log.debug( + f"Rank {self.comm.Get_rank()} - Local Backendens : {pp.pformat(self.l_sensors_config)}" + ) + queue = Queue() + perunSP = Process( + target=perunSubprocess, + args=[ + queue, + self.comm.Get_rank(), + self.backends, + self.l_sensors_config, + sp_ready_event, + start_event, + stop_event, + self.config.getfloat("monitor", "sampling_rate"), + ], + ) + perunSP.start() + else: + sp_ready_event.set() - if isinstance(app, Path): + # 3) Start application try: + sp_ready_event.wait() with open(str(app), "r") as scriptFile: - start_event.wait() - COMM_WORLD.barrier() - log.info(f"Rank {COMM_WORLD.Get_rank()}: Started App") - run_starttime = datetime.utcnow() + self.local_regions = LocalRegions() + self.comm.barrier() + log.info(f"Rank {self.comm.Get_rank()}: Started App") + start_event.set() + starttime_ns = time.time_ns() exec( scriptFile.read(), {"__name__": "__main__", "__file__": app.name}, ) - log.info(f"Rank {COMM_WORLD.Get_rank()}: App Stopped") + # run_stoptime = datetime.utcnow() + log.info(f"Rank {self.comm.Get_rank()}: App Stopped") stop_event.set() except Exception as e: log.error( - f"Rank {COMM_WORLD.Get_rank()}: Found error on monitored script: {str(app)}" + f"Rank {self.comm.Get_rank()}: Found error on monitored script: {str(app)}" ) stop_event.set() raise e - elif isinstance(app, types.FunctionType): - try: - start_event.wait() - COMM_WORLD.barrier() - log.info(f"Rank {COMM_WORLD.Get_rank()}: Started App") - run_starttime = datetime.utcnow() - - app_result = app(*app_args, **app_kwargs) - log.info(f"Rank {COMM_WORLD.Get_rank()}: Stopped App") - except Exception as e: - stop_event.set() - raise e - stop_event.set() - - # 4) App finished, stop subrocess and get data - if queue and perunSP: - log.info(f"Rank 
{COMM_WORLD.Get_rank()}: Getting queue contents") - nodeData = queue.get(block=True) - log.info(f"Rank {COMM_WORLD.Get_rank()}: Got queue contents") - log.info(f"Rank {COMM_WORLD.Get_rank()}: Waiting for subprocess to close") - perunSP.join() - perunSP.close() - log.info(f"Rank {COMM_WORLD.Get_rank()}: Subprocess closed") - queue.close() - else: - nodeData = None - - COMM_WORLD.barrier() - log.info(f"Rank {COMM_WORLD.Get_rank()}: Everyone exited the subprocess") - - if nodeData: - nodeData.metadata["mpi_ranks"] = mpiRanks - - # 5) Collect data from everyone on the first rank - dataNodes: Optional[List[DataNode]] = COMM_WORLD.gather(nodeData, root=0) - if dataNodes: - # 6) On the first rank, create run node - runNode = DataNode( - id=run_id if run_id is not None else getRunId(run_starttime), - type=NodeType.RUN, - metadata={ - "app_name": getRunName(app), - "startime": run_starttime.isoformat(), - "perun_version": __version__, - }, - nodes={node.id: node for node in dataNodes if node}, + # 4) App finished, stop subrocess and get data + if queue and perunSP: + log.info(f"Rank {self.comm.Get_rank()}: Getting queue contents") + nodeData = queue.get(block=True) + log.info(f"Rank {self.comm.Get_rank()}: Got queue contents") + log.info( + f"Rank {self.comm.Get_rank()}: Waiting for subprocess to close" + ) + perunSP.join() + perunSP.close() + log.info(f"Rank {self.comm.Get_rank()}: Subprocess closed") + queue.close() + else: + nodeData = None + + log.info(f"Rank {self.comm.Get_rank()}: Everyone exited the subprocess") + + if nodeData: + nodeData.metadata["mpi_ranks"] = self.host_rank[self.hostname] + + # 5) Collect data from everyone on the first rank + dataNodes: Optional[List[DataNode]] = self.comm.gather(nodeData, root=0) + globalRegions: Optional[List[LocalRegions]] = self.comm.gather( + self.local_regions, root=0 ) - runNode = processDataNode(runNode) - return app_result, runNode - return app_result, None + if dataNodes and globalRegions: + # 6) On the first rank, 
create run node + runNode = DataNode( + id=run_id, + type=NodeType.RUN, + metadata={**self.l_host_metadata}, + nodes={node.id: node for node in dataNodes if node}, + ) + runNode.addRegionData(globalRegions, starttime_ns) + runNode = processDataNode(runNode) + + return runNode + return None - else: - if isinstance(app, Path): - # filePath = app + else: try: with open(str(app), "r") as scriptFile: exec( @@ -222,125 +380,44 @@ def _run_application( ) except Exception as e: log.error( - f"Rank {COMM_WORLD.Get_rank()}: Found error on monitored script: {str(app)}" + f"Rank {self.comm.Get_rank()}: Found error on monitored script: {str(app)}" ) raise e - - elif isinstance(app, types.FunctionType): - # filePath = Path(sys.argv[0]) - try: - app_result = app(*app_args, **app_kwargs) - except Exception as e: - raise e - - return app_result, None - - -def perunSubprocess( - queue: Queue, - start_event, - stop_event, - backendConfig: Dict[str, Set[str]], - sampling_rate: float, -): - """ - Parallel function that samples energy values from hardware libraries. 
- - Args: - queue (Queue): multiprocessing Queue object where the results are sent after finish - start_event (_type_): Marks that perun finished setting up and started sampling from devices - stop_event (_type_): Signal the subprocess that the monitored processed has finished - deviceIds (Set[str]): List of device ids to sample from - sampling_rate (int): Sampling rate in s - """ - from perun.backend import backends - - lSensors: List[Sensor] = [] - for backend in backends: - if backend.name in backendConfig: - backend.setup() - lSensors += backend.getSensors(backendConfig[backend.name]) - - timesteps = [] - t_mT = MetricMetaData( - Unit.SECOND, - Magnitude.ONE, - np.dtype("float32"), - np.float32(0), - np.finfo("float32").max, - np.float32(-1), - ) - rawValues: List[List[np.number]] = [] - for _ in lSensors: - rawValues.append([]) - - log.debug(f"Rank {COMM_WORLD.Get_rank()}: perunSP lSensors: {lSensors}") - - start_event.set() - timesteps.append(time.time_ns()) - for idx, device in enumerate(lSensors): - rawValues[idx].append(device.read()) - - while not stop_event.wait(sampling_rate): - timesteps.append(time.time_ns()) - for idx, device in enumerate(lSensors): - rawValues[idx].append(device.read()) - - timesteps.append(time.time_ns()) - for idx, device in enumerate(lSensors): - rawValues[idx].append(device.read()) - - log.info(f"Rank {COMM_WORLD.Get_rank()}: Subprocess: Stop event received.") - - sensorNodes: Dict = {} - - t_s = np.array(timesteps) - t_s -= t_s[0] - t_s = t_s.astype("float32") - t_s *= 1e-9 - - for sensor, values in zip(lSensors, rawValues): - if sensor.type not in sensorNodes: - sensorNodes[sensor.type] = [] - - dn = DataNode( - id=sensor.id, - type=NodeType.SENSOR, - metadata=sensor.metadata, - deviceType=sensor.type, - raw_data=RawData(t_s, np.array(values), t_mT, sensor.dataType), - ) - # Apply processing to sensor node - dn = processSensorData(dn) - sensorNodes[sensor.type].append(dn) - - log.info(f"Rank {COMM_WORLD.Get_rank()}: 
Subprocess: Preprocessed Sensor Data") - deviceGroupNodes = [] - for deviceType, sensorNodes in sensorNodes.items(): - if deviceType != DeviceType.NODE: - dn = DataNode( - id=deviceType.value, - type=NodeType.DEVICE_GROUP, - metadata={}, - nodes={sensor.id: sensor for sensor in sensorNodes}, - deviceType=deviceType, - ) - - dn = processDataNode(dn) - deviceGroupNodes.append(dn) - else: - deviceGroupNodes.extend(sensorNodes) - - log.info(f"Rank {COMM_WORLD.Get_rank()}: Subprocess: Preprocessed Device Data") - - hostNode = DataNode( - id=platform.node(), - type=NodeType.NODE, - metadata={}, - nodes={node.id: node for node in deviceGroupNodes}, - ) - processDataNode(hostNode) - - # This should send a single processed node for the current computational node - queue.put(hostNode, block=True) - log.info(f"Rank {COMM_WORLD.Get_rank()}: Subprocess: Sent data") + return None + + def import_from(self, filePath: Path, format: IOFormat) -> DataNode: + """Import data node from given filepath. + + Parameters + ---------- + filePath : Path + Perun data node file path. + format : IOFormat + File format. + + Returns + ------- + DataNode + Imported DataNode. + """ + return importFrom(filePath, format) + + def export_to( + self, + dataOut: Path, + dataNode: DataNode, + format: IOFormat, + mr_id: Optional[str] = None, + ): + """Export data to selected format. + + Parameters + ---------- + dataOut : Path + Directory where data will be saved. + dataNode : DataNode + Data node to export. + format : IOFormat + Format to export data. 
+ """ + exportTo(dataOut, dataNode, format, mr_id) diff --git a/perun/processing.py b/perun/processing.py index 5db54f3..42d5aa2 100644 --- a/perun/processing.py +++ b/perun/processing.py @@ -1,25 +1,131 @@ """Processing Module.""" -from typing import Dict, List +import copy +from datetime import datetime +from itertools import chain +from typing import Any, Dict, List, Optional, Tuple import numpy as np +from perun import log from perun.data_model.data import ( AggregateType, DataNode, Metric, MetricType, NodeType, + RawData, + Region, Stats, ) from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit from perun.data_model.sensor import DeviceType +def processEnergyData( + raw_data: RawData, + start: Optional[np.number] = None, + end: Optional[np.number] = None, +) -> Tuple[Any, Any]: + """Calculate energy and power from an accumulated energy vector. (SEE RAPL). + + Using the start and end parameters the results can be limited to certain areas of the application run. + + Parameters + ---------- + raw_data : RawData + Raw Data from sensor + start : Optional[np.number], optional + Start time of region, by default None + end : Optional[np.number], optional + End time of region, by default None + + Returns + ------- + _type_ + Tuple with total energy in joules and avg power in watts. 
+ """ + runtime = raw_data.timesteps[-1] + t_s = raw_data.timesteps.astype("float32") + t_s *= raw_data.t_md.mag.value / Magnitude.ONE.value + + e_J = raw_data.values + maxValue = raw_data.v_md.max + dtype = raw_data.v_md.dtype.name + + if start and end: + runtime = end - start + index = np.all([t_s >= start, t_s <= end], axis=0) + e_J = e_J[index] + + d_energy = e_J[1:] - e_J[:-1] + + if "uint" in dtype: + idx = d_energy >= maxValue + max_dtype = np.iinfo(dtype).max + d_energy[idx] = maxValue + d_energy[idx] - max_dtype + else: + idx = d_energy <= 0 + d_energy[idx] = d_energy[idx] + maxValue + + d_energy = d_energy.astype("float32") + + total_energy = d_energy.sum() + + magFactor = raw_data.v_md.mag.value / Magnitude.ONE.value + energy_J = total_energy * magFactor + power_W = energy_J / runtime + return energy_J, power_W + + +def processPowerData( + raw_data: RawData, + start: Optional[np.number] = None, + end: Optional[np.number] = None, +) -> Tuple[Any, Any]: + """Calculate energy and power from power time series. + + Using the start and end parameters the results can be limited to certain areas of the application run. + + Parameters + ---------- + raw_data : RawData + Raw Data from sensor + start : Optional[np.number], optional + Start time of region, by default None + end : Optional[np.number], optional + End time of region, by default None + + Returns + ------- + _type_ + Tuple with total energy in joules and avg power in watts. + """ + t_s = raw_data.timesteps.astype("float32") + t_s *= raw_data.t_md.mag.value / Magnitude.ONE.value + + magFactor = raw_data.v_md.mag.value / Magnitude.ONE.value + power_W = raw_data.values.astype("float32") * magFactor + + if start and end: + t_s, power_W = getInterpolatedValues(t_s, power_W, start, end) + + avg_power_W = np.mean(power_W) + energy_J = np.trapz(power_W, t_s) + return energy_J, avg_power_W + + def processSensorData(sensorData: DataNode) -> DataNode: - """Calculate metrics based on the data found on sensor nodes. 
+ """Calculate metrics based on raw values. - Args: - sensorData (DataNode): DataNode with raw data (SENSOR) + Parameters + ---------- + sensorData : DataNode + DataNode with raw sensor data. + + Returns + ------- + DataNode + DataNode with computed metrics. """ if sensorData.type == NodeType.SENSOR and sensorData.raw_data: rawData = sensorData.raw_data @@ -30,26 +136,9 @@ def processSensorData(sensorData: DataNode) -> DataNode: ) if rawData.v_md.unit == Unit.JOULE: - t_s = rawData.timesteps.astype("float32") - t_s *= rawData.t_md.mag.value / Magnitude.ONE.value - - e_J = rawData.values - maxValue = rawData.v_md.max - dtype = rawData.v_md.dtype.name - d_energy = e_J[1:] - e_J[:-1] - if "uint" in dtype: - idx = d_energy >= maxValue - max_dtype = np.iinfo(dtype).max - d_energy[idx] = maxValue + d_energy[idx] - max_dtype - else: - idx = d_energy <= 0 - d_energy[idx] = d_energy[idx] + maxValue - total_energy = d_energy.sum() - - magFactor = rawData.v_md.mag.value / Magnitude.ONE.value - energy_J = np.float32(total_energy) * magFactor + energy_J, power_W = processEnergyData(rawData) - sensorData.metrics[MetricType.ENERGY] = Metric( + energyMetric = Metric( MetricType.ENERGY, energy_J, MetricMetaData( @@ -62,9 +151,9 @@ def processSensorData(sensorData: DataNode) -> DataNode: ), AggregateType.SUM, ) - sensorData.metrics[MetricType.POWER] = Metric( + powerMetric = Metric( MetricType.POWER, - energy_J / runtime, + power_W, MetricMetaData( Unit.WATT, Magnitude.ONE, @@ -75,14 +164,39 @@ def processSensorData(sensorData: DataNode) -> DataNode: ), AggregateType.SUM, ) - elif rawData.v_md.unit == Unit.WATT: - t_s = rawData.timesteps.astype("float32") - t_s *= rawData.t_md.mag.value / Magnitude.ONE.value - magFactor = rawData.v_md.mag.value / Magnitude.ONE.value - power_W = rawData.values.astype("float32") * magFactor - energy_J = np.trapz(power_W, t_s) - sensorData.metrics[MetricType.ENERGY] = Metric( + sensorData.metrics[MetricType.ENERGY] = energyMetric + 
sensorData.metrics[MetricType.POWER] = powerMetric + + if sensorData.deviceType == DeviceType.CPU: + sensorData.metrics[MetricType.CPU_ENERGY] = energyMetric.copy() + sensorData.metrics[MetricType.CPU_ENERGY].type = MetricType.CPU_ENERGY + sensorData.metrics[MetricType.CPU_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.CPU_POWER].type = MetricType.CPU_POWER + + elif sensorData.deviceType == DeviceType.GPU: + sensorData.metrics[MetricType.GPU_ENERGY] = energyMetric.copy() + sensorData.metrics[MetricType.GPU_ENERGY].type = MetricType.GPU_ENERGY + sensorData.metrics[MetricType.GPU_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.GPU_POWER].type = MetricType.GPU_POWER + + elif sensorData.deviceType == DeviceType.RAM: + sensorData.metrics[MetricType.DRAM_ENERGY] = energyMetric.copy() + sensorData.metrics[MetricType.DRAM_ENERGY].type = MetricType.DRAM_ENERGY + sensorData.metrics[MetricType.DRAM_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.DRAM_POWER].type = MetricType.DRAM_POWER + + elif sensorData.deviceType == DeviceType.OTHER: + sensorData.metrics[MetricType.OTHER_ENERGY] = energyMetric.copy() + sensorData.metrics[ + MetricType.OTHER_ENERGY + ].type = MetricType.OTHER_ENERGY + sensorData.metrics[MetricType.OTHER_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.OTHER_POWER].type = MetricType.OTHER_POWER + + elif rawData.v_md.unit == Unit.WATT: + energy_J, power_W = processPowerData(rawData) + energyMetric = Metric( MetricType.ENERGY, energy_J, MetricMetaData( @@ -95,9 +209,9 @@ def processSensorData(sensorData: DataNode) -> DataNode: ), AggregateType.SUM, ) - sensorData.metrics[MetricType.POWER] = Metric( + powerMetric = Metric( MetricType.POWER, - np.mean(power_W), + power_W, MetricMetaData( Unit.WATT, Magnitude.ONE, @@ -108,6 +222,36 @@ def processSensorData(sensorData: DataNode) -> DataNode: ), AggregateType.SUM, ) + + sensorData.metrics[MetricType.ENERGY] = energyMetric + sensorData.metrics[MetricType.POWER] = 
powerMetric + + if sensorData.deviceType == DeviceType.CPU: + sensorData.metrics[MetricType.CPU_ENERGY] = energyMetric.copy() + sensorData.metrics[MetricType.CPU_ENERGY].type = MetricType.CPU_ENERGY + sensorData.metrics[MetricType.CPU_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.CPU_POWER].type = MetricType.CPU_POWER + + elif sensorData.deviceType == DeviceType.GPU: + sensorData.metrics[MetricType.GPU_ENERGY] = energyMetric.copy() + sensorData.metrics[MetricType.GPU_ENERGY].type = MetricType.GPU_ENERGY + sensorData.metrics[MetricType.GPU_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.GPU_POWER].type = MetricType.GPU_POWER + + elif sensorData.deviceType == DeviceType.RAM: + sensorData.metrics[MetricType.DRAM_ENERGY] = energyMetric.copy() + sensorData.metrics[MetricType.DRAM_ENERGY].type = MetricType.DRAM_ENERGY + sensorData.metrics[MetricType.DRAM_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.DRAM_POWER].type = MetricType.DRAM_POWER + + elif sensorData.deviceType == DeviceType.OTHER: + sensorData.metrics[MetricType.OTHER_ENERGY] = energyMetric.copy() + sensorData.metrics[ + MetricType.OTHER_ENERGY + ].type = MetricType.OTHER_ENERGY + sensorData.metrics[MetricType.OTHER_POWER] = powerMetric.copy() + sensorData.metrics[MetricType.OTHER_POWER].type = MetricType.OTHER_POWER + elif rawData.v_md.unit == Unit.PERCENT: if sensorData.deviceType == DeviceType.CPU: metricType = MetricType.CPU_UTIL @@ -123,28 +267,37 @@ def processSensorData(sensorData: DataNode) -> DataNode: AggregateType.MEAN, ) elif rawData.v_md.unit == Unit.BYTE: + bytes_v = rawData.values + if sensorData.deviceType == DeviceType.NET: if "READ" in sensorData.id: metricType = MetricType.NET_READ else: metricType = MetricType.NET_WRITE - else: + + d_bytes = bytes_v[1:] - bytes_v[:-1] + result = d_bytes.sum() + aggType = AggregateType.SUM + elif sensorData.deviceType == DeviceType.DISK: if "READ" in sensorData.id: metricType = MetricType.DISK_READ else: metricType = 
MetricType.DISK_WRITE - bytes_v = rawData.values - maxValue = rawData.v_md.max - dtype = rawData.v_md.dtype.name - d_bytes = bytes_v[1:] - bytes_v[:-1] - result = d_bytes.sum() + d_bytes = bytes_v[1:] - bytes_v[:-1] + result = d_bytes.sum() + aggType = AggregateType.SUM + elif sensorData.deviceType == DeviceType.GPU: + metricType = MetricType.GPU_MEM + result = bytes_v.mean() + aggType = AggregateType.SUM + else: + metricType = MetricType.OTHER_MEM + result = bytes_v.mean() + aggType = AggregateType.SUM sensorData.metrics[metricType] = Metric( - metricType, - result.astype(rawData.v_md.dtype), - rawData.v_md, - AggregateType.SUM, + metricType, result.astype(rawData.v_md.dtype), rawData.v_md, aggType ) sensorData.processed = True @@ -152,12 +305,34 @@ def processSensorData(sensorData: DataNode) -> DataNode: def processDataNode(dataNode: DataNode, force_process=False) -> DataNode: - """Recursively calculate metrics of the current nodes, and of child nodes if necessary. + """Recursively calculate metrics on the dataNode tree. - Args: - dataNode (DataNode): Root of the DataNode structure - force_process (bool, optional): If true, ignored processed flag in child DataNodes. Defaults to False. + Parameters + ---------- + dataNode : DataNode + Root data node tree. + force_process : bool, optional + Force recomputation of child node metrics, by default False + + Returns + ------- + DataNode + Data node with computed metrics. 
""" + # Regions + if dataNode.regions: + start = datetime.now() + unprocessedRegions = [] + for region in dataNode.regions.values(): + if not region.processed: + addRunAndRuntimeInfoToRegion(region) + region.processed = True + unprocessedRegions.append(region) + + processRegionsWithSensorData(unprocessedRegions, dataNode) + duration = datetime.now() - start + log.info(f"Region processing duration: {duration}") + aggregatedMetrics: Dict[MetricType, List[Metric]] = {} for _, subNode in dataNode.nodes.items(): # Make sure sub nodes have their metrics ready @@ -167,17 +342,27 @@ def processDataNode(dataNode: DataNode, force_process=False) -> DataNode: else: subNode = processDataNode(subNode, force_process=force_process) - for metricType, metric in subNode.metrics.items(): - if isinstance(metric, Metric): - if metricType in aggregatedMetrics: - aggregatedMetrics[metricType].append(metric) - else: - aggregatedMetrics[metricType] = [metric] + if dataNode.type == NodeType.APP: + for subSubNode in subNode.nodes.values(): + for metricType, metric in subSubNode.metrics.items(): + if isinstance(metric, Metric): + if metricType in aggregatedMetrics: + aggregatedMetrics[metricType].append(metric) + else: + aggregatedMetrics[metricType] = [metric] + + else: + for metricType, metric in subNode.metrics.items(): + if isinstance(metric, Metric): + if metricType in aggregatedMetrics: + aggregatedMetrics[metricType].append(metric) + else: + aggregatedMetrics[metricType] = [metric] for metricType, metrics in aggregatedMetrics.items(): aggType = metrics[0].agg metric_md = metrics[0].metric_md - if dataNode.type == NodeType.MULTI_RUN: + if dataNode.type == NodeType.MULTI_RUN or dataNode.type == NodeType.APP: dataNode.metrics[metricType] = Stats.fromMetrics(metrics) else: if aggType == AggregateType.MEAN: @@ -195,3 +380,235 @@ def processDataNode(dataNode: DataNode, force_process=False) -> DataNode: dataNode.processed = True return dataNode + + +def processRegionsWithSensorData(regions: 
List[Region], dataNode: DataNode): + """Complete region information using sensor data found on the data node (in place op). + + Parameters + ---------- + regions : List[Region] + List of regions that use the same data node. + dataNode : DataNode + Data node with sensor data. + """ + world_size = regions[0].world_size + power = [ + [ + [0.0 for _ in range(region.raw_data[rank].shape[0] // 2)] + for rank in range(world_size) + ] + for region in regions + ] + cpu_util = copy.deepcopy(power) + + gpu_util = copy.deepcopy(power) + gpu_count = copy.deepcopy(power) + + has_gpu = False + + for hostNode in dataNode.nodes.values(): + # Get relevant ranks + ranks = hostNode.metadata["mpi_ranks"] + for deviceNode in hostNode.nodes.values(): + if ( + deviceNode.deviceType == DeviceType.CPU + or deviceNode.deviceType == DeviceType.GPU + or deviceNode.deviceType == DeviceType.RAM + ): + for sensorNode in deviceNode.nodes.values(): + if sensorNode.raw_data: + raw_data = sensorNode.raw_data + measuring_unit = raw_data.v_md.unit + for region_idx, region in enumerate(regions): + for rank in ranks: + if rank in region.raw_data: + events = region.raw_data[rank] + for i in range(events.shape[0] // 2): + if measuring_unit == Unit.JOULE: + _, power_W = processEnergyData( + raw_data, + events[i * 2], + events[i * 2 + 1], + ) + power[region_idx][rank][i] += power_W + elif measuring_unit == Unit.WATT: + _, power_W = processPowerData( + raw_data, + events[i * 2], + events[i * 2 + 1], + ) + power[region_idx][rank][i] += power_W + elif ( + measuring_unit == Unit.PERCENT + and deviceNode.deviceType == DeviceType.CPU + ): + _, values = getInterpolatedValues( + raw_data.timesteps.astype("float32"), + raw_data.values, + events[i * 2], + events[i * 2 + 1], + ) + cpu_util[region_idx][rank][i] += np.mean( + values + ) + elif ( + measuring_unit == Unit.BYTE + and deviceNode.deviceType == DeviceType.GPU + ): + has_gpu = True + _, values = getInterpolatedValues( + raw_data.timesteps.astype("float32"), + 
raw_data.values, + events[i * 2], + events[i * 2 + 1], + ) + gpu_util[region_idx][rank][i] += ( + np.mean(values) + * 100 + / raw_data.v_md.max + ).astype("float32") + gpu_count[region_idx][rank][i] += 1 + + for region_idx, region in enumerate(regions): + r_power = np.array(list(chain(*power[region_idx]))) + r_cpu_util = np.array(list(chain(*cpu_util[region_idx]))) + + r_gpu_util = np.array(list(chain(*gpu_util[region_idx]))) + r_gpu_count = np.array(list(chain(*gpu_count[region_idx]))) + + if has_gpu: + r_gpu_util /= r_gpu_count + + region.cpu_util = Stats( + MetricType.CPU_UTIL, + MetricMetaData( + Unit.PERCENT, + Magnitude.ONE, + np.dtype("float32"), + np.float32(0), + np.float32(100), + np.float32(-1), + ), + r_cpu_util.sum(), + r_cpu_util.mean(), + r_cpu_util.std(), + r_cpu_util.max(), + r_cpu_util.min(), + ) + region.gpu_util = Stats( + MetricType.GPU_UTIL, + MetricMetaData( + Unit.PERCENT, + Magnitude.ONE, + np.dtype("float32"), + np.float32(0), + np.float32(100), + np.float32(-1), + ), + r_gpu_util.sum(), + r_gpu_util.mean(), + r_gpu_util.std(), + r_gpu_util.max(), + r_gpu_util.min(), + ) + region.power = Stats( + MetricType.POWER, + MetricMetaData( + Unit.WATT, + Magnitude.ONE, + np.dtype("float32"), + np.float32(0), + np.finfo("float32").max, + np.float32(-1), + ), + r_power.sum(), + r_power.mean(), + r_power.std(), + r_power.max(), + r_power.min(), + ) + + +def addRunAndRuntimeInfoToRegion(region: Region): + """Process run and runtime stats in region objects (in place operation). 
+ + Parameters + ---------- + region : Region + Region object + """ + runs_per_rank = [] + runtime = [] + + for rank in range(region.world_size): + if rank in region.raw_data: + events = region.raw_data[rank] + runs_per_rank.append(events.shape[0] / 2) + for i in range(1, events.shape[0], 2): + runtime.append(events[i] - events[i - 1]) + else: + runs_per_rank.append(0) + + runs_array = np.array(runs_per_rank) + runtime_array = np.array(runtime) + + region.runs_per_rank = Stats( + MetricType.N_RUNS, + MetricMetaData( + Unit.SCALAR, + Magnitude.ONE, + np.dtype("float32"), + np.float32(0), + np.finfo("float32").max, + np.float32(-1), + ), + runs_array.sum(), + runs_array.mean(), + runs_array.std(), + runs_array.max(), + runs_array.min(), + ) + + region.runtime = Stats( + MetricType.RUNTIME, + MetricMetaData( + Unit.SECOND, + Magnitude.ONE, + np.dtype("float32"), + np.float32(0), + np.finfo("float32").max, + np.float32(-1), + ), + runtime_array.sum(), + runtime_array.mean(), + runtime_array.std(), + runtime_array.max(), + runtime_array.min(), + ) + + +def getInterpolatedValues( + t: np.ndarray, x: np.ndarray, start: np.number, end: np.number +) -> Tuple[np.ndarray, np.ndarray]: + """Filter timeseries with a start and end limit, and interpolate the values at the edges. 
+ + Parameters + ---------- + t : np.ndarray + Original time steps + x : np.ndarray + Original values + start : np.number + Start of the region of interest + end : np.number + End of the roi + + Returns + ------- + np.ndarray + ROI values + """ + new_t = np.concatenate([[start], t[np.all([t >= start, t <= end], axis=0)], [end]]) + new_x = np.interp(new_t, t, x) + return new_t, new_x diff --git a/perun/subprocess.py b/perun/subprocess.py new file mode 100644 index 0000000..ab5c8ee --- /dev/null +++ b/perun/subprocess.py @@ -0,0 +1,141 @@ +"""Perun subprocess module.""" +import platform +import time +from multiprocessing import Queue +from typing import Dict, List, Set + +import numpy as np + +from perun import log +from perun.backend.backend import Backend +from perun.data_model.data import DataNode, NodeType, RawData +from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit +from perun.data_model.sensor import DeviceType, Sensor +from perun.processing import processDataNode, processSensorData + + +def perunSubprocess( + queue: Queue, + rank: int, + backends: Dict[str, Backend], + l_sensors_config: Dict[str, Set[str]], + sp_ready_event, + start_event, + stop_event, + sampling_rate: float, +): + """Parallel function that samples energy values from hardware libraries. 
+ + Parameters + ---------- + queue : Queue + Multiprocessing Queue object where the results are sent after finish + rank : int + Local MPI Rank + backends : List[Backend] + Available backend list + l_sensors_config : Dict[str, Set[str]] + Local MPI rank sensor configuration + sp_ready_event : _type_ + Indicates monitoring subprocess is ready, multiprocessing module + start_event : _type_ + Indicates app start, multiprocessing module + stop_event : _type_ + Indicates app stop, multiprocessing module + sampling_rate : float + Sampling rate in seconds + """ + lSensors: List[Sensor] = [] + log.debug(f"SP: backends -- {backends}") + log.debug(f"SP: l_sensor_config -- {l_sensors_config}") + for backend in backends.values(): + if backend.name in l_sensors_config: + lSensors += backend.getSensors(l_sensors_config[backend.name]) + + timesteps = [] + t_mT = MetricMetaData( + Unit.SECOND, + Magnitude.ONE, + np.dtype("float32"), + np.float32(0), + np.finfo("float32").max, + np.float32(-1), + ) + rawValues: List[List[np.number]] = [] + for _ in lSensors: + rawValues.append([]) + + log.debug(f"Rank {rank}: perunSP lSensors: {lSensors}") + + # Monitoring process ready + sp_ready_event.set() + + # Waiting for main process to send the signal + start_event.wait() + timesteps.append(time.time_ns()) + for idx, device in enumerate(lSensors): + rawValues[idx].append(device.read()) + + while not stop_event.wait(sampling_rate): + timesteps.append(time.time_ns()) + for idx, device in enumerate(lSensors): + rawValues[idx].append(device.read()) + + timesteps.append(time.time_ns()) + for idx, device in enumerate(lSensors): + rawValues[idx].append(device.read()) + + log.info(f"Rank {rank}: Subprocess: Stop event received.") + + sensorNodes: Dict = {} + + t_s = np.array(timesteps) + t_s -= t_s[0] + t_s = t_s.astype("float32") + t_s *= 1e-9 + + for sensor, values in zip(lSensors, rawValues): + if sensor.type not in sensorNodes: + sensorNodes[sensor.type] = [] + + dn = DataNode( + id=sensor.id, 
+ type=NodeType.SENSOR, + metadata=sensor.metadata, + deviceType=sensor.type, + raw_data=RawData(t_s, np.array(values), t_mT, sensor.dataType), + ) + # Apply processing to sensor node + dn = processSensorData(dn) + sensorNodes[sensor.type].append(dn) + + log.info(f"Rank {rank}: Subprocess: Preprocessed Sensor Data") + deviceGroupNodes = [] + for deviceType, sensorNodes in sensorNodes.items(): + if deviceType != DeviceType.NODE: + dn = DataNode( + id=deviceType.value, + type=NodeType.DEVICE_GROUP, + metadata={}, + nodes={sensor.id: sensor for sensor in sensorNodes}, + deviceType=deviceType, + ) + + dn = processDataNode(dn) + deviceGroupNodes.append(dn) + else: + deviceGroupNodes.extend(sensorNodes) + + log.info(f"Rank {rank}: Subprocess: Preprocessed Device Data") + + hostNode = DataNode( + id=platform.node(), + type=NodeType.NODE, + metadata={}, + nodes={node.id: node for node in deviceGroupNodes}, + ) + processDataNode(hostNode) + + # This should send a single processed node for the current computational node + queue.put(hostNode, block=True) + log.info(f"Rank {rank}: Subprocess: Sent data") diff --git a/perun/util.py b/perun/util.py index 980415b..a9d9409 100644 --- a/perun/util.py +++ b/perun/util.py @@ -1,49 +1,75 @@ """Util module.""" import os -import platform -from datetime import datetime, timedelta +import re +from datetime import datetime from pathlib import Path -from typing import Any, Callable, Dict, Set, Tuple, Union +from typing import List -import numpy as np +from perun import config -from perun import config, log -from perun.data_model.measurement_type import Magnitude, MetricMetaData, Unit +def singleton(class_): + """Singleton decorator. -def getRunName(app: Union[Path, Callable]) -> str: - """Return application name based on available info. + Parameters + ---------- + class_ : _type_ + Class to decorate as singleton - Args: - app (Union[Path, Callable]): The path or function that is being monitored. 
+ Returns + ------- + _type_ + Decorated class definition + """ + instances = {} + + def getinstance(*args, **kwargs): + if class_ not in instances: + instances[class_] = class_(*args, **kwargs) + return instances[class_] + + return getinstance + + +def getRunName(app: Path) -> str: + """Return application name based on configuration and application path. + + Parameters + ---------- + app : Path + Application path. - Returns: - str: Application name. + Returns + ------- + str + Application name. """ app_name = config.get("output", "app_name") if app_name and app_name != "SLURM": - return config.get("output", "app_name") + return app_name elif app_name and "SBATCH_JOB_NAME" in os.environ and app_name == "SLURM": return os.environ["SBATCH_JOB_NAME"] elif isinstance(app, Path): return app.stem - else: - return app.__name__ -def getRunId(startime: datetime) -> str: - """Get run id from available info. +def getRunId(starttime: datetime) -> str: + """Return run id based on the configuration object or the current datetime. - Args: - startime (datetime): Time when application was started. + Parameters + ---------- + starttime : datetime + Datetime object - Returns: - str: String id. + Returns + ------- + str + Run id. """ run_id = config.get("output", "run_id") if run_id and run_id != "SLURM": - return config.get("output", "run_id") + return run_id elif ( run_id and "SLURM_JOB_ID" in os.environ @@ -51,79 +77,24 @@ def getRunId(startime: datetime) -> str: ): return os.environ["SLURM_JOB_ID"] else: - return startime.isoformat() - - -def value2str( - value: np.number, metric_md: MetricMetaData -) -> Tuple[str, float, Magnitude]: - """Return a printable representation of the value based on its metric metadata. A printable value should not have more than 3 digits after before the decimal comma/dot. + return starttime.isoformat() - Args: - value (np.number): Value to format. - metric_md (MetricMetaData): Value metadata. 
- - Returns: - Tuple[str, float, Magnitude]: Formated value, transformation factor used, and the new magnitude prefix. - """ - if metric_md.unit == Unit.WATT or metric_md.unit == Unit.JOULE: - transformFactor = 1 - for mag in reversed(Magnitude): - if value > mag.value: - transformFactor = mag.value - break - - newValue = value / transformFactor - newMag = Magnitude(metric_md.mag.value * transformFactor) - return f"{newValue:.3f}", transformFactor, newMag - - elif metric_md.unit == Unit.PERCENT: - return f"{value:.3f}", 1.0, metric_md.mag - elif metric_md.unit == Unit.SECOND: - return str(timedelta(seconds=float(value))), 1.0, Magnitude.ONE - elif metric_md.unit == Unit.BYTE: - transformFactor = 1 - newMag = Magnitude.ONE - for magFactor, m in zip( - [1024**3, 1024**2, 1024**1], - [Magnitude.GIGA, Magnitude.MEGA, Magnitude.KILO], - ): - if value > magFactor: - transformFactor = magFactor - newMag = m - break - - newValue = value / transformFactor - return f"{newValue:.3f}", transformFactor, newMag - else: - return f"{value:.3f}", 1.0, metric_md.mag +def increaseIdCounter(existing: List[str], newId: str) -> str: + """Increase id counter based on number of existing entries with the same id. -def getHostMetadata(backendConfig: Dict[str, Set[str]]) -> Dict[str, Any]: - """Return dictionary with the full system metadata based on the provided backend configuration. + Parameters + ---------- + existing : List[str] + List of existing ids. + newId : str + New id to compare against. - :param backendConfig: Sensor backend configuration to include in the metadata object. - :type backendConfig: Dict[str, Set[str]] - :return: Dictionary with system metadata - :rtype: Dict[str, Any] + Returns + ------- + str + newId with an added counter if any matches were found. 
""" - from perun.backend import backends - - metadata = {} - for name, method in platform.__dict__.items(): - if callable(method): - try: - metadata[name] = method() - except Exception as e: - log.warn(f"platform method {name} did not work") - log.warn(e) - - metadata["backends"] = {} - for backend in backends: - if backend.name in backendConfig: - metadata["backends"][backend.name] = {} - sensors = backend.getSensors(backendConfig[backend.name]) - for sensor in sensors: - metadata["backends"][backend.name][sensor.id] = sensor.metadata - - return metadata + exp = re.compile(r"^" + newId + r"(_\d+)?$") + count = len(list(filter(lambda x: exp.match(x), existing))) + return newId + f"_{count}" if count > 0 else newId diff --git a/pyproject.toml b/pyproject.toml index f18891f..3fca786 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ readme = "README.md" homepage = "https://github.com/Helmholtz-AI-Energy/perun" [tool.poetry.scripts] -perun = "perun.cli:main" +perun = "perun.api.cli:main" [tool.poetry.dependencies] python = ">=3.8,<4.0" @@ -17,12 +17,12 @@ py-cpuinfo = ">=5.0.0" numpy = "^1.20.0" psutil = "^5.9.0" h5py = "^3.5.9" -pandas = ">=1.3" +pandas = {version = ">=1.3"} +tabulate = ">=0.9" nvidia-ml-py = "^12.535.77" +mpi4py = {version = "^3.1", optional = true} [tool.poetry.extras] -dev = ["pytest", "flake8", "mypy", "black", "pre-commit", "pytest-cov"] -docs = ["sphinx", "sphinx-rtd-theme", "sphinx-autoapi"] mpi = ["mpi4py"] [tool.poetry.group.dev] @@ -35,12 +35,13 @@ flake8 = "*" mypy = "*" black = "*" pre-commit = "*" +pydocstyle = "*" [tool.poetry.group.docs] optional = true [tool.poetry.group.docs.dependencies] -sphinx = "*" +sphinx = "<7.0" sphinx-rtd-theme = "*" sphinx-autoapi = "*" @@ -50,12 +51,6 @@ optional = true [tool.poetry.group.mpi.dependencies] mpi4py = "^3.1" -# [tool.poetry.group.horeka] -# optional = true -# -# [tool.poetry.group.horeka.dependencies] -# influxdb-client = "*" - [tool.semantic_release] version_variable = [ 
"perun/__init__.py:__version__" ] version_toml = ["pyproject.toml:tool.poetry.version"] @@ -71,11 +66,10 @@ testpaths = ["tests"] [tool.isort] skip = ["perun/__init__.py"] -known_third_party = ["click", "cpuinfo", "h5py", "influxdb_client", "numpy", "pandas", "psutil", "pynvml", "pytest"] +known_third_party = ["click", "cpuinfo", "h5py", "numpy", "pandas", "psutil", "pynvml", "pytest"] profile = "black" [tool.pydocstyle] -match = '(test_)!.*\.py' match-dir = 'perun' convention = 'numpy' diff --git a/tests/perun/test_cli.py b/tests/perun/test_cli.py index 187fffd..71ebe21 100644 --- a/tests/perun/test_cli.py +++ b/tests/perun/test_cli.py @@ -3,7 +3,7 @@ from click.testing import CliRunner, Result import perun -from perun.cli import cli +from perun.api.cli import cli from perun.configuration import _default_config @@ -17,6 +17,7 @@ def test_cli_showconf(): # Test default option on the default filesystem runner = CliRunner() output: Result = runner.invoke(cli, ["showconf", "--default"]) + print(output.output) config = ConfigParser(allow_no_value=True) config.read_string( output.output, @@ -58,29 +59,6 @@ def test_cli_showconf(): ) assert config.getfloat("post-processing", "pue") == 0.1 - # Test cmdline - output: Result = runner.invoke(cli, ["--pue", "0.1", "showconf", "--default"]) - config = ConfigParser(allow_no_value=True) - config.read_string(output.output) - - for section in _default_config.keys(): - for option, value in _default_config[section].items(): - if value is not None: - assert str(value) == config.get(section, option) - else: - assert value == config.get(section, option) - - # Test configuration did change - output: Result = runner.invoke(cli, ["--pue", "0.1", "showconf"]) - config = ConfigParser(allow_no_value=True) - config.read_string(output.output) - - assert ( - config.getfloat("post-processing", "pue") - != _default_config["post-processing"]["pue"] - ) - assert config.getfloat("post-processing", "pue") == 0.1 - # # Test env variables # 
monkeypatch.setenv("PERUN_PUE", "0.1") # runner = CliRunner() diff --git a/tests/perun/test_coordination.py b/tests/perun/test_coordination.py index ecebafd..3e88fd5 100644 --- a/tests/perun/test_coordination.py +++ b/tests/perun/test_coordination.py @@ -1,4 +1,4 @@ -from typing import Any, List, Set +from typing import Any, Dict, List, Set import numpy as np @@ -7,14 +7,20 @@ # from pytest import MonkeyPatch -def compareNestedList(l1: List[Set[Any]], l2: List[Set[Any]]) -> bool: +def compareNestedListDictSet( + l1: List[Dict[str, Set[Any]]], l2: List[Dict[str, Set[Any]]] +) -> bool: if len(l1) != len(l2): return False for nl1, nl2 in zip(l1, l2): if len(nl1) != len(nl2): return False - if not np.all([e1 in nl2 for e1 in nl2]): - return False + for key1 in nl1.keys(): + if key1 not in nl2: + return False + + if not np.all([e1 in nl1[key1] for e1 in nl2[key1]]): + return False return True @@ -26,7 +32,7 @@ def test_assignSensors(): result = devices output = assignSensors(devices, hosts) - assert compareNestedList(result, output) + assert compareNestedListDictSet(result, output) # Test single node with multiple ranks, equal devices hosts = {"host0": [0, 1]} @@ -34,7 +40,7 @@ def test_assignSensors(): result = [{"b": {"s1", "s2", "s3"}}, {}] output = assignSensors(devices, hosts) - assert compareNestedList(output, result) + assert compareNestedListDictSet(output, result) # Test single node with multiple ranks, different devices devices hosts = {"host0": [0, 1]} @@ -42,7 +48,7 @@ def test_assignSensors(): result = [{"b": {"s1", "s2", "s3"}}, {}] output = assignSensors(devices, hosts) - assert compareNestedList(output, result) + assert compareNestedListDictSet(output, result) # Test single node, multiple ranks, different backends hosts = {"host0": [0, 1, 2]} @@ -52,9 +58,10 @@ def test_assignSensors(): {"b1": {"s2"}}, ] - result = [{"b0": {"s1", "s2", "s3"}, "b1": {"s0", "s1", "s3"}}, {}, {}] + result = [{"b0": {"s1", "s2", "s3"}, "b1": {"s0", "s1", "s2"}}, {}, {}] 
output = assignSensors(devices, hosts) - assert compareNestedList(output, result) + print(output) + assert compareNestedListDictSet(output, result) # Test 2 nodes with single ranks, same devices hosts = {"host0": [0], "host1": [1]} @@ -75,7 +82,7 @@ def test_assignSensors(): }, ] output = assignSensors(devices, hosts) - assert compareNestedList(output, result) + assert compareNestedListDictSet(output, result) # Test 2 nodes with multiple ranks, different devices hosts = {"host0": [0, 1], "host1": [2, 3]} @@ -94,7 +101,7 @@ def test_assignSensors(): ] output = assignSensors(devices, hosts) - assert compareNestedList(output, result) + assert compareNestedListDictSet(output, result) # def test_perunSubprocess(monkeypatch: MonkeyPatch, backends):