diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 966f2b2768..5ec6b6cbf2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} @@ -48,12 +48,15 @@ jobs: unittest-macos: runs-on: macos-latest + strategy: + matrix: + python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - - name: Set up Python 3.9 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | ./bootstrap.sh @@ -79,7 +82,7 @@ jobs: docker build -f ci-scripts/dockerfiles/reframe-${{ matrix.modules-version }}.dockerfile -t reframe-${{ matrix.modules-version }}:latest . - name: Run Unittests run: | - docker run reframe-${{ matrix.modules-version }}:latest + docker run --init reframe-${{ matrix.modules-version }}:latest eb-spack-howto: runs-on: ubuntu-latest @@ -115,12 +118,15 @@ jobs: wheelvalidation: runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - - name: Setup up Python 3.8 + - name: Setup up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: - python-version: 3.8 + python-version: ${{ matrix.python-version }} - name: Generate Wheel run: | python -m pip install --upgrade pip setuptools build @@ -137,7 +143,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12'] + python-version: ['3.7', '3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} diff --git a/docs/config_reference.rst b/docs/config_reference.rst index 04d6959d60..805ef4c07c 100644 --- a/docs/config_reference.rst +++ b/docs/config_reference.rst @@ -601,7 +601,7 @@ System Partition Configuration :required: No :default: ``[]`` - A list of job scheduler `resource specification <#custom-job-scheduler-resources>`__ objects. + A list of job scheduler :ref:`resource specification ` objects. .. py:attribute:: systems.partitions.processor @@ -639,12 +639,19 @@ System Partition Configuration In case of errors during auto-detection, ReFrame will simply issue a warning and continue. + .. note:: + + The directory prefix for storing topology information is configurable through the :attr:`~config.general.topology_prefix` configuration option. + .. versionadded:: 3.5.0 .. versionchanged:: 3.7.0 ReFrame is now able to detect the processor information automatically. + .. versionchanged:: 4.7 + Directory prefix for topology files is now configurable. + .. py:attribute:: systems.partitions.devices @@ -747,6 +754,8 @@ ReFrame can launch containerized applications, but you need to configure properl If specified in conjunction with :attr:`~systems.partitions.container_platforms.env_vars`, it will be ignored. +.. _scheduler-resources: + Custom Job Scheduler Resources ============================== @@ -1142,7 +1151,7 @@ All logging handlers share the following set of common attributes: Log record format string. 
- ReFrame accepts all log record attributes from Python's `logging `__ mechanism and adds the following attributes: + ReFrame accepts all log record placeholders from Python's `logging `__ mechanism and adds the following ones: .. csv-table:: @@ -1201,16 +1210,17 @@ All logging handlers share the following set of common attributes: ``%(check_valid_prog_environs)s``, The value of the :attr:`~reframe.core.pipeline.RegressionTest.valid_prog_environs` attribute. ``%(check_valid_systems)s``, The value of the :attr:`~reframe.core.pipeline.RegressionTest.valid_systems` attribute. ``%(check_variables)s``, DEPRECATED: Please use ``%(check_env_vars)s`` instead. + ``%(hostname)s``, The hostname where ReFrame runs. ``%(osuser)s``, The name of the OS user running ReFrame. ``%(osgroup)s``, The name of the OS group running ReFrame. ``%(version)s``, The ReFrame version. ReFrame allows you to log any test variable, parameter or property if they are marked as "loggable". - The log record attribute will have the form ``%(check_NAME)s`` where ``NAME`` is the variable name, the parameter name or the property name that is marked as loggable. + The log record placeholder will have the form ``%(check_NAME)s`` where ``NAME`` is the variable name, the parameter name or the property name that is marked as loggable. - There is also the special ``%(check_#ALL)s`` format specifier which expands to all the loggable test attributes. - These include all the above specifiers and any additional loggable variables or parameters defined by the test. - On expanding this specifier, ReFrame will try to guess the delimiter to use for separating the different attributes based on the existing format. + There is also the special ``%(check_#ALL)s`` format placeholder which expands to all the loggable test attributes. + These include all the above placeholders and any additional loggable variables or parameters defined by the test. + On expanding this placeholder, ReFrame will try to guess the delimiter to use for separating the different attributes based on the existing format. If it cannot guess it, it will default to ``|``. Since this can lead to very long records, you may consider using it with the :attr:`~logging.handlers_perflog..filelog..ignore_keys` parameter to filter out some attributes that are not of interest. @@ -1225,13 +1235,16 @@ All logging handlers share the following set of common attributes: Limit the number of attributes that can be logged. User attributes or properties must be explicitly marked as "loggable" in order to be selectable for logging. .. versionadded:: 4.0 - The ``%(check_result)s`` specifier is added. + The ``%(check_result)s`` placeholder is added. .. versionadded:: 4.3 - The ``%(check_#ALL)s`` special specifier is added. + The ``%(check_#ALL)s`` special placeholder is added. .. versionadded:: 4.7 - The ``%(check_fail_phase)s`` and ``%(check_fail_reason)s`` specifiers are added. + The ``%(check_fail_phase)s`` and ``%(check_fail_reason)s`` placeholders are added. + +.. versionadded:: 4.8 + The ``%(hostname)s`` placeholder is added. .. py:attribute:: logging.handlers.format_perfvars @@ -1248,10 +1261,10 @@ All logging handlers share the following set of common attributes: .. important:: The last character of this format will be interpreted as the final delimiter of the formatted performance variables to the rest of the record. 
- The following log record attributes are defined additionally by this format specifier: + The following log record placeholders are defined additionally by this format specifier: .. csv-table:: - :header: "Log record attribute", "Description" + :header: "Log record placeholders", "Description" ``%(check_perf_lower_thres)s``, The lower threshold of the logged performance variable. ``%(check_perf_ref)s``, The reference value of the logged performance variable. @@ -1343,7 +1356,7 @@ The additional properties for the ``filelog`` handler are the following: .. py:attribute:: logging.handlers_perflog..filelog..ignore_keys - A list of log record `format specifiers <#config.logging.handlers.format>`__ that will be ignored by the special ``%(check_#ALL)s`` specifier. + A list of log record `format placeholders <#config.logging.handlers.format>`__ that will be ignored by the special ``%(check_#ALL)s`` placeholder. .. versionadded:: 4.3 @@ -1582,11 +1595,11 @@ This handler transmits the whole log record, meaning that all the information wi .. py:function:: json_formatter(record: object, extras: Dict[str, str], ignore_keys: Set[str]) -> str :arg record: The prepared log record. - The log record is a simple Python object with all the attributes listed in :attr:`~config.logging.handlers.format`, as well as all the default Python `log record `__ attributes. + The log record is a simple Python object with all the placeholders listed in :attr:`~config.logging.handlers.format`, as well as all the default Python `log record `__ placeholders. In addition to those, there is also the special :attr:`__rfm_check__` attribute that contains a reference to the actual test for which the performance is being logged. :arg extras: Any extra attributes specified in :attr:`~config.logging.handlers_perflog..httpjson..extras`. :arg ignore_keys: The set of keys specified in :attr:`~config.logging.handlers_perflog..httpjson..ignore_keys`. - ReFrame always adds the default Python log record attributes in this set. + ReFrame always adds the default Python log record placeholders in this set. :returns: A string representation of the JSON record to be sent to the server or :obj:`None` if the record should not be sent to the server. .. note:: @@ -1855,6 +1868,16 @@ General Configuration +.. py:attribute:: general.topology_prefix + + :required: No + :default: ``"${HOME}/.reframe/topology"`` + + Directory prefix for storing the auto-detected processor topology. + + .. versionadded:: 4.7 + + .. py:attribute:: general.trap_job_errors :required: No @@ -1864,7 +1887,6 @@ General Configuration .. versionadded:: 3.2 - .. py:attribute:: general.keep_stage_files :required: No diff --git a/docs/index.rst b/docs/index.rst index 0d6811cc5c..1961c2e07c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -21,28 +21,30 @@ Publications Presentations & Talks --------------------- -* Slides [`pdf `__][`talk `__] @ `9th EasyBuild User Meeting 2024 `__. -* Slides [`part 1 `__][`part 2 `__][`talk `__] @ `8th EasyBuild User Meeting 2023 `__. -* Slides [`pdf `__] @ `7th EasyBuild User Meeting 2022 `__. -* Slides [`pdf `__] @ `6th EasyBuild User Meeting 2021 `__. -* Slides [`pdf `__] @ `5th EasyBuild User Meeting 2020 `__. -* Slides [`pdf `__] @ `HPC System Testing BoF `__, SC'19. -* Slides [`pdf `__] @ `HUST 2019 `__, SC'19. -* Slides [`pdf `__] @ `HPC Knowledge Meeting '19 `__. -* Slides [`pdf `__] & `Talk `__ @ `FOSDEM'19 `__. -* Slides [`pdf `__] @ `4th EasyBuild User Meeting `__. -* Slides [`pdf `__] @ `HUST 2018 `__, SC'18. 
-* Slides [`pdf `__] @ `CSCS User Lab Day 2018 `__. -* Slides [`pdf `__] @ `HPC Advisory Council 2018 `__. -* Slides [`pdf `__] @ `SC17 `__. -* Slides [`pdf `__] @ `CUG 2017 `__. +* [`slides `__] "Introduction to ReFrame," CINECA visit, Jun 2024. +* [`slides `__][`recording `__] "Recent Advances in ReFrame," `9th EasyBuild User Meeting 2024 `__. +* [`slides `__][`recording `__] "Recent Advances in ReFrame," `8th EasyBuild User Meeting 2023 `__. +* [`slides `__][`recording `__] "Embracing ReFrame Programmable Configurations," `8th EasyBuild User Meeting 2023 `__. +* [`slides `__] "ReFrame Update," `7th EasyBuild User Meeting 2022 `__. +* [`slides `__] "Writing powerful HPC regression tests with ReFrame," `6th EasyBuild User Meeting 2021 `__ +* [`slides `__] "ReFrame: A Framework for Writing Regression Tests for HPC Systems," `5th EasyBuild User Meeting 2020 `__. +* [`slides `__] "Enabling Continuous Testing of HPC Systems using ReFrame," `HPC System Testing BoF `__, SC'19. +* [`slides `__] "Enabling Continuous Testing of HPC Systems using ReFrame," `HUST 2019 `__, SC'19. +* [`slides `__] "ReFrame: A Tool for Enabling Regression Testing and Continuous Integration for HPC Systems," `HPC Knowledge Meeting '19 `__. +* [`slides `__][`recording `__] "ReFrame: A Regression Testing and Continuous Integration Framework for HPC systems," `FOSDEM'19 `__. +* [`slides `__] "ReFrame: A Regression Testing and Continuous Integration Framework for HPC systems," `4th EasyBuild User Meeting `__. +* [`slides `__] "ReFrame: A Regression Testing and Continuous Integration Framework for HPC systems," `HUST 2018 `__, SC'18. +* [`slides `__] "Regression Testing and Continuous Integration with ReFrame," `CSCS User Lab Day 2018 `__. +* [`slides `__] "ReFrame: A Regression Testing Framework Enabling Continuous Integration of Large HPC Systems," `HPC Advisory Council 2018 `__. +* [`slides `__] "ReFrame: A Regression Testing Tool for HPC Systems," Regression testing BoF, `SC17 `__. +* [`slides `__] "ReFrame: A regression framework for checking the health of large HPC systems" `CUG 2017 `__. Webinars & Tutorials -------------------- -* "ReFrame – Efficient System and Application Performance Testing," CSCS Webinar, Aug. 29, 2022 [`slides `__] [`recording `__] [`demo run `__]. -* Tutorial at 6th EasyBuild User Meeting 2021 [`YouTube `__] +* [`slides `__][`recording `__][`demo run `__] "ReFrame – Efficient System and Application Performance Testing," CSCS Webinar, Aug. 29, 2022. +* [`recording `__] "ReFrame Tutorial," 6th EasyBuild User Meeting 2021. Papers diff --git a/docs/manpage.rst b/docs/manpage.rst index ea3f90339b..d464c1e1e5 100644 --- a/docs/manpage.rst +++ b/docs/manpage.rst @@ -1043,7 +1043,7 @@ The way the tests are generated and how they interact with the test filtering op Parameterize a test on an existing variable. - This option will create a new test with a parameter named ``$VAR`` with the values given in the comma-separated list ``VAL0,VAL1,...``. + The test will behave as if the variable ``VAR`` was a parameter taking the values ``VAL0,VAL1,...``. The values will be converted based on the type of the target variable ``VAR``. The ``TEST.`` prefix will only parameterize the variable ``VAR`` of test ``TEST``. @@ -1163,19 +1163,15 @@ Miscellaneous options This option can also be set using the :envvar:`RFM_SYSTEM` environment variable. -.. option:: --table-format=csv|plain|outline|grid +.. 
option:: --table-format=csv|plain|pretty Set the formatting of tabular output printed by the options :option:`--performance-compare`, :option:`--performance-report` and the options controlling the stored sessions. The acceptable values are the following: - ``csv``: Generate CSV output - - ``grid``: Generate a table with grid lines - - ``outline``: (default) Generate a table with lines outlining the table and the header - - ``plain``: Generate a plain table without any lines - - Note that the default ``outline`` format will not render correctly multi-line cells. - In this cases, prefer the ``grid`` or ``plain`` formats. + - ``plain``: Generate a plain table without any vertical lines allowing for easy ``grep``-ing + - ``pretty``: (default) Generate a pretty table .. versionadded:: 4.7 diff --git a/docs/tutorial.rst b/docs/tutorial.rst index b3eead5ebb..326e14e73c 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -2000,39 +2000,55 @@ its unique identifier, its start and end time and how many test cases have run: ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━┯━━━━━━━━━━━━━┑ │ UUID │ Start time │ End time │ Num runs │ Num cases │ ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━┿━━━━━━━━━━━━━┥ - │ 340178ef-a51e-4ce8-8476-1e42ceb2efdd │ 20241011T092927+0000 │ 20241011T092930+0000 │ 1 │ 1 │ - │ 68f9a457-f132-459f-8c11-0e6533be3a24 │ 20241011T092931+0000 │ 20241011T092934+0000 │ 1 │ 1 │ - │ c1d3e813-e783-41aa-92b6-e7ff8eb3e4ec │ 20241011T092934+0000 │ 20241011T092935+0000 │ 1 │ 2 │ - │ 6a79ccf5-95c4-4cc0-a4a2-b3e49012565b │ 20241011T092936+0000 │ 20241011T092937+0000 │ 1 │ 4 │ - │ aa953baf-63d9-47b1-8800-1c6d05883334 │ 20241011T092938+0000 │ 20241011T092939+0000 │ 1 │ 4 │ - │ e8b23332-534a-4f48-aff7-1ae9d4085ecc │ 20241011T092939+0000 │ 20241011T092951+0000 │ 1 │ 26 │ - │ 57cfb5f3-94dd-4e7f-87c9-648a651b1337 │ 20241011T092951+0000 │ 20241011T092955+0000 │ 1 │ 10 │ - │ ec116664-5534-462f-aa33-87dad3bd794b │ 20241011T092956+0000 │ 20241011T092957+0000 │ 1 │ 10 │ - │ 92eaa50e-af92-411f-a11e-47e9fa938202 │ 20241011T092957+0000 │ 20241011T092957+0000 │ 1 │ 4 │ - │ 5bb110fd-9f6a-487d-af4f-4ab582406047 │ 20241011T092958+0000 │ 20241011T092959+0000 │ 1 │ 10 │ - │ 4a522d23-6ae4-4a28-bf39-d2872badcf01 │ 20241011T092959+0000 │ 20241011T092959+0000 │ 1 │ 1 │ - │ 2a6bb3b7-93d3-41ed-8618-48c268de5fcb │ 20241011T093000+0000 │ 20241011T093001+0000 │ 1 │ 5 │ + │ dbdb5f94-d1b2-4a11-aadc-57591d4a8496 │ 20241105T150144+0000 │ 20241105T150147+0000 │ 1 │ 1 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ eba49e9c-81f2-45b7-8680-34a5c9e08ac2 │ 20241105T150202+0000 │ 20241105T150205+0000 │ 1 │ 1 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 62e6e1e8-dd3a-4e70-a452-5c416a8f4d0b │ 20241105T150216+0000 │ 20241105T150219+0000 │ 1 │ 1 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 4ad75077-f2c5-4331-baf6-564275397f98 │ 20241105T150236+0000 │ 20241105T150237+0000 │ 1 │ 2 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 0507e4a0-f44c-45af-a068-9da842498c1f │ 20241105T150253+0000 │ 20241105T150254+0000 │ 1 │ 4 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 
a7c2ffa9-482e-403f-9a78-5727262f6c7f │ 20241105T150304+0000 │ 20241105T150305+0000 │ 1 │ 4 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 47e8d98f-e2b9-4019-9a41-1c44d8a53d1b │ 20241105T150321+0000 │ 20241105T150332+0000 │ 1 │ 26 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ d0aa023b-2ebf-43d4-a0df-e809492434b5 │ 20241105T150352+0000 │ 20241105T150356+0000 │ 1 │ 10 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 8d2f6493-2f5f-4e20-8a8d-1f1b7b1285b0 │ 20241105T150415+0000 │ 20241105T150416+0000 │ 1 │ 10 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 1dd5da33-4e71-484a-b8e6-13ac4d513a66 │ 20241105T150436+0000 │ 20241105T150436+0000 │ 1 │ 4 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 216559ed-be1e-4289-9c88-c9e6b20d2e2e │ 20241105T150447+0000 │ 20241105T150448+0000 │ 1 │ 10 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ b387ee78-a44b-4711-ad81-629ebf578e53 │ 20241105T150448+0000 │ 20241105T150448+0000 │ 1 │ 1 │ + ├──────────────────────────────────────┼──────────────────────┼──────────────────────┼────────────┼─────────────┤ + │ 4bc5ba16-1a4a-4b27-b75c-407f01f1d292 │ 20241105T150503+0000 │ 20241105T150503+0000 │ 1 │ 5 │ ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━┷━━━━━━━━━━━━━┙ You can use :option:`--list-stored-testcases` to list the test cases of a specific session or those that have run within a certain period of time. -In the following example, we list the test cases of session ``aa953baf-63d9-47b1-8800-1c6d05883334`` showing the maximum performance for every performance variable. +In the following example, we list the test cases of session ``0507e4a0-f44c-45af-a068-9da842498c1f`` showing the maximum performance for every performance variable. Note that a session may contain multiple runs of the same test. .. code-block:: bash :caption: Run in the single-node container. - reframe --list-stored-testcases=aa953baf-63d9-47b1-8800-1c6d05883334/max:/ + reframe --list-stored-testcases=0507e4a0-f44c-45af-a068-9da842498c1f/max:/ .. 
code-block:: console ┍━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━┯━━━━━━━━━┑ │ name │ sysenv │ pvar │ punit │ pval │ ┝━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━┿━━━━━━━━━┥ - │ stream_test │ tutorialsys:default+gnu │ copy_bw │ MB/s │ 25169.3 │ - │ stream_test │ tutorialsys:default+gnu │ triad_bw │ MB/s │ 19387.8 │ - │ stream_test │ tutorialsys:default+clang │ copy_bw │ MB/s │ 25129.7 │ - │ stream_test │ tutorialsys:default+clang │ triad_bw │ MB/s │ 29232.8 │ + │ stream_test │ tutorialsys:default+gnu │ copy_bw │ MB/s │ 45454.4 │ + ├─────────────┼───────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test │ tutorialsys:default+gnu │ triad_bw │ MB/s │ 39979.1 │ + ├─────────────┼───────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test │ tutorialsys:default+clang │ copy_bw │ MB/s │ 43220.8 │ + ├─────────────┼───────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test │ tutorialsys:default+clang │ triad_bw │ MB/s │ 38759.9 │ ┕━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━┷━━━━━━━━━┙ The grouping of the test cases, the aggregation and the actual columns shown in the final table are fully configurable. @@ -2051,15 +2067,23 @@ For example, the following will list the mean performance of all test cases that ┍━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━┯━━━━━━━━━┑ │ name │ sysenv │ pvar │ punit │ pval │ ┝━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━┿━━━━━━━━━┥ - │ stream_test │ generic:default+builtin │ copy_bw │ MB/s │ 40288 │ - │ stream_test │ generic:default+builtin │ triad_bw │ MB/s │ 30530.1 │ - │ stream_test │ tutorialsys:default+baseline │ copy_bw │ MB/s │ 40305.1 │ - │ stream_test │ tutorialsys:default+baseline │ triad_bw │ MB/s │ 30540.6 │ + │ stream_test │ generic:default+builtin │ copy_bw │ MB/s │ 40302.2 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test │ generic:default+builtin │ triad_bw │ MB/s │ 30565.7 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test │ tutorialsys:default+baseline │ copy_bw │ MB/s │ 40386.6 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test │ tutorialsys:default+baseline │ triad_bw │ MB/s │ 30565.5 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ ... 
- │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+gnu │ copy_bw │ MB/s │ 46906.3 │ - │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+gnu │ triad_bw │ MB/s │ 35309.3 │ - │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+clang │ copy_bw │ MB/s │ 46811.4 │ - │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+clang │ triad_bw │ MB/s │ 35634.3 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+gnu │ copy_bw │ MB/s │ 47490.5 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+gnu │ triad_bw │ MB/s │ 34848.5 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+clang │ copy_bw │ MB/s │ 47618.6 │ + ├─────────────────────────────────────────────────────┼──────────────────────────────┼──────────┼─────────┼─────────┤ + │ stream_test %num_threads=1 %thread_placement=close │ tutorialsys:default+clang │ triad_bw │ MB/s │ 36237.2 │ ┕━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━┷━━━━━━━━━┙ Note that the :option:`--list-stored-testcases` will list only performance tests. @@ -2077,22 +2101,25 @@ Comparing performance of test cases ----------------------------------- ReFrame can be used to compare the performance of the same test cases run in different time periods using the :option:`--performance-compare` option. -The following will compare the performance of the test cases of the session ``aa953baf-63d9-47b1-8800-1c6d05883334`` with any other same test case that has run the last 24h: +The following will compare the performance of the test cases of the session ``0507e4a0-f44c-45af-a068-9da842498c1f`` with any other same test case that has run the last 24h: .. code-block:: bash :caption: Run in the single-node container. - reframe --performance-compare=aa953baf-63d9-47b1-8800-1c6d05883334/now-1d:now/mean:/ + reframe --performance-compare=0507e4a0-f44c-45af-a068-9da842498c1f/now-1d:now/mean:/ .. 
code-block:: console ┍━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━━┯━━━━━━━━━┑ │ name │ sysenv │ pvar │ punit │ pval_A │ pval_B │ pdiff │ ┝━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━━┿━━━━━━━━━┥ - │ stream_test │ tutorialsys:default+gnu │ copy_bw │ MB/s │ 25169.3 │ 46554.8 │ -45.94% │ - │ stream_test │ tutorialsys:default+gnu │ triad_bw │ MB/s │ 19387.8 │ 37660.5 │ -48.52% │ - │ stream_test │ tutorialsys:default+clang │ copy_bw │ MB/s │ 25129.7 │ 47072.2 │ -46.61% │ - │ stream_test │ tutorialsys:default+clang │ triad_bw │ MB/s │ 29232.8 │ 40177.2 │ -27.24% │ + │ stream_test │ tutorialsys:default+gnu │ copy_bw │ MB/s │ 45454.4 │ 46984.3 │ -3.26% │ + ├─────────────┼───────────────────────────┼──────────┼─────────┼──────────┼──────────┼─────────┤ + │ stream_test │ tutorialsys:default+gnu │ triad_bw │ MB/s │ 39979.1 │ 37726.2 │ +5.97% │ + ├─────────────┼───────────────────────────┼──────────┼─────────┼──────────┼──────────┼─────────┤ + │ stream_test │ tutorialsys:default+clang │ copy_bw │ MB/s │ 43220.8 │ 47949.5 │ -9.86% │ + ├─────────────┼───────────────────────────┼──────────┼─────────┼──────────┼──────────┼─────────┤ + │ stream_test │ tutorialsys:default+clang │ triad_bw │ MB/s │ 38759.9 │ 39916.1 │ -2.90% │ ┕━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━━┷━━━━━━━━━┙ Note that the absolute base performance (``pval_A`` column) is listed along with the target performance (``pval_B`` column). @@ -2105,11 +2132,6 @@ Finally, a stored session can be deleted using the :option:`--delete-stored-sess .. code-block:: bash - reframe --delete-stored-sessions=1fb8488e-c361-4355-b7df-c0dcf3cdcc1e + reframe --delete-stored-sessions=47e8d98f-e2b9-4019-9a41-1c44d8a53d1b Deleting a session will also delete all its test cases from the database. - - -.. tip:: - - You can disable results storage by either setting ``RFM_ENABLE_RESULTS_STORAGE=0`` or by setting the :attr:`storage.enable ` configuration parameter to ``False``. 
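A brief illustration before the patch moves on to the new helper script below: the ``pdiff`` column in the comparison output appears to be the percentage change of ``pval_A`` relative to ``pval_B``. The following sketch is illustrative only and is not part of the patch; the helper name ``percent_diff`` is hypothetical and the formula is inferred from the table values (e.g. 45454.4 vs. 46984.3 gives -3.26%).

    def percent_diff(pval_a: float, pval_b: float) -> float:
        """Percentage change of pval_a relative to pval_b (inferred formula)."""
        return (pval_a - pval_b) / pval_b * 100

    # copy_bw row from the comparison table above
    print(f'{percent_diff(45454.4, 46984.3):+.2f}%')   # prints -3.26%
    print(f'{percent_diff(39979.1, 37726.2):+.2f}%')   # prints +5.97%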
diff --git a/examples/tutorial/scripts/runall.sh b/examples/tutorial/scripts/runall.sh new file mode 100644 index 0000000000..9d44ee6cf9 --- /dev/null +++ b/examples/tutorial/scripts/runall.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -xe + +export RFM_ENABLE_RESULTS_STORAGE=1 + +pushd reframe-examples/tutorial +reframe -c stream/stream_runonly.py -r +reframe -c stream/stream_runonly.py -r +reframe -C config/baseline.py -c stream/stream_runonly.py -r +reframe -C config/baseline_environs.py -c stream/stream_build_run.py --exec-policy=serial -r +reframe -C config/baseline_environs.py -c stream/stream_fixtures.py -l +reframe -C config/baseline_environs.py -c stream/stream_fixtures.py -r +reframe -C config/baseline_environs.py -c stream/stream_variables.py -S num_threads=2 -r +reframe -C config/baseline_environs.py -c stream/stream_variables_fixtures.py --exec-policy=serial -S stream_test.stream_binary.array_size=50000000 -r +reframe -C config/baseline_environs.py -c stream/stream_parameters.py --exec-policy=serial -r +reframe -C config/baseline_environs.py -c stream/stream_variables_fixtures.py -P num_threads=1,2,4,8 --exec-policy=serial -r +reframe -c deps/deps_complex.py -r +reframe --restore-session --failed -r +reframe -c deps/deps_complex.py --keep-stage-files -r +reframe --restore-session --keep-stage-files -n T6 -r +reframe -c deps/deps_complex.py -n T6 -r +popd diff --git a/reframe/__init__.py b/reframe/__init__.py index 74c142338b..40de11dc25 100644 --- a/reframe/__init__.py +++ b/reframe/__init__.py @@ -6,7 +6,7 @@ import os import sys -VERSION = '4.7.0-dev.9' +VERSION = '4.8.0-dev.1' INSTALL_PREFIX = os.path.normpath( os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) ) diff --git a/reframe/core/config.py b/reframe/core/config.py index feabf288cc..b1ede82bb4 100644 --- a/reframe/core/config.py +++ b/reframe/core/config.py @@ -447,8 +447,10 @@ def validate(self): try: jsonschema.validate(site_config, self._schema) except jsonschema.ValidationError as e: - raise ConfigError(f"could not validate configuration files: " - f"'{self._sources}'") from e + getlogger().debug(str(e)) + sources = ', '.join(f'`{f}`' for f in self._sources) + raise ConfigError('could not validate configuration files: ' + f'{sources}') from e def _warn_variables(config, opt_path): opt_path = '/'.join(opt_path + ['variables']) diff --git a/reframe/core/decorators.py b/reframe/core/decorators.py index b8fbc7e22b..ff62aa5e60 100644 --- a/reframe/core/decorators.py +++ b/reframe/core/decorators.py @@ -9,7 +9,6 @@ __all__ = ['simple_test'] - import inspect import sys import traceback @@ -171,10 +170,10 @@ def _validate_test(cls): if (cls.is_abstract()): getlogger().warning( - f'skipping test {cls.__qualname__!r}: ' - f'test has one or more undefined parameters' + f'skipping test {cls.__qualname__!r}: ' + + 'the following parameters are undefined: ' + + ', '.join(cls.param_space.undefined_params()) ) - return False conditions = [VersionValidator(v) for v in cls._rfm_required_version] if (cls._rfm_required_version and diff --git a/reframe/core/exceptions.py b/reframe/core/exceptions.py index 49640d671d..f554ce04b4 100644 --- a/reframe/core/exceptions.py +++ b/reframe/core/exceptions.py @@ -8,6 +8,7 @@ # import inspect +import jsonschema import os import reframe @@ -54,7 +55,10 @@ def message(self): def __str__(self): ret = self._message or '' if self.__cause__ is not None: - ret += ': ' + str(self.__cause__) + if isinstance(self.__cause__, jsonschema.ValidationError): + ret += ': ' + self.__cause__.message + 
else: + ret += ': ' + str(self.__cause__) return ret @@ -110,6 +114,9 @@ class AbortTaskError(ReframeError): error in other places etc.) ''' +class KeyboardError(ReframeError): + '''Raised when there is a KeyboardInterrupt during the asyncio execution + ''' class ConfigError(ReframeError): '''Raised when a configuration error occurs.''' diff --git a/reframe/core/fixtures.py b/reframe/core/fixtures.py index e285467974..3e790d3f46 100644 --- a/reframe/core/fixtures.py +++ b/reframe/core/fixtures.py @@ -325,12 +325,6 @@ def uninst_tests(self): '''Get the uninstantiated tests of this registry''' return self._registry.keys() - def _filter_valid_partitions(self, candidate_parts): - return [p for p in candidate_parts if p in self._env_by_part] - - def _filter_valid_environs(self, part, candidate_environs): - return [e for e in cadidate_environs if e in self._env_by_part[part]] - def _is_registry(self, other): if not isinstance(other, FixtureRegistry): raise TypeError('other is not a FixtureRegistry') @@ -776,7 +770,8 @@ def __init__(self, cls, *, scope='test', action='fork', variants='all', # Check that the fixture class is not an abstract test. if cls.is_abstract(): raise ValueError( - f'class {cls.__qualname__!r} has undefined parameters' + f'fixture {cls.__qualname__!r} has undefined parameters: ' + + ', '.join(cls.param_space.undefined_params()) ) # Validate the scope diff --git a/reframe/core/hooks.py b/reframe/core/hooks.py index 80cb1f9bbe..48e96efa6e 100644 --- a/reframe/core/hooks.py +++ b/reframe/core/hooks.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause +import asyncio import functools import inspect @@ -101,17 +102,28 @@ def select_hooks(obj, kind): return [h for h in hooks.get(phase, []) if h.__name__ not in getattr(obj, '_disabled_hooks', [])] - @functools.wraps(func) - def _fn(obj, *args, **kwargs): - for h in select_hooks(obj, 'pre_'): - getattr(obj, h.__name__)() - - func(obj, *args, **kwargs) - for h in select_hooks(obj, 'post_'): - getattr(obj, h.__name__)() + # maybe this could be improved + if asyncio.iscoroutinefunction(func): + @functools.wraps(func) + async def _fn(obj, *args, **kwargs): + for h in select_hooks(obj, 'pre_'): + getattr(obj, h.__name__)() + + await func(obj, *args, **kwargs) + for h in select_hooks(obj, 'post_'): + getattr(obj, h.__name__)() + return _fn + else: + @functools.wraps(func) + def _fn(obj, *args, **kwargs): + for h in select_hooks(obj, 'pre_'): + getattr(obj, h.__name__)() - return _fn + func(obj, *args, **kwargs) + for h in select_hooks(obj, 'post_'): + getattr(obj, h.__name__)() + return _fn return _deco diff --git a/reframe/core/logging.py b/reframe/core/logging.py index bf68536c4b..46e4852df4 100644 --- a/reframe/core/logging.py +++ b/reframe/core/logging.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause import abc +import asyncio import logging import logging.handlers import numbers @@ -809,6 +810,7 @@ def __init__(self, logger=None, check=None): 'check_perf_upper_thres': None, 'check_perf_unit': None, 'check_result': None, + 'hostname': socket.gethostname(), 'osuser': osext.osuser(), 'osgroup': osext.osgroup(), 'version': osext.reframe_version(), @@ -953,24 +955,44 @@ def adjust_verbosity(self, num_steps): _logger = None _perf_logger = None -_context_logger = null_logger + +global tasks_loggers +tasks_loggers = {} + +global _global_logger +_global_logger = null_logger class logging_context: def __init__(self, check=None, level=DEBUG): - global _context_logger + try: + task = current_task() + except RuntimeError: + 
global _global_logger + task = None + + self._orig_logger = _global_logger self._level = level - self._orig_logger = _context_logger + self._context_logger = _global_logger if check is not None: - _context_logger = LoggerAdapter(_logger, check) - _context_logger.colorize = self._orig_logger.colorize + self._context_logger = LoggerAdapter(_logger, check) + self._context_logger.colorize = self._orig_logger.colorize + + if task: + tasks_loggers[task] = self._context_logger + else: + _global_logger = self._context_logger def __enter__(self): - return _context_logger + return self._context_logger def __exit__(self, exc_type, exc_value, traceback): - global _context_logger + global _global_logger + try: + task = current_task() + except RuntimeError: + task = None # Log any exceptions thrown with the current context logger if exc_type is not None: @@ -979,20 +1001,23 @@ def __exit__(self, exc_type, exc_value, traceback): getlogger().log(self._level, msg.format(exc_fullname, exc_value)) # Restore context logger - _context_logger = self._orig_logger + _global_logger = self._orig_logger + + if task: + tasks_loggers[task] = self._orig_logger def configure_logging(site_config): - global _logger, _context_logger, _perf_logger + global _logger, _global_logger, _perf_logger if site_config is None: _logger = None - _context_logger = null_logger + _global_logger = null_logger return _logger = _create_logger(site_config, 'handlers$', 'handlers') _perf_logger = _create_logger(site_config, 'handlers_perflog') - _context_logger = LoggerAdapter(_logger) + _global_logger = LoggerAdapter(_logger) def log_files(): @@ -1007,7 +1032,15 @@ def save_log_files(dest): def getlogger(): - return _context_logger + try: + task = current_task() + except RuntimeError: + task = None + if task: + logger_task = tasks_loggers.get(task) + if logger_task: + return tasks_loggers[task] + return _global_logger def getperflogger(check): @@ -1056,11 +1089,23 @@ class logging_sandbox: def __enter__(self): self._logger = _logger self._perf_logger = _perf_logger - self._context_logger = _context_logger + self._context_logger = _global_logger def __exit__(self, exc_type, exc_value, traceback): - global _logger, _perf_logger, _context_logger + global _logger, _perf_logger, _global_logger _logger = self._logger _perf_logger = self._perf_logger - _context_logger = self._context_logger + _global_logger = self._context_logger + + +def current_task(): + """Wrapper for asyncio.current_task() compatible + with Python 3.6 and later. 
+ """ + if sys.version_info >= (3, 7): + # Use asyncio.current_task() directly in Python 3.7+ + return asyncio.current_task() + else: + # Fallback to asyncio.tasks.current_task() in Python 3.6 + return asyncio.Task.current_task() diff --git a/reframe/core/parameters.py b/reframe/core/parameters.py index 1633541971..f829b20e81 100644 --- a/reframe/core/parameters.py +++ b/reframe/core/parameters.py @@ -198,7 +198,7 @@ def update(self, other): self.values = tuple(filt_vals) + self.values except TypeError: raise ReframeSyntaxError( - f"'filter_param' must return an iterable" + "'filter_param' must return an iterable" ) from None def is_abstract(self): @@ -307,7 +307,7 @@ def inject(self, obj, cls=None, params_index=None): try: # Get the parameter values for the specified variant param_values = self.__param_combinations[params_index] - except IndexError as no_params: + except IndexError: raise RuntimeError( f'parameter space index out of range for ' f'{obj.__class__.__qualname__}' @@ -333,6 +333,11 @@ def defines(self, name): ''' return name in self.params and not self.params[name].is_abstract() + def undefined_params(self): + '''Return a list of all undefined parameters.''' + return [name for name, param in self.params.items() + if param.is_abstract()] + def __iter__(self): '''Create a generator object to iterate over the parameter space diff --git a/reframe/core/pipeline.py b/reframe/core/pipeline.py index 8c3ab5e46c..e95df8b362 100644 --- a/reframe/core/pipeline.py +++ b/reframe/core/pipeline.py @@ -261,11 +261,18 @@ def pipeline_hooks(cls): #: of the :attr:`valid_systems` list, in which case an AND operation on #: these constraints is implied. For example, the test defining the #: following will be valid for all systems that have define both ``feat1`` - #: and ``feat2`` and set ``foo=1`` + #: and ``feat2`` and set ``foo=1``: #: #: .. code-block:: python #: - #: valid_systems = ['+feat1 +feat2 %foo=1'] + #: valid_systems = [r'+feat1 +feat2 %foo=1'] + #: + #: Any partition/environment extra or + #: :ref:`partition resource ` can be specified as a + #: feature constraint without having to explicitly state this in the + #: partition's/environment's feature list. For example, if ``key1`` is part + #: of the partition/environment extras list, then ``+key1`` will select + #: that partition or environment. #: #: For key/value pairs comparisons, ReFrame will automatically convert the #: value in the key/value spec to the type of the value of the @@ -1018,7 +1025,7 @@ def __new__(cls, *args, **kwargs): prefix = cls._rfm_custom_prefix except AttributeError: if osext.is_interactive(): - prefix = os.getcwd() + prefix = rt.get_working_dir() else: try: prefix = cls._rfm_pinned_prefix @@ -1639,7 +1646,9 @@ def _resolve_fixtures(self): # registered under the same fixture class. So the loop below must # also inspect the fixture data the instance was registered with. for fixt_name, fixt_data in registry[f.cls].items(): - if f.scope != fixt_data.scope: + if fixt_data.variables != f.variables: + continue + elif f.scope != fixt_data.scope: continue elif fixt_data.variant_num not in target_variants: continue @@ -1762,6 +1771,7 @@ def setup(self, partition, environ, **job_opts): more details. ''' + os.chdir(rt.get_working_dir()) self._current_partition = partition self._current_environ = environ self._setup_paths() @@ -1788,7 +1798,7 @@ def _clone_to_stagedir(self, url): ) @final - def compile(self): + async def compile(self): '''The compilation phase of the regression test pipeline. 
:raises reframe.core.exceptions.ReframeError: In case of errors. @@ -1881,7 +1891,7 @@ def compile(self): # override those set by the framework. resources_opts = self._map_resources_to_jobopts() self._build_job.options = resources_opts + self._build_job.options - with osext.change_dir(self._stagedir): + with osext.change_dir_global(self._stagedir): # Prepare build job build_commands = [ *self.prebuild_cmds, @@ -1899,10 +1909,10 @@ def compile(self): raise PipelineError('failed to prepare build job') from e if not self.is_dry_run(): - self._build_job.submit() + await self._build_job.submit() @final - def compile_wait(self): + async def compile_wait(self): '''Wait for compilation phase to finish. .. versionadded:: 2.13 @@ -1924,7 +1934,7 @@ def compile_wait(self): if self.is_dry_run(): return - self._build_job.wait() + await self._build_job.wait() # We raise a BuildError when we an exit code and it is non zero if self._build_job.exitcode: @@ -1932,11 +1942,11 @@ def compile_wait(self): f'build job failed with exit code: {self._build_job.exitcode}' ) - with osext.change_dir(self._stagedir): + with osext.change_dir_global(self._stagedir): self.build_system.post_build(self._build_job) @final - def run(self): + async def run(self): '''The run phase of the regression test pipeline. This call is non-blocking. @@ -2030,7 +2040,7 @@ def _get_cp_env(): # override those set by the framework. resources_opts = self._map_resources_to_jobopts() self._job.options = resources_opts + self._job.options - with osext.change_dir(self._stagedir): + with osext.change_dir_global(self._stagedir): try: self.logger.debug('Generating the run script') self._job.prepare( @@ -2046,7 +2056,7 @@ def _get_cp_env(): raise PipelineError('failed to prepare run job') from e if not self.is_dry_run(): - self._job.submit() + await self._job.submit() self.logger.debug(f'Spawned run job (id={self.job.jobid})') # Update num_tasks if test is flexible @@ -2113,7 +2123,7 @@ def run_complete(self): return self._job.finished() @final - def run_wait(self): + async def run_wait(self): '''Wait for the run phase of this test to finish. :raises reframe.core.exceptions.ReframeError: In case of errors. @@ -2133,7 +2143,7 @@ def run_wait(self): if self.is_dry_run(): return - self._job.wait() + await self._job.wait() @final def sanity(self): @@ -2197,7 +2207,7 @@ def check_sanity(self): if self.is_dry_run(): return - with osext.change_dir(self._stagedir): + with osext.change_dir_global(self._stagedir): success = sn.evaluate(self.sanity_patterns) if not success: raise SanityError() @@ -2254,7 +2264,7 @@ def check_performance(self): unit) # Evaluate the performance function and retrieve the metrics - with osext.change_dir(self._stagedir): + with osext.change_dir_global(self._stagedir): for tag, expr in self.perf_variables.items(): try: value = expr.evaluate() if not self.is_dry_run() else None @@ -2339,7 +2349,7 @@ def _copy_to_outputdir(self): self._copy_job_files(self._job, self.outputdir) self._copy_job_files(self._build_job, self.outputdir) - with osext.change_dir(self.stagedir): + with osext.change_dir_global(self.stagedir): # Copy files specified by the user, but expand any glob patterns keep_files = itertools.chain( *(glob.iglob(f) for f in self.keep_files) @@ -2582,6 +2592,7 @@ def setup(self, partition, environ, **job_opts): Similar to the :func:`RegressionTest.setup`, except that no build job is created for this test. 
''' + os.chdir(rt.get_working_dir()) self._current_partition = partition self._current_environ = environ self._setup_paths() @@ -2589,19 +2600,19 @@ def setup(self, partition, environ, **job_opts): self._setup_container_platform() self._resolve_fixtures() - def compile(self): + async def compile(self): '''The compilation phase of the regression test pipeline. This is a no-op for this type of test. ''' - def compile_wait(self): + async def compile_wait(self): '''Wait for compilation phase to finish. This is a no-op for this type of test. ''' - def run(self): + async def run(self): '''The run phase of the regression test pipeline. The resources of the test are copied to the stage directory and the @@ -2614,7 +2625,7 @@ def run(self): self._copy_to_stagedir(os.path.join(self._prefix, self.sourcesdir)) - super().run() + await super().run() class CompileOnlyRegressionTest(RegressionTest, special=True): @@ -2648,6 +2659,7 @@ def setup(self, partition, environ, **job_opts): Similar to the :func:`RegressionTest.setup`, except that no run job is created for this test. ''' + os.chdir(rt.get_working_dir()) # No need to setup the job for compile-only checks self._current_partition = partition self._current_environ = environ @@ -2666,13 +2678,13 @@ def stdout(self): def stderr(self): return self.build_job.stderr if self.build_job else None - def run(self): + async def run(self): '''The run stage of the regression test pipeline. Implemented as no-op. ''' - def run_wait(self): + async def run_wait(self): '''Wait for this test to finish. Implemented as no-op diff --git a/reframe/core/runtime.py b/reframe/core/runtime.py index 24d754b991..5baff6e8f6 100644 --- a/reframe/core/runtime.py +++ b/reframe/core/runtime.py @@ -204,6 +204,23 @@ def init_runtime(site_config, **kwargs): _runtime_context = RuntimeContext(site_config, **kwargs) +_working_dir = None + + +def set_working_dir(): + global _working_dir + + _working_dir = os.getcwd() + + +def get_working_dir(): + + if _working_dir is None: + raise ReframeFatalError('no working dir was yet set') + + return _working_dir + + def runtime(): '''Get the runtime context of the framework. @@ -311,11 +328,13 @@ def _is_valid_part(part, valid_systems): props[key] = val have_plus_feats = all( - ft in part.features or ft in part.resources + (ft in part.features or + ft in part.resources or ft in part.extras) for ft in plus_feats ) have_minus_feats = any( - ft in part.features or ft in part.resources + (ft in part.features or + ft in part.resources or ft in part.extras) for ft in minus_feats ) try: @@ -357,8 +376,9 @@ def _is_valid_env(env, valid_prog_environs): key, val = subspec[1:].split('=') props[key] = val - have_plus_feats = all(ft in env.features for ft in plus_feats) - have_minus_feats = any(ft in env.features + have_plus_feats = all(ft in env.features or ft in env.extras + for ft in plus_feats) + have_minus_feats = any(ft in env.features or ft in env.extras for ft in minus_feats) try: have_props = True @@ -397,6 +417,7 @@ def valid_sysenv_comb(valid_systems, valid_prog_environs, class temp_environment: '''Context manager to temporarily change the environment.''' + # TODO: Do we need to change something here? context management asyncio def __init__(self, modules=None, env_vars=None): self._modules = modules or [] @@ -414,6 +435,7 @@ def __exit__(self, exc_type, exc_value, traceback): class temp_config: '''Context manager to temporarily switch to specific configuration.''' + # TODO: Do we need to change something here? 
context management asyncio def __init__(self, system): self.__to = system diff --git a/reframe/core/schedulers/__init__.py b/reframe/core/schedulers/__init__.py index a8565a99bc..a74f579328 100644 --- a/reframe/core/schedulers/__init__.py +++ b/reframe/core/schedulers/__init__.py @@ -160,7 +160,7 @@ def filter_nodes_by_state(nodelist, state): :arg state: The state of the nodes. If ``all``, the initial list is returned untouched. If ``avail``, only the available nodes will be returned. - All other values are interpretes as a state string. + All other values are interpreted as a state string. State match is exclusive unless the ``*`` is added at the end of the state string. :returns: the filtered node list @@ -169,7 +169,7 @@ def filter_nodes_by_state(nodelist, state): nodelist = {n for n in nodelist if n.is_avail()} elif state != 'all': if state.endswith('*'): - # non-exclusive stat match + # non-exclusive state match state = state[:-1] nodelist = { n for n in nodelist if n.in_state(state) @@ -606,17 +606,26 @@ def guess_num_tasks(self): available_nodes = filter_nodes_by_state( available_nodes, self.sched_flex_alloc_nodes.lower() ) + getlogger().debug( + f'[F] Total available in state=' + f'{self.sched_flex_alloc_nodes.lower()}: {len(available_nodes)}' + ) available_nodes = self.scheduler.filternodes(self, available_nodes) + getlogger().debug( + f'[F] Total available after scheduler filter: ' + f'{len(available_nodes)}' + ) return len(available_nodes) * num_tasks_per_node - def submit(self): - return self.scheduler.submit(self) + async def submit(self): + result = await self.scheduler.submit(self) + return result - def wait(self): + async def wait(self): if self.jobid is None: raise JobNotStartedError('cannot wait an unstarted job') - self.scheduler.wait(self) + await self.scheduler.wait(self) self._completion_time = self._completion_time or time.time() def cancel(self): diff --git a/reframe/core/schedulers/flux.py b/reframe/core/schedulers/flux.py index a1d37038ab..080a539849 100644 --- a/reframe/core/schedulers/flux.py +++ b/reframe/core/schedulers/flux.py @@ -10,6 +10,7 @@ # Lawrence Livermore National Lab # +import asyncio import itertools import os import time @@ -73,7 +74,7 @@ def emit_preamble(self, job): def make_job(self, *args, **kwargs): return _FluxJob(*args, **kwargs) - def submit(self, job): + async def submit(self, job): '''Submit a job to the flux executor.''' flux_future = self._fexecutor.submit(job.fluxjob) @@ -89,7 +90,7 @@ def cancel(self, job): # This will raise JobException with event=cancel (on poll) flux.job.cancel(flux.Flux(), job._flux_future.jobid()) - def poll(self, *jobs): + async def poll(self, *jobs): '''Poll running Flux jobs for updated states.''' if jobs: @@ -141,13 +142,13 @@ def filternodes(self, job, nodes): 'flux backend does not support node filtering' ) - def wait(self, job): + async def wait(self, job): '''Wait until a job is finished.''' intervals = itertools.cycle([1, 2, 3]) while not self.finished(job): - self.poll(job) - time.sleep(next(intervals)) + await self.poll(job) + await asyncio.sleep(next(intervals)) def finished(self, job): if job.exception: diff --git a/reframe/core/schedulers/local.py b/reframe/core/schedulers/local.py index 87eead7530..09d2d3b6f5 100644 --- a/reframe/core/schedulers/local.py +++ b/reframe/core/schedulers/local.py @@ -3,11 +3,12 @@ # # SPDX-License-Identifier: BSD-3-Clause -import errno +import asyncio import os import signal import socket import time +import psutil import reframe.core.schedulers as sched import 
reframe.utility.osext as osext @@ -53,7 +54,7 @@ class LocalJobScheduler(sched.JobScheduler): def make_job(self, *args, **kwargs): return _LocalJob(*args, **kwargs) - def submit(self, job): + async def submit(self, job): # Run from the absolute path f_stdout = open(job.stdout, 'w+') f_stderr = open(job.stderr, 'w+') @@ -61,7 +62,7 @@ def submit(self, job): # The new process starts also a new session (session leader), so that # we can later kill any other processes that this might spawn by just # killing this one. - proc = osext.run_command_async( + proc = await osext.run_command_asyncio_alone( os.path.abspath(job.script_filename), stdout=f_stdout, stderr=f_stderr, @@ -94,7 +95,27 @@ def filternodes(self, job, nodes): def _kill_all(self, job): '''Send SIGKILL to all the processes of the spawned job.''' try: - os.killpg(job.jobid, signal.SIGKILL) + # Get the process with psutil because we need to cancel the group + p = psutil.Process(job.jobid) + # Get the children of this group + job.children = p.children(recursive=True) + children = job.children + except psutil.NoSuchProcess: + try: + # Maybe the main process was already killed/terminated + # but the children were not + children = job.children + except AttributeError: + children = [] + + try: + for child in children: + if child.is_running(): + child.send_signal(signal.SIGKILL) + job._signal = signal.SIGKILL + else: + self.log(f'child pid {child.pid} already dead') + job.proc.send_signal(signal.SIGKILL) job._signal = signal.SIGKILL except (ProcessLookupError, PermissionError): # The process group may already be dead or assigned to a different @@ -108,9 +129,28 @@ def _kill_all(self, job): def _term_all(self, job): '''Send SIGTERM to all the processes of the spawned job.''' + + try: + p = psutil.Process(job.jobid) + # Get the chilldren of the process + job.children = p.children(recursive=True) + except psutil.NoSuchProcess: + job.children = [] + try: - os.killpg(job.jobid, signal.SIGTERM) + job.proc.send_signal(signal.SIGTERM) job._signal = signal.SIGTERM + # Here, we don't know if it was ignored or not + for child in job.children: + # try to kill the children + try: + child.send_signal(signal.SIGTERM) + except (ProcessLookupError, PermissionError, + psutil.NoSuchProcess): + # The process group may already be dead or assigned + # to a different group, so ignore this error + self.log(f'child pid {child.pid} already dead') + except (ProcessLookupError, PermissionError): # Job has finished already, close file handles self.log(f'pid {job.jobid} already dead') @@ -129,7 +169,7 @@ def cancel(self, job): self._term_all(job) job._cancel_time = time.time() - def wait(self, job): + async def wait(self, job): '''Wait for the spawned job to finish. As soon as the parent job process finishes, all of its spawned @@ -140,8 +180,8 @@ def wait(self, job): ''' while not self.finished(job): - self.poll(job) - time.sleep(self.WAIT_POLL_SECS) + await self.poll(job) + await asyncio.sleep(self.WAIT_POLL_SECS) def finished(self, job): '''Check if the spawned process has finished. 
@@ -155,36 +195,26 @@ def finished(self, job): return job.state in ['SUCCESS', 'FAILURE', 'TIMEOUT'] - def poll(self, *jobs): + async def poll(self, *jobs): for job in jobs: - self._poll_job(job) + await self._poll_job(job) - def _poll_job(self, job): + async def _poll_job(self, job): if job is None or job.jobid is None: return - try: - pid, status = os.waitpid(job.jobid, os.WNOHANG) - except OSError as e: - if e.errno == errno.ECHILD: - # No unwaited children - self.log('no more unwaited children') - return - else: - raise e - if job.cancel_time: # Job has been cancelled; give it a grace period and kill it self.log(f'Job {job.jobid} has been cancelled; ' f'giving it a grace period') t_rem = self.CANCEL_GRACE_PERIOD - (time.time() - job.cancel_time) if t_rem > 0: - time.sleep(t_rem) + await asyncio.sleep(t_rem) self._kill_all(job) return - if not pid: + if job.proc.returncode is None: # Job has not finished; check if we have reached a timeout t_elapsed = time.time() - job.submit_time if job.time_limit and t_elapsed > job.time_limit: @@ -201,9 +231,9 @@ def _poll_job(self, job): self._kill_all(job) # Retrieve the status of the job and return - if os.WIFEXITED(status): - job._exitcode = os.WEXITSTATUS(status) + if job.proc.returncode >= 0: + job._exitcode = job.proc.returncode job._state = 'FAILURE' if job.exitcode != 0 else 'SUCCESS' - elif os.WIFSIGNALED(status): + else: job._state = 'FAILURE' - job._signal = os.WTERMSIG(status) + job._signal = job.proc.returncode diff --git a/reframe/core/schedulers/lsf.py b/reframe/core/schedulers/lsf.py index 73b6593f3b..b40c2ecea6 100644 --- a/reframe/core/schedulers/lsf.py +++ b/reframe/core/schedulers/lsf.py @@ -19,7 +19,8 @@ from reframe.core.exceptions import JobSchedulerError from reframe.core.schedulers.pbs import PbsJobScheduler -_run_strict = functools.partial(osext.run_command, check=True) +# Asynchronous _run_strict +_run_strict = functools.partial(osext.run_command_asyncio, check=True) @register_scheduler('lsf') @@ -78,14 +79,14 @@ def emit_preamble(self, job): # Filter out empty statements before returning return list(filter(None, preamble)) - def submit(self, job): + async def submit(self, job): with open(job.script_filename, 'r') as fp: cmd_parts = ['bsub'] if self._sched_access_in_submit: cmd_parts += job.sched_access cmd = ' '.join(cmd_parts) - completed = _run_strict(cmd, stdin=fp) + completed = await _run_strict(cmd, stdin=fp) jobid_match = re.search(r'^Job <(?P\S+)> is submitted', completed.stdout) @@ -95,7 +96,7 @@ def submit(self, job): job._jobid = jobid_match.group('jobid') job._submit_time = time.time() - def poll(self, *jobs): + async def poll(self, *jobs): if jobs: # filter out non-jobs jobs = [job for job in jobs if job is not None] @@ -103,7 +104,7 @@ def poll(self, *jobs): if not jobs: return - completed = _run_strict( + completed = await _run_strict( 'bjobs -o "jobid: user:10 stat: queue:" -noheader ' f'{" ".join(job.jobid for job in jobs)}' ) diff --git a/reframe/core/schedulers/oar.py b/reframe/core/schedulers/oar.py index 06733bf600..5ab55a7478 100644 --- a/reframe/core/schedulers/oar.py +++ b/reframe/core/schedulers/oar.py @@ -52,7 +52,10 @@ def oar_state_pending(state): return False -_run_strict = functools.partial(osext.run_command, check=True) +# Asynchronous _run_strict +_run_strict = functools.partial(osext.run_command_asyncio, check=True) +# Synchronous _run_strict +_run_strict_s = functools.partial(osext.run_command, check=True) @register_scheduler('oar') @@ -104,7 +107,7 @@ def emit_preamble(self, job): return 
preamble - def submit(self, job): + async def submit(self, job): # OAR batch submission mode needs full path to the job script job_script_fullpath = os.path.join(job.workdir, job.script_filename) cmd_parts = ['oarsub'] @@ -114,7 +117,7 @@ def submit(self, job): # OAR needs -S to submit job in batch mode cmd_parts += ['-S', job_script_fullpath] cmd = ' '.join(cmd_parts) - completed = _run_strict(cmd, timeout=self._submit_timeout) + completed = await _run_strict(cmd, timeout=self._submit_timeout) jobid_match = re.search(r'.*OAR_JOB_ID=(?P\S+)', completed.stdout) if not jobid_match: @@ -125,10 +128,10 @@ def submit(self, job): job._submit_time = time.time() def cancel(self, job): - _run_strict(f'oardel {job.jobid}', timeout=self._submit_timeout) + _run_strict_s(f'oardel {job.jobid}', timeout=self._submit_timeout) job._cancelled = True - def poll(self, *jobs): + async def poll(self, *jobs): if jobs: # Filter out non-jobs jobs = [job for job in jobs if job is not None] @@ -137,7 +140,7 @@ def poll(self, *jobs): return for job in jobs: - completed = _run_strict( + completed = await _run_strict( f'oarstat -fj {job.jobid}' ) @@ -154,7 +157,8 @@ def poll(self, *jobs): # https://github.com/oar-team/oar/blob/37db5384c7827cca2d334e5248172bb700015434/sources/core/qfunctions/oarstat#L332 job_raw_info = completed.stdout jobid_match = re.search( - r'^(Job_Id|id):\s*(?P\S+)', completed.stdout, re.MULTILINE + r'^(Job_Id|id):\s*(?P\S+)', completed.stdout, + re.MULTILINE ) if jobid_match: jobid = jobid_match.group('jobid') diff --git a/reframe/core/schedulers/pbs.py b/reframe/core/schedulers/pbs.py index 86dbb6063d..826cc8c158 100644 --- a/reframe/core/schedulers/pbs.py +++ b/reframe/core/schedulers/pbs.py @@ -9,6 +9,7 @@ # - Initial version submitted by Rafael Escovar, ASML # +import asyncio import functools import os import itertools @@ -35,7 +36,10 @@ PBS_CANCEL_DELAY = 3 -_run_strict = functools.partial(osext.run_command, check=True) +# Asynchronous _run_strict +_run_strict = functools.partial(osext.run_command_asyncio, check=True) +# Synchronous _run_strict +_run_strict_s = functools.partial(osext.run_command, check=True) JOB_STATES = { @@ -146,7 +150,7 @@ def filternodes(self, job, nodes): raise NotImplementedError('pbs backend does not support ' 'node filtering') - def submit(self, job): + async def submit(self, job): cmd_parts = ['qsub'] if self._sched_access_in_submit: cmd_parts += job.sched_access @@ -155,7 +159,7 @@ def submit(self, job): # Slurm wrappers. 
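# NOTE (illustrative sketch, not part of this patch): the schedulers above now
# share one pattern -- `_run_strict` is an awaitable partial over the new
# `osext.run_command_asyncio`, so each submit() awaits the command and parses
# the job id out of the captured stdout. `submit_sketch`, `submit_cmd` and
# `jobid_patt` below are placeholder names, not ReFrame API.
import functools
import re

import reframe.utility.osext as osext
from reframe.core.exceptions import JobSchedulerError

_run_strict = functools.partial(osext.run_command_asyncio, check=True)


async def submit_sketch(job, submit_cmd, jobid_patt):
    # check=True makes a non-zero exit code raise SpawnedProcessError
    completed = await _run_strict(f'{submit_cmd} {job.script_filename}')

    # `jobid_patt` is assumed to define a named group called `jobid`
    match = re.search(jobid_patt, completed.stdout)
    if not match:
        raise JobSchedulerError('could not retrieve the job id '
                                'of the submitted job')

    return match.group('jobid')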
cmd_parts += ['-o', job.stdout, '-e', job.stderr, job.script_filename] cmd = ' '.join(cmd_parts) - completed = _run_strict(cmd, timeout=self._submit_timeout) + completed = await _run_strict(cmd, timeout=self._submit_timeout) jobid_match = re.search(r'^(?P\S+)', completed.stdout) if not jobid_match: raise JobSchedulerError('could not retrieve the job id ' @@ -164,18 +168,18 @@ def submit(self, job): job._jobid = jobid_match.group('jobid') job._submit_time = time.time() - def wait(self, job): + async def wait(self, job): intervals = itertools.cycle([1, 2, 3]) while not self.finished(job): - self.poll(job) - time.sleep(next(intervals)) + await self.poll(job) + await asyncio.sleep(next(intervals)) def cancel(self, job): time_from_submit = time.time() - job.submit_time if time_from_submit < PBS_CANCEL_DELAY: time.sleep(PBS_CANCEL_DELAY - time_from_submit) - _run_strict(f'qdel {job.jobid}', timeout=self._submit_timeout) + _run_strict_s(f'qdel {job.jobid}', timeout=self._submit_timeout) job._cancelled = True def finished(self, job): @@ -205,7 +209,7 @@ def _query_exit_code(self, job): return None - def poll(self, *jobs): + async def poll(self, *jobs): def output_ready(job): # We report a job as finished only when its stdout/stderr are # written back to the working directory @@ -220,7 +224,7 @@ def output_ready(job): if not jobs: return - completed = osext.run_command( + completed = await osext.run_command_asyncio( f'qstat -f {" ".join(job.jobid for job in jobs)}' ) diff --git a/reframe/core/schedulers/sge.py b/reframe/core/schedulers/sge.py index a6dc6b08ff..afc5d192b0 100644 --- a/reframe/core/schedulers/sge.py +++ b/reframe/core/schedulers/sge.py @@ -20,7 +20,10 @@ from reframe.core.schedulers.pbs import PbsJobScheduler from reframe.utility import seconds_to_hms -_run_strict = functools.partial(osext.run_command, check=True) +# Asynchronous _run_strict +_run_strict = functools.partial(osext.run_command_asyncio, check=True) +# Synchronous _run_strict +_run_strict_s = functools.partial(osext.run_command, check=True) @register_scheduler('sge') @@ -53,11 +56,11 @@ def emit_preamble(self, job): return preamble - def submit(self, job): + async def submit(self, job): # `-o` and `-e` options are only recognized in command line by the PBS, # SGE, and Slurm wrappers. 
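# NOTE (illustrative sketch, not part of this patch): the converted wait()
# above no longer blocks the interpreter in time.sleep(); it polls and then
# yields to the event loop with asyncio.sleep(), cycling over short back-off
# intervals so other test coroutines keep making progress. `wait_sketch` is a
# placeholder name for a stripped-down version of that loop.
import asyncio
import itertools


async def wait_sketch(scheduler, job):
    intervals = itertools.cycle([1, 2, 3])
    while not scheduler.finished(job):
        # poll() is a coroutine in this patch, so it must be awaited
        await scheduler.poll(job)
        await asyncio.sleep(next(intervals))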
cmd = f'qsub -o {job.stdout} -e {job.stderr} {job.script_filename}' - completed = _run_strict(cmd, timeout=self._submit_timeout) + completed = await _run_strict(cmd, timeout=self._submit_timeout) jobid_match = re.search(r'^Your job (?P\S+)', completed.stdout) if not jobid_match: raise JobSchedulerError('could not retrieve the job id ' @@ -66,7 +69,7 @@ def submit(self, job): job._jobid = jobid_match.group('jobid') job._submit_time = time.time() - def poll(self, *jobs): + async def poll(self, *jobs): if jobs: # Filter out non-jobs jobs = [job for job in jobs if job is not None] @@ -75,7 +78,7 @@ def poll(self, *jobs): return user = osext.osuser() - completed = osext.run_command(f'qstat -xml -u {user}') + completed = await osext.run_command_asyncio(f'qstat -xml -u {user}') if completed.returncode != 0: raise JobSchedulerError( f'qstat failed with exit code {completed.returncode} ' diff --git a/reframe/core/schedulers/slurm.py b/reframe/core/schedulers/slurm.py index 47d44c1dfc..5d7d130a80 100644 --- a/reframe/core/schedulers/slurm.py +++ b/reframe/core/schedulers/slurm.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause +import asyncio import functools import glob import itertools @@ -64,7 +65,10 @@ def slurm_state_pending(state): return False -_run_strict = functools.partial(osext.run_command, check=True) +# Asynchronous _run_strict +_run_strict = functools.partial(osext.run_command_asyncio, check=True) +# Synchronous _run_strict +_run_strict_s = functools.partial(osext.run_command, check=True) class _SlurmJob(sched.Job): @@ -255,7 +259,7 @@ def emit_preamble(self, job): # Filter out empty statements before returning return list(filter(None, preamble)) - def submit(self, job): + async def submit(self, job): cmd_parts = ['sbatch'] if self._sched_access_in_submit: cmd_parts += job.sched_access @@ -265,7 +269,8 @@ def submit(self, job): intervals = itertools.cycle([1, 2, 3]) while True: try: - completed = _run_strict(cmd, timeout=self._submit_timeout) + completed = await _run_strict(cmd, + timeout=self._submit_timeout) break except SpawnedProcessError as e: error_match = re.search( @@ -279,7 +284,7 @@ def submit(self, job): f'encountered a job submission error: ' f'{error_match.group(1)}: will resubmit after {t}s' ) - time.sleep(t) + await asyncio.sleep(t) jobid_match = re.search(r'Submitted batch job (?P\d+)', completed.stdout) @@ -293,7 +298,7 @@ def submit(self, job): def allnodes(self): try: - completed = _run_strict('scontrol -a show -o nodes') + completed = _run_strict_s('scontrol -a show -o nodes') except SpawnedProcessError as e: raise JobSchedulerError( 'could not retrieve node information') from e @@ -302,7 +307,7 @@ def allnodes(self): return _create_nodes(node_descriptions) def _get_default_partition(self): - completed = _run_strict('scontrol -a show -o partitions') + completed = _run_strict_s('scontrol -a show -o partitions') partition_match = re.search(r'PartitionName=(?P\S+)\s+' r'.*Default=YES.*', completed.stdout) if partition_match: @@ -310,8 +315,23 @@ def _get_default_partition(self): return None + def _get_actual_partition(self, options): + try: + completed = _run_strict( + ' '.join(['srun'] + options + ['--test-only', 'true']) + ) + partition_match = re.search(r'partition (?P\S+)\s+', + completed.stderr) + if partition_match: + return partition_match.group('partition') + + except SpawnedProcessError as e: + self.log('could not retrieve actual partition') + + return None + def _merge_files(self, job): - with osext.change_dir(job.workdir): + with 
osext.change_dir_global(job.workdir): out_glob = glob.glob(job.stdout + '_*') err_glob = glob.glob(job.stderr + '_*') self.log(f'merging job array output files: {", ".join(out_glob)}') @@ -336,6 +356,7 @@ def filternodes(self, job, nodes): option_parser.add_argument('-w', '--nodelist') option_parser.add_argument('-C', '--constraint') option_parser.add_argument('-x', '--exclude') + self.log(f'Filtering by Slurm options: {" ".join(options)}') parsed_args, _ = option_parser.parse_known_args(options) reservation = parsed_args.reservation partitions = parsed_args.partition @@ -345,42 +366,50 @@ def filternodes(self, job, nodes): if reservation: reservation = reservation.strip() nodes &= self._get_reservation_nodes(reservation) - self.log(f'[F] Filtering nodes by reservation {reservation}: ' - f'available nodes now: {len(nodes)}') + else: + nodes = {node for node in nodes if not node.in_state('RESERVED')} + + self.log(f'Filtering nodes by reservation={reservation}: ' + f'available nodes now: {len(nodes)}') if partitions: partitions = set(partitions.strip().split(',')) else: - default_partition = self._get_default_partition() + # Use a default partition if one is configured. Otherwise, + # fallback to the partition Slurm chooses for this set of options. + default_partition = ( + self._get_default_partition() or + self._get_actual_partition(options) + ) partitions = {default_partition} if default_partition else set() self.log( - f'[F] No partition specified; using {default_partition!r}' + f'No partition specified; using {default_partition!r}' ) nodes = {n for n in nodes if n.partitions >= partitions} - self.log(f'[F] Filtering nodes by partition(s) {partitions}: ' + self.log(f'Filtering nodes by partition(s) {partitions}: ' f'available nodes now: {len(nodes)}') if constraints: nodes = {n for n in nodes if n.satisfies(constraints)} - self.log(f'[F] Filtering nodes by constraint(s) {constraints}: ' + self.log(f'Filtering nodes by constraint(s) {constraints}: ' f'available nodes now: {len(nodes)}') if nodelist: nodelist = nodelist.strip() nodes &= self._get_nodes_by_name(nodelist) - self.log(f'[F] Filtering nodes by nodelist: {nodelist}: ' + self.log(f'Filtering nodes by nodelist: {nodelist}: ' f'available nodes now: {len(nodes)}') if exclude_nodes: exclude_nodes = exclude_nodes.strip() nodes -= self._get_nodes_by_name(exclude_nodes) - self.log(f'[F] Excluding node(s): {exclude_nodes}: ' + self.log(f'Excluding node(s): {exclude_nodes}: ' f'available nodes now: {len(nodes)}') return nodes def _get_reservation_nodes(self, reservation): - completed = _run_strict('scontrol -a show res %s' % reservation) + completed = _run_strict_s('scontrol -a show res %s' % reservation) node_match = re.search(r'(Nodes=\S+)', completed.stdout) if node_match: reservation_nodes = node_match[1] @@ -388,7 +417,7 @@ def _get_reservation_nodes(self, reservation): raise JobSchedulerError("could not extract the node names for " "reservation '%s'" % reservation) - completed = _run_strict('scontrol -a show -o %s' % reservation_nodes) + completed = _run_strict_s('scontrol -a show -o %s' % reservation_nodes) node_descriptions = completed.stdout.splitlines() return _create_nodes(node_descriptions) @@ -411,7 +440,7 @@ def _update_completion_time(self, job, timestamps): if ct: job._completion_time = max(ct) - def poll(self, *jobs): + async def poll(self, *jobs): '''Update the status of the jobs.''' if jobs: @@ -425,7 +454,7 @@ def poll(self, *jobs): t_start = time.strftime( '%F', time.localtime(min(job.submit_time for job in jobs)) ) - 
completed = _run_strict( + completed = await _run_strict( f'sacct -S {t_start} -P ' f'-j {",".join(job.jobid for job in jobs)} ' f'-o jobid,state,exitcode,end,nodelist' @@ -461,7 +490,7 @@ def poll(self, *jobs): job._state = ','.join(m.group('state') for m in jobarr_info) if not self._update_state_count % self.SACCT_SQUEUE_RATIO: - self._cancel_if_blocked(job) + await self._cancel_if_blocked(job) self._cancel_if_pending_too_long(job) if slurm_state_completed(job.state): @@ -482,17 +511,19 @@ def _cancel_if_pending_too_long(self, job): t_pending = time.time() - job.submit_time if t_pending >= job.max_pending_time: - self.log(f'maximum pending time for job exceeded; cancelling it') + self.log('maximum pending time for job exceeded; cancelling it') self.cancel(job) job._exception = JobError('maximum pending time exceeded', job.jobid) - def _cancel_if_blocked(self, job, reasons=None): + async def _cancel_if_blocked(self, job, reasons=None): if (job.is_cancelling or not slurm_state_pending(job.state)): return if not reasons: - completed = osext.run_command('squeue -h -j %s -o %%r' % job.jobid) + completed = await osext.run_command_asyncio( + 'squeue -h -j %s -o %%r' % job.jobid + ) reasons = completed.stdout.splitlines() if not reasons: # Can't retrieve job's state. Perhaps it has finished already @@ -545,7 +576,7 @@ def _do_cancel_if_blocked(self, job, reason_descr): job._exception = JobBlockedError(reason_msg, job.jobid) - def wait(self, job): + async def wait(self, job): # Quickly return in case we have finished already if self.finished(job): if job.is_array: @@ -555,14 +586,14 @@ def wait(self, job): intervals = itertools.cycle([1, 2, 3]) while not self.finished(job): - self.poll(job) - time.sleep(next(intervals)) + await self.poll(job) + await asyncio.sleep(next(intervals)) if job.is_array: self._merge_files(job) def cancel(self, job): - _run_strict(f'scancel {job.jobid}', timeout=self._submit_timeout) + _run_strict_s(f'scancel {job.jobid}', timeout=self._submit_timeout) job._is_cancelling = True def finished(self, job): @@ -578,7 +609,7 @@ class SqueueJobScheduler(SlurmJobScheduler): SQUEUE_DELAY = 2 - def poll(self, *jobs): + async def poll(self, *jobs): if jobs: # Filter out non-jobs jobs = [job for job in jobs if job is not None] @@ -590,12 +621,12 @@ def poll(self, *jobs): time_from_last_submit = time.time() - m rem_wait = self.SQUEUE_DELAY - time_from_last_submit if rem_wait > 0: - time.sleep(rem_wait) + await asyncio.sleep(rem_wait) # We don't run the command with check=True, because if the job has # finished already, squeue might return an error about an invalid # job id. 
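# NOTE (illustrative sketch, not part of this patch): the squeue-based poller
# above first waits out a short grace period after the latest submission,
# because squeue may not yet know about a freshly submitted job; that wait is
# now awaited instead of blocking the interpreter. `throttle_before_squeue`
# is a placeholder name for a simplified version of that logic.
import asyncio
import time

SQUEUE_DELAY = 2    # grace period (in seconds) before querying squeue


async def throttle_before_squeue(jobs):
    last_submit = max(job.submit_time for job in jobs)
    rem_wait = SQUEUE_DELAY - (time.time() - last_submit)
    if rem_wait > 0:
        await asyncio.sleep(rem_wait)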
- completed = osext.run_command( + completed = await osext.run_command_asyncio( f'squeue -h -j {",".join(job.jobid for job in jobs)} ' f'-o "%%i|%%T|%%N|%%r"' ) @@ -626,7 +657,7 @@ def poll(self, *jobs): # Use ',' to join nodes to be consistent with Slurm syntax job._nodespec = ','.join(m.group('nodespec') for m in job_match) - self._cancel_if_blocked( + await self._cancel_if_blocked( job, [s.group('reason') for s in state_match] ) self._cancel_if_pending_too_long(job) @@ -676,24 +707,41 @@ def in_statex(self, state): return self._states == set(state.upper().split('+')) def is_avail(self): - return any(self.in_statex(s) - for s in ('ALLOCATED', 'COMPLETING', 'IDLE')) + available_states = { + 'ALLOCATED', + 'COMPLETING', + 'IDLE', + 'RESERVED', + } + return self._states <= available_states def is_down(self): return not self.is_avail() def satisfies(self, slurm_constraint): + def _replacemany(s, replacements): + for src, dst in replacements: + s = s.replace(src, dst) + + return s + # Convert the Slurm constraint to a Python expression and evaluate it, # but restrict our syntax to accept only AND or OR constraints and - # their combinations - if not re.match(r'^[\w\d\(\)\|\&]*$', slurm_constraint): + # their combinations; to properly treat `-` in constraints we need to + # convert them to valid Python identifiers before evaluating the + # constraint. + if not re.match(r'^[\-\w\d\(\)\|\&]*$', slurm_constraint): return False - names = {grp[0] - for grp in re.finditer(r'(\w(\w|\d)*)', slurm_constraint)} - expr = slurm_constraint.replace('|', ' or ').replace('&', ' and ') - vars = {n: True for n in self.active_features} - vars.update({n: False for n in names - self.active_features}) + names = { + grp[0] for grp in re.finditer(r'[\-\w][\-\w\d]*', slurm_constraint) + } + expr = _replacemany(slurm_constraint, + [('-', '_'), ('|', ' or '), ('&', ' and ')]) + vars = {n.replace('-', '_'): True for n in self.active_features} + vars.update({ + n.replace('-', '_'): False for n in names - self.active_features + }) try: return eval(expr, {}, vars) except BaseException: diff --git a/reframe/frontend/autodetect.py b/reframe/frontend/autodetect.py index 893e8fe1ec..c43efd9e16 100644 --- a/reframe/frontend/autodetect.py +++ b/reframe/frontend/autodetect.py @@ -18,6 +18,7 @@ from reframe.core.schedulers import Job from reframe.core.systems import DeviceInfo, ProcessorInfo from reframe.utility.cpuinfo import cpuinfo +from reframe.frontend.executors import asyncio_run # This is meant to be used by the unit tests @@ -106,6 +107,7 @@ def _load_info(filename, schema=None): ) return {} except jsonschema.ValidationError as e: + getlogger().debug(str(e)) raise ConfigError( f'could not validate meta-config file {filename!r}' ) from e @@ -135,7 +137,7 @@ def _is_part_local(part): part.launcher_type.registered_name == 'local') -def _remote_detect(part): +def _remote_detect(part, cli_job_options): use_login_shell = runtime.runtime().get_option('general/0/use_login_shell') def _emit_script_for_source(job, env): @@ -171,7 +173,7 @@ def _emit_custom_script(job, env, commands): job = Job.create(part.scheduler, part.launcher_type(), name='rfm-detect-job', - sched_access=part.access) + sched_access=part.access + cli_job_options) custom_command = runtime.runtime().get_option( 'general/0/remote_install' ) @@ -184,8 +186,8 @@ def _emit_custom_script(job, env, commands): getlogger().debug('submitting detection script') _log_contents(job.script_filename) - job.submit() - job.wait() + asyncio_run(job.submit()) + asyncio_run(job.wait()) 
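# NOTE (illustrative sketch, not part of this patch): submit() and wait() are
# coroutines now, so synchronous call sites such as the remote topology
# detection above drive them through a helper that owns an event loop. The
# real `asyncio_run` in this patch also cancels stale tasks and re-attaches
# the child watcher; `run_coro_sketch` below is a reduced, hypothetical
# version of the same idea.
import asyncio


def run_coro_sketch(coro):
    try:
        loop = asyncio.get_event_loop()
        if loop.is_closed():
            raise RuntimeError('event loop is closed')
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    return loop.run_until_complete(coro)

# e.g. run_coro_sketch(job.submit()); run_coro_sketch(job.wait())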
getlogger().debug('job finished') _log_contents(job.stdout) _log_contents(job.stderr) @@ -200,10 +202,11 @@ def _emit_custom_script(job, env, commands): return topo_info -def detect_topology(): +def detect_topology(cli_job_options=None): + cli_job_options = [] if cli_job_options is None else cli_job_options rt = runtime.runtime() detect_remote_systems = rt.get_option('general/0/remote_detect') - topo_prefix = os.path.join(os.getenv('HOME'), '.reframe/topology') + topo_prefix = osext.expandvars(rt.get_option('general/0/topology_prefix')) for part in rt.system.partitions: getlogger().debug(f'detecting topology info for {part.fullname}') found_procinfo = False @@ -279,7 +282,9 @@ def detect_topology(): _save_info(topo_file, part.processor.info) elif detect_remote_systems: with runtime.temp_environment(modules=modules, env_vars=vars): - part._processor = ProcessorInfo(_remote_detect(part)) + part._processor = ProcessorInfo( + _remote_detect(part, cli_job_options) + ) if part.processor.info: _save_info(topo_file, part.processor.info) diff --git a/reframe/frontend/cli.py b/reframe/frontend/cli.py index 92d9a1e01a..8bc4cf06e1 100644 --- a/reframe/frontend/cli.py +++ b/reframe/frontend/cli.py @@ -32,7 +32,7 @@ getallnodes, repeat_tests, parameterize_tests) from reframe.frontend.executors.policies import (SerialExecutionPolicy, - AsynchronousExecutionPolicy) + AsyncioExecutionPolicy) from reframe.frontend.executors import Runner, generate_testcases from reframe.frontend.loader import RegressionCheckLoader from reframe.frontend.printer import PrettyPrinter @@ -253,6 +253,8 @@ def validate_storage_options(namespace, cmd_options): @logging.time_function_noexit def main(): + # Setup the working dir + runtime.set_working_dir() # Setup command line options argparser = argparse.ArgumentParser() action_options = argparser.add_mutually_exclusive_group(required=True) @@ -628,7 +630,7 @@ def main(): configvar='general/perf_report_spec', envvar='RFM_PERF_REPORT_SPEC', help=('Print a report for performance tests ' - '(default: "now:now/last:+job_nodelist/+result")') + '(default: "now-1d:now/last:+job_nodelist/+result")') ) reporting_options.add_argument( '--session-extras', action='append', metavar='KV_DATA', @@ -655,7 +657,7 @@ def main(): envvar='RFM_SYSTEM' ) misc_options.add_argument( - '--table-format', choices=['csv', 'plain', 'outline', 'grid'], + '--table-format', choices=['csv', 'pretty', 'plain'], help='Table formatting', envvar='RFM_TABLE_FORMAT', configvar='general/table_format' ) @@ -979,8 +981,7 @@ def restrict_logging(): '--describe-stored-testcases', '--list-stored-sessions', '--list-stored-testcases', - '--performance-compare', - '--performance-report']): + '--performance-compare']): sys.exit(1) rt = runtime.runtime() @@ -1042,7 +1043,8 @@ def restrict_logging(): with exit_gracefully_on_error('failed to retrieve test case data', printer): printer.info(jsonext.dumps(reporting.testcase_info( - options.describe_stored_testcases, namepatt + options.describe_stored_testcases, + namepatt, options.filter_expr ), indent=2)) sys.exit(0) @@ -1110,7 +1112,20 @@ def restrict_logging(): sys.exit(0) - autodetect.detect_topology() + # Need to parse the cli options before autodetection + parsed_job_options = [] + for opt in options.job_options: + opt_split = opt.split('=', maxsplit=1) + optstr = opt_split[0] + valstr = opt_split[1] if len(opt_split) > 1 else '' + if opt.startswith('-') or opt.startswith('#'): + parsed_job_options.append(opt) + elif len(optstr) == 1: + parsed_job_options.append(f'-{optstr} 
{valstr}') + else: + parsed_job_options.append(f'--{optstr}={valstr}') + + autodetect.detect_topology(parsed_job_options) printer.debug(format_env(options.env_vars)) # Setup the check loader @@ -1225,19 +1240,6 @@ def print_infoline(param, value): try: logging.getprofiler().enter_region('test processing') - # Need to parse the cli options before loading the tests - parsed_job_options = [] - for opt in options.job_options: - opt_split = opt.split('=', maxsplit=1) - optstr = opt_split[0] - valstr = opt_split[1] if len(opt_split) > 1 else '' - if opt.startswith('-') or opt.startswith('#'): - parsed_job_options.append(opt) - elif len(optstr) == 1: - parsed_job_options.append(f'-{optstr} {valstr}') - else: - parsed_job_options.append(f'--{optstr}={valstr}') - # Locate and load checks; `force=True` is not needed for normal # invocations from the command line and has practically no effect, but # it is needed to better emulate the behavior of running reframe's CLI @@ -1561,7 +1563,7 @@ def module_unuse(*paths): if options.exec_policy == 'serial': exec_policy = SerialExecutionPolicy() elif options.exec_policy == 'async': - exec_policy = AsynchronousExecutionPolicy() + exec_policy = AsyncioExecutionPolicy() else: # This should not happen, since choices are handled by # argparser @@ -1638,9 +1640,12 @@ def module_unuse(*paths): if options.max_retries and runner.stats.failed(run=0): printer.retry_report(report) - # Print a failure report if we had failures in the last run + # Print a failure report in case of failures. + # If `--duration` or `--reruns` is used then take into account + # all runs, else (i.e., `--max-retries`) only the last run. success = True - if runner.stats.failed(): + runid = None if options.duration or options.reruns else -1 + if runner.stats.failed(run=runid): success = False printer.failure_report( report, @@ -1655,9 +1660,12 @@ def module_unuse(*paths): if (options.performance_report and not options.dry_run and not report.is_empty()): try: - data = reporting.performance_compare( - rt.get_option('general/0/perf_report_spec'), report - ) + if rt.get_option('storage/0/enable'): + data = reporting.performance_compare( + rt.get_option('general/0/perf_report_spec'), report + ) + else: + data = report.report_data() except Exception as err: printer.warning( f'failed to generate performance report: {err}' @@ -1699,7 +1707,8 @@ def module_unuse(*paths): ) # Store the generated report for analytics - if not report.is_empty() and not options.dry_run: + if (rt.get_option('storage/0/enable') and + not report.is_empty() and not options.dry_run): try: sess_uuid = report.store() except Exception as e: @@ -1730,6 +1739,12 @@ def module_unuse(*paths): sys.exit(1) sys.exit(0) + except errors.RunSessionTimeout as err: + printer.warning(f'run session stopped: {err}') + if not success: + sys.exit(1) + else: + sys.exit(0) except (Exception, KeyboardInterrupt, errors.ReframeFatalError): exc_info = sys.exc_info() tb = ''.join(traceback.format_exception(*exc_info)) diff --git a/reframe/frontend/executors/__init__.py b/reframe/frontend/executors/__init__.py index 0714b63b92..62df76e5b5 100644 --- a/reframe/frontend/executors/__init__.py +++ b/reframe/frontend/executors/__init__.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause import abc +import asyncio import contextlib import copy import itertools @@ -124,6 +125,10 @@ def __repr__(self): e = self.environ.name if self.environ else None return f'({c!r}, {p!r}, {e!r})' + def __str__(self): + check, partition, environ = self + return f'{check.name} 
@{partition.fullname}+{environ.name}' + def prepare(self): '''Prepare test case for sending down the test pipeline''' if self._is_ready: @@ -389,6 +394,48 @@ def __exit__(this, exc_type, exc_value, traceback): self.fail() raise TaskExit from e + async def _safe_call_asyncio(self, fn, *args, **kwargs): + class update_timestamps: + '''Context manager to set the start and finish timestamps.''' + + # We use `this` to refer to the update_timestamps object, because + # we don't want to masquerade the self argument of our containing + # function + def __enter__(this): + if fn.__name__ not in ('poll', + 'run_complete', + 'compile_complete'): + stage = self._current_stage + self._timestamps[f'{stage}_start'] = time.time() + + def __exit__(this, exc_type, exc_value, traceback): + stage = self._current_stage + self._timestamps[f'{stage}_finish'] = time.time() + self._timestamps['pipeline_end'] = time.time() + + if fn.__name__ not in ('poll', 'run_complete', 'compile_complete'): + self._current_stage = fn.__name__ + + try: + with logging.logging_context(self.check) as logger: + logger.debug(f'Entering stage: {self._current_stage}') + with update_timestamps(): + # Pick the configuration of the current partition + with runtime.temp_config(self.testcase.partition.fullname): + return await fn(*args, **kwargs) + except SkipTestError as e: + if not self.succeeded: + # Only skip a test if it hasn't finished yet; + # This practically ignores skipping during the cleanup phase + self.skip() + raise TaskExit from e + except ABORT_REASONS: + self.fail() + raise + except BaseException as e: + self.fail() + raise TaskExit from e + def _dry_run_call(self, fn, *args, **kwargs): '''Call check's fn method in dry-run mode.''' @@ -416,17 +463,18 @@ def setup(self, *args, **kwargs): self._notify_listeners('on_task_setup') @logging.time_function - def compile(self): - self._safe_call(self.check.compile) + async def compile(self): + await self._safe_call_asyncio(self.check.compile) self._notify_listeners('on_task_compile') @logging.time_function - def compile_wait(self): - self._safe_call(self.check.compile_wait) + async def compile_wait(self): + await self._safe_call_asyncio(self.check.compile_wait) @logging.time_function - def run(self): - self._safe_call(self.check.run) + async def run(self): + # QUESTION: should I change the order here? 
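# NOTE (illustrative sketch, not part of this patch): the awaited pipeline
# stages above all go through `_safe_call_asyncio`, which records per-stage
# timestamps and converts unexpected errors into TaskExit so the policy can
# retire the task cleanly. `safe_stage_sketch` is a placeholder name for a
# heavily reduced version of that wrapper; `task._timestamps` mirrors the
# private dict used by RegressionTask in this patch.
import time

from reframe.core.exceptions import TaskExit


async def safe_stage_sketch(task, stage_coro_fn, *args, **kwargs):
    stage = stage_coro_fn.__name__
    task._timestamps[f'{stage}_start'] = time.time()
    try:
        return await stage_coro_fn(*args, **kwargs)
    except BaseException as exc:
        task.fail()
        raise TaskExit from exc
    finally:
        task._timestamps[f'{stage}_finish'] = time.time()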
+ await self._safe_call_asyncio(self.check.run) self._notify_listeners('on_task_run') @logging.time_function @@ -447,8 +495,8 @@ def compile_complete(self): return done @logging.time_function - def run_wait(self): - self._safe_call(self.check.run_wait) + async def run_wait(self): + await self._safe_call_asyncio(self.check.run_wait) self.zombie = False @logging.time_function @@ -484,6 +532,8 @@ def cleanup(self, *args, **kwargs): self._safe_call(self.check.cleanup, *args, **kwargs) def fail(self, exc_info=None, callback='on_task_failure'): + if self._aborted: + return self._failed_stage = self._current_stage self._exc_info = exc_info or sys.exc_info() self._notify_listeners(callback) @@ -503,7 +553,6 @@ def abort(self, cause=None): logging.getlogger().debug2(f'Aborting test case: {self.testcase!r}') exc = AbortTaskError() exc.__cause__ = cause - self._aborted = True try: if not self.zombie and self.check.job: self.check.job.cancel() @@ -513,6 +562,7 @@ def abort(self, cause=None): self.fail() else: self.fail((type(exc), exc, None), 'on_task_abort') + self._aborted = True def info(self): '''Return an info string about this task.''' @@ -700,10 +750,8 @@ def print_separator(check, prefix): 'start processing checks') self._policy.enter() self._printer.reset_progress(len(testcases)) - for t in testcases: - self._policy.runcase(t) - self._policy.exit() + self._policy.execute(testcases) self._printer.separator('short single line', 'all spawned checks have finished\n') @@ -748,9 +796,51 @@ def timeout_expired(self): def enter(self): self._num_failed_tasks = 0 - def exit(self): + def _exit(self): pass @abc.abstractmethod - def runcase(self, case): + def _runcase(self, case): '''Run a test case.''' + + def execute(self, testcases): + '''Execute the policy for a given set of testcases.''' + # Moved here the execution + for t in testcases: + self._runcase(t) + + self._exit() + + +def asyncio_run(coro): + from reframe.frontend.executors.policies import all_tasks + try: + loop = asyncio.get_event_loop() + for task in all_tasks(loop): + if isinstance(task, asyncio.tasks.Task): + try: + task.cancel() + except RuntimeError: + pass + if loop.is_closed(): + loop = asyncio.new_event_loop() + watcher = asyncio.get_child_watcher() + if isinstance(watcher, asyncio.SafeChildWatcher): + # Detach the watcher from the current loop to avoid issues + watcher.close() + watcher.attach_loop(None) + asyncio.set_event_loop(loop) + if isinstance(watcher, asyncio.SafeChildWatcher): + # Reattach the watcher to the new loop + watcher.attach_loop(loop) + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + loop.run_until_complete(coro) + except (Exception, KeyboardInterrupt): + for task in all_tasks(loop): + if isinstance(task, asyncio.tasks.Task): + task.cancel() + loop.close() diff --git a/reframe/frontend/executors/policies.py b/reframe/frontend/executors/policies.py index b75588c5c3..86a2670bcf 100644 --- a/reframe/frontend/executors/policies.py +++ b/reframe/frontend/executors/policies.py @@ -3,6 +3,7 @@ # # SPDX-License-Identifier: BSD-3-Clause +import asyncio import contextlib import math import sys @@ -14,10 +15,10 @@ RunSessionTimeout, SkipTestError, TaskDependencyError, - TaskExit) + TaskExit, + KeyboardError, + AbortTaskError) from reframe.core.logging import getlogger, level_from_str -from reframe.core.pipeline import (CompileOnlyRegressionTest, - RunOnlyRegressionTest) from reframe.frontend.executors import (ExecutionPolicy, RegressionTask, TaskEventListener, 
ABORT_REASONS) @@ -63,11 +64,12 @@ def __init__(self): self._num_polls = 0 self._sleep_duration = None self._t_init = None + self._jobs_pool = [] def reset_snooze_time(self): self._sleep_duration = self.SLEEP_MIN - def snooze(self): + async def snooze(self): if self._num_polls == 0: self._t_init = time.time() @@ -78,11 +80,35 @@ def snooze(self): f'Poll rate control: sleeping for {self._sleep_duration}s ' f'(current poll rate: {poll_rate} polls/s)' ) - time.sleep(self._sleep_duration) + await asyncio.sleep(self._sleep_duration) self._sleep_duration = min( self._sleep_duration*self.SLEEP_INC_RATE, self.SLEEP_MAX ) + def is_time_to_poll(self): + # We check here if it's time to poll + if self._num_polls == 0: + self._t_init = time.time() + + t_elapsed = time.time() - self._t_init + + self._num_polls += 1 + if t_elapsed >= self._sleep_duration: + return True + else: + return False + + def reset_time_to_poll(self): + self._t_init = time.time() + + +global _poll_controller +_poll_controller = _PollController() + + +def getpollcontroller(): + return _poll_controller + class SerialExecutionPolicy(ExecutionPolicy, TaskEventListener): def __init__(self): @@ -97,8 +123,8 @@ def __init__(self): self._retired_tasks = [] self.task_listeners.append(self) - def runcase(self, case): - super().runcase(case) + async def _runcase(self, case): + super()._runcase(case) check, partition, _ = case task = RegressionTask(case, self.task_listeners) if check.is_dry_run(): @@ -130,9 +156,9 @@ def runcase(self, case): task.testcase.environ, sched_flex_alloc_nodes=self.sched_flex_alloc_nodes, sched_options=self.sched_options) - task.compile() - task.compile_wait() - task.run() + await task.compile() + await task.compile_wait() + await task.run() # Pick the right scheduler if task.check.local: @@ -143,14 +169,13 @@ def runcase(self, case): self._pollctl.reset_snooze_time() while True: if not self.dry_run_mode: - sched.poll(task.check.job) - + await sched.poll(task.check.job) if task.run_complete(): break - self._pollctl.snooze() + await self._pollctl.snooze() - task.run_wait() + await task.run_wait() if not self.skip_sanity_check: task.sanity() @@ -163,7 +188,10 @@ def runcase(self, case): return except ABORT_REASONS as e: task.abort(e) - raise + if type(e) is KeyboardInterrupt: + raise KeyboardError + else: + raise e except BaseException: task.fail(sys.exc_info()) @@ -242,327 +270,238 @@ def on_task_success(self, task): if self.timeout_expired(): raise RunSessionTimeout('maximum session duration exceeded') - def exit(self): + def execute(self, testcases): + '''Execute the policy for a given set of testcases.''' + # Moved here the execution + try: + loop = asyncio.get_event_loop() + for task in all_tasks(loop): + if isinstance(task, asyncio.tasks.Task): + try: + task.cancel() + except RuntimeError: + pass + if loop.is_closed(): + loop = asyncio.new_event_loop() + watcher = asyncio.get_child_watcher() + if isinstance(watcher, asyncio.SafeChildWatcher): + # Detach the watcher from the current loop to avoid issues + watcher.close() + watcher.attach_loop(None) + asyncio.set_event_loop(loop) + if isinstance(watcher, asyncio.SafeChildWatcher): + # Reattach the watcher to the new loop + watcher.attach_loop(loop) + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + for case in testcases: + try: + loop.run_until_complete(self._runcase(case)) + except (Exception, KeyboardInterrupt) as e: + if type(e) in ABORT_REASONS or isinstance(e, KeyboardError): + for task in all_tasks(loop): + if 
isinstance(task, asyncio.tasks.Task): + task.cancel() + loop.close() + raise e + else: + getlogger().info(f"Execution stopped due to an error: {e}") + break + loop.close() + self._exit() + + def _exit(self): # Clean up all remaining tasks _cleanup_all(self._retired_tasks, not self.keep_stage_files) -class AsynchronousExecutionPolicy(ExecutionPolicy, TaskEventListener): +class AsyncioExecutionPolicy(ExecutionPolicy, TaskEventListener): '''The asynchronous execution policy.''' def __init__(self): super().__init__() - self._pollctl = _PollController() + self._pollctl = getpollcontroller() + self._pollctl.reset_snooze_time() + self._current_tasks = util.OrderedSet() # Index tasks by test cases self._task_index = {} - # A set of all the current tasks. We use an ordered set here, because - # we want to preserve the order of the tasks. - self._current_tasks = util.OrderedSet() - - # Quick look up for the partition schedulers including the - # `_rfm_local` pseudo-partition - self._schedulers = { - '_rfm_local': self.local_scheduler - } - # Tasks per partition self._partition_tasks = { '_rfm_local': util.OrderedSet() } - # Retired tasks that need to be cleaned up - self._retired_tasks = [] - # Job limit per partition self._max_jobs = { '_rfm_local': rt.runtime().get_option('systems/0/max_local_jobs') } - self._pipeline_statistics = rt.runtime().get_option( - 'systems/0/dump_pipeline_progress' - ) - self.task_listeners.append(self) - - def _init_pipeline_progress(self, num_tasks): - self._pipeline_progress = { - 'startup': [(num_tasks, 0)], - 'ready_compile': [(0, 0)], - 'compiling': [(0, 0)], - 'ready_run': [(0, 0)], - 'running': [(0, 0)], - 'completing': [(0, 0)], - 'retired': [(0, 0)], - 'completed': [(0, 0)], - 'fail': [(0, 0)], - 'skip': [(0, 0)] - } - self._pipeline_step = 0 - self._t_pipeline_start = time.time() - - def _update_pipeline_progress(self, old_state, new_state, num_tasks=1): - timestamp = time.time() - self._t_pipeline_start - for state in self._pipeline_progress: - count = self._pipeline_progress[state][self._pipeline_step][0] - if old_state != new_state: - if state == old_state: - count -= num_tasks - elif state == new_state: - count += num_tasks - - self._pipeline_progress[state].append((count, timestamp)) - - self._pipeline_step += 1 - - def _dump_pipeline_progress(self, filename): - import reframe.utility.jsonext as jsonext - with open(filename, 'w') as fp: - jsonext.dump(self._pipeline_progress, fp, indent=2) + # Tasks that have finished, but have not performed their cleanup phase + self._retired_tasks = [] + self.task_listeners.append(self) - def runcase(self, case): - super().runcase(case) - check, partition, environ = case - self._schedulers[partition.fullname] = partition.scheduler + async def _runcase(self, case, task): + # I added the task here as an argument because, + # I wanted to initialize it + # outside, when I gather the tasks. + # If I gather the tasks and then I do asyncio + # if one of them fails the others are not iformed, + # I had to code that manually. There is a way to make everything + # stop if an exepction is raised but I didn't know how to treat + # that raise Exception nicelly because I wouldn't be able + # to abort the tasks which the execution has not yet started, + # I needed to do abortall on all the tests, not only the ones + # which were initiated by the execution. 
Exit gracefully + # the execuion loop aborting all the tasks + super()._runcase(case) + check, partition, _ = case + # task = RegressionTask(case, self.task_listeners) + if check.is_dry_run(): + self.printer.status('DRY', task.info()) + else: + self.printer.status('RUN', task.info()) - # Set partition-based counters, if not set already self._partition_tasks.setdefault(partition.fullname, util.OrderedSet()) self._max_jobs.setdefault(partition.fullname, partition.max_jobs) - task = RegressionTask(case, self.task_listeners) self._task_index[case] = task - self.stats.add_task(task) - getlogger().debug2( - f'Added {check.name} on {partition.fullname} ' - f'using {environ.name}' - ) - self._current_tasks.add(task) - - def exit(self): - if self._pipeline_statistics: - self._init_pipeline_progress(len(self._current_tasks)) - - self._pollctl.reset_snooze_time() - while self._current_tasks: - try: - self._poll_tasks() - num_running = sum( - 1 if t.state in ('running', 'compiling') else 0 - for t in self._current_tasks - ) - timeout = rt.runtime().get_option( - 'general/0/pipeline_timeout' - ) - - self._advance_all(self._current_tasks, timeout) - if self._pipeline_statistics: - num_retired = len(self._retired_tasks) - - _cleanup_all(self._retired_tasks, not self.keep_stage_files) - if self._pipeline_statistics: - num_retired_actual = num_retired - len(self._retired_tasks) - - # Some tests might not be cleaned up because they are - # waiting for dependencies or because their dependencies - # have failed. - self._update_pipeline_progress( - 'retired', 'completed', num_retired_actual - ) - - if self.timeout_expired(): - raise RunSessionTimeout( - 'maximum session duration exceeded' - ) - - if num_running: - self._pollctl.snooze() - except ABORT_REASONS as e: - self._abortall(e) - raise - - if self._pipeline_statistics: - self._dump_pipeline_progress('pipeline-progress.json') - - def _poll_tasks(self): - if self.dry_run_mode: - return - - for partname, sched in self._schedulers.items(): - jobs = [] - for t in self._partition_tasks[partname]: - if t.state == 'compiling': - jobs.append(t.check.build_job) - elif t.state == 'running': - jobs.append(t.check.job) - - sched.poll(*jobs) - - def _exec_stage(self, task, stage_methods): - '''Execute a series of pipeline stages. - - Return True on success, False otherwise. - ''' - try: - for stage in stage_methods: - stage() - except TaskExit: - self._current_tasks.remove(task) - if task.check.current_partition: - partname = task.check.current_partition.fullname - else: - partname = None - - # Remove tasks from the partition tasks if there - with contextlib.suppress(KeyError): - self._partition_tasks['_rfm_local'].remove(task) - if partname: - self._partition_tasks[partname].remove(task) - - return False - else: - return True - - def _advance_all(self, tasks, timeout=None): - t_init = time.time() - num_progressed = 0 - - getlogger().debug2(f'Current tests: {len(tasks)}') - - # We take a snapshot of the tasks to advance by doing a shallow copy, - # since the tasks may removed by the individual advance functions. 
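# NOTE (illustrative sketch, not part of this patch): the `_advance_*` state
# machine removed in this hunk is replaced by one coroutine per test case;
# those coroutines share a single poll controller, and only the one that finds
# the poll interval expired issues a batched poll over every job registered in
# the controller's pool. `poll_once_if_due` is a placeholder name illustrating
# that gate.
import asyncio


async def poll_once_if_due(scheduler, pollctl):
    # `pollctl` stands for the object returned by getpollcontroller()
    if pollctl.is_time_to_poll():
        pollctl.reset_time_to_poll()
        await scheduler.poll(*pollctl._jobs_pool)
    else:
        # Not due yet: just yield control back to the event loop
        await asyncio.sleep(0)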
- for t in list(tasks): - old_state = t.state - bump_state = getattr(self, f'_advance_{t.state}') - num_progressed += bump_state(t) - new_state = t.state - - t_elapsed = time.time() - t_init - if timeout and t_elapsed > timeout and num_progressed: - break + # Do not run test if any of its dependencies has failed + # NOTE: Restored dependencies are not in the task_index + if any(self._task_index[c].failed + for c in case.deps if c in self._task_index): + raise TaskDependencyError('dependencies failed') - if self._pipeline_statistics: - self._update_pipeline_progress(old_state, new_state, 1) + if any(self._task_index[c].skipped + for c in case.deps if c in self._task_index): - getlogger().debug2(f'Bumped {num_progressed} test(s)') + # We raise the SkipTestError here and catch it immediately in + # order for `skip()` to get the correct exception context. + try: + raise SkipTestError('skipped due to skipped dependencies') + except SkipTestError as e: + task.skip() + raise TaskExit from e - def _advance_startup(self, task): - if self.deps_skipped(task): - try: - raise SkipTestError('skipped due to skipped dependencies') - except SkipTestError as e: - task.skip() - self._current_tasks.remove(task) - return 1 - elif self.deps_succeeded(task): - try: + deps_status = await self.check_deps(task) + if deps_status == "skipped": + try: + raise SkipTestError('skipped due to skipped dependencies') + except SkipTestError: + task.skip() + self._current_tasks.remove(task) + return 1 + elif deps_status == "succeded": if task.check.is_dry_run(): self.printer.status('DRY', task.info()) else: self.printer.status('RUN', task.info()) - - task.setup(task.testcase.partition, - task.testcase.environ, - sched_flex_alloc_nodes=self.sched_flex_alloc_nodes, - sched_options=self.sched_options) - except TaskExit: + elif deps_status == "failed": + exc = TaskDependencyError('dependencies failed') + task.fail((type(exc), exc, None)) self._current_tasks.remove(task) return 1 - if isinstance(task.check, RunOnlyRegressionTest): - # All tests should execute all the pipeline stages, even if - # they are no-ops - self._exec_stage(task, [task.compile, - task.compile_complete, - task.compile_wait]) - - return 1 - elif self.deps_failed(task): - exc = TaskDependencyError('dependencies failed') - task.fail((type(exc), exc, None)) - self._current_tasks.remove(task) - return 1 - else: - # Not all dependencies have finished yet - getlogger().debug2(f'{task.info()} waiting for dependencies') - return 0 + task.setup(task.testcase.partition, + task.testcase.environ, + sched_flex_alloc_nodes=self.sched_flex_alloc_nodes, + sched_options=self.sched_options) + partname = _get_partition_name(task, phase='build') + max_jobs = self._max_jobs[partname] + while len(self._partition_tasks[partname])+1 > max_jobs: + await asyncio.sleep(2) + self._partition_tasks[partname].add(task) + await task.compile() + await task.compile_wait() + self._partition_tasks[partname].remove(task) + task.compile_complete() + partname = _get_partition_name(task, phase='run') + max_jobs = self._max_jobs[partname] + while len(self._partition_tasks[partname])+1 > max_jobs: + await asyncio.sleep(2) + self._partition_tasks[partname].add(task) + await task.run() - def _advance_ready_compile(self, task): - partname = _get_partition_name(task, phase='build') - max_jobs = self._max_jobs[partname] - if len(self._partition_tasks[partname]) < max_jobs: - if self._exec_stage(task, [task.compile]): - self._partition_tasks[partname].add(task) + # Pick the right scheduler + if 
task.check.local: + sched = self.local_scheduler + else: + sched = partition.scheduler - return 1 + while True: + if not self.dry_run_mode: + if getpollcontroller().is_time_to_poll(): + getpollcontroller().reset_time_to_poll() + await sched.poll(*getpollcontroller()._jobs_pool) - getlogger().debug2(f'Hit the max job limit of {partname}: {max_jobs}') - return 0 + if task.run_complete(): + break - def _advance_compiling(self, task): - partname = _get_partition_name(task, phase='build') - try: - if task.compile_complete(): - task.compile_wait() - self._partition_tasks[partname].remove(task) - if isinstance(task.check, CompileOnlyRegressionTest): - # All tests should pass from all the pipeline stages, - # even if they are no-ops - self._exec_stage(task, [task.run, - task.run_complete, - task.run_wait]) + await self._pollctl.snooze() - return 1 - else: - return 0 - except TaskExit: + await task.run_wait() self._partition_tasks[partname].remove(task) - self._current_tasks.remove(task) - return 1 + if not self.skip_sanity_check: + task.sanity() - def _advance_ready_run(self, task): - partname = _get_partition_name(task, phase='run') - max_jobs = self._max_jobs[partname] - if len(self._partition_tasks[partname]) < max_jobs: - if self._exec_stage(task, [task.run]): - self._partition_tasks[partname].add(task) + if not self.skip_performance_check: + task.performance() - return 1 + self._retired_tasks.append(task) + task.finalize() - getlogger().debug2(f'Hit the max job limit of {partname}: {max_jobs}') - return 0 + except TaskExit: + self._current_tasks.remove(task) + if task.check.current_partition: + partname = task.check.current_partition.fullname + else: + partname = None - def _advance_running(self, task): - partname = _get_partition_name(task, phase='run') - try: - if task.run_complete(): - if self._exec_stage(task, [task.run_wait]): + # Remove tasks from the partition tasks if there + with contextlib.suppress(KeyError): + self._partition_tasks['_rfm_local'].remove(task) + with contextlib.suppress(KeyError): + if partname: self._partition_tasks[partname].remove(task) - return 1 + return + except ABORT_REASONS as e: + self._abortall(e) + if type(e) is KeyboardInterrupt: + raise KeyboardError else: - return 0 - except TaskExit: - self._partition_tasks[partname].remove(task) + raise e + except BaseException: + task.fail(sys.exc_info()) self._current_tasks.remove(task) - return 1 + if task.check.current_partition: + partname = task.check.current_partition.fullname + else: + partname = None - def _advance_completing(self, task): - try: - if not self.skip_sanity_check: - task.sanity() + # Remove tasks from the partition tasks if there + with contextlib.suppress(KeyError): + self._partition_tasks['_rfm_local'].remove(task) + with contextlib.suppress(KeyError): + if partname: + self._partition_tasks[partname].remove(task) + return - if not self.skip_performance_check: - task.performance() + async def check_deps(self, task): + while not (self.deps_skipped(task) or self.deps_failed(task) or + self.deps_succeeded(task)): + await asyncio.sleep(1) - task.finalize() - self._retired_tasks.append(task) - self._current_tasks.remove(task) - return 1 - except TaskExit: - self._current_tasks.remove(task) - return 1 + if self.deps_skipped(task): + return "skipped" + elif self.deps_failed(task): + return "failed" + elif self.deps_succeeded(task): + return "succeeded" def deps_failed(self, task): # NOTE: Restored dependencies are not in the task_index @@ -587,22 +526,24 @@ def _abortall(self, cause): with 
contextlib.suppress(FailureLimitError): task.abort(cause) - # These function can be useful for tracking statistics of the framework, - # such as number of tests that have finished setup etc. def on_task_setup(self, task): pass def on_task_run(self, task): - pass + getpollcontroller()._jobs_pool.append(task.check.job) def on_task_compile(self, task): + # getpollcontroller()._jobs_pool.append(task.check.job) + # print("Add compile", task.check.job.name) pass def on_task_exit(self, task): - self._pollctl.reset_snooze_time() + getpollcontroller()._jobs_pool.remove(task.check.job) def on_task_compile_exit(self, task): - self._pollctl.reset_snooze_time() + # getpollcontroller()._jobs_pool.remove(task.check.job) + # print("Remove compile", task.check.job.name) + pass def on_task_skip(self, task): msg = str(task.exc_info[1]) @@ -639,18 +580,92 @@ def on_task_failure(self, task): f'maximum number of failures ({self.max_failures}) reached' ) + if self.timeout_expired(): + raise RunSessionTimeout('maximum session duration exceeded') + def on_task_success(self, task): msg = f'{task.info()}' self.printer.status('OK', msg, just='right') _print_perf(task) timings = task.pipeline_timings(['setup', 'compile_complete', - 'run_complete', + 'run_complete', 'sanity', 'performance', 'total']) getlogger().verbose(f'==> {timings}') + + # Update reference count of dependencies for c in task.testcase.deps: # NOTE: Restored dependencies are not in the task_index if c in self._task_index: self._task_index[c].ref_count -= 1 + + _cleanup_all(self._retired_tasks, not self.keep_stage_files) + if self.timeout_expired(): + raise RunSessionTimeout('maximum session duration exceeded') + + def _exit(self): + # Clean up all remaining tasks + _cleanup_all(self._retired_tasks, not self.keep_stage_files) + + def execute(self, testcases): + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + if loop.is_closed(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + all_cases = [] + for t in testcases: + task = RegressionTask(t, self.task_listeners) + + self.stats.add_task(task) + self._current_tasks.add(task) + all_cases.append(asyncio.ensure_future(self._runcase(t, task))) + try: + # Wait for tasks until the first failure + loop.run_until_complete(self._execute_until_failure(all_cases)) + except (Exception, KeyboardInterrupt) as e: + if type(e) in ABORT_REASONS or isinstance(e, KeyboardError): + loop.run_until_complete(_cancel_gracefully(all_cases)) + try: + raise AbortTaskError + except AbortTaskError as exc: + self._abortall(exc) + loop.close() + raise e + else: + getlogger().info(f"Execution stopped due to an error: {e}") + finally: + loop.close() + loop.close() + self._exit() + + async def _execute_until_failure(self, all_cases): + """Wait for tasks to complete or fail, stopping at the first failure.""" + while all_cases: + done, all_cases = await asyncio.wait( + all_cases, return_when=asyncio.FIRST_COMPLETED + ) + for task in done: + if task.exception(): + raise task.exception() # Exit if aborted + + +async def _cancel_gracefully(all_cases): + for case in all_cases: + case.cancel() + await asyncio.gather(*all_cases, return_exceptions=True) + + +def all_tasks(loop): + """Wrapper for asyncio.current_task() compatible with Python 3.6 and later.""" + if sys.version_info >= (3, 7): + # Use asyncio.current_task() directly in Python 3.7+ + return asyncio.all_tasks(loop) + else: + # Fallback to asyncio.tasks.current_task() in Python 3.6 + 
return asyncio.Task.all_tasks(loop) diff --git a/reframe/frontend/filters.py b/reframe/frontend/filters.py index 77d90b9b6a..04e783abb2 100644 --- a/reframe/frontend/filters.py +++ b/reframe/frontend/filters.py @@ -6,6 +6,7 @@ import re from reframe.core.exceptions import ReframeError +from reframe.core.logging import getlogger def re_compile(patt): @@ -118,6 +119,8 @@ def _fn(case): try: return eval(expr, None, case.check.__dict__) except Exception as err: - raise ReframeError(f'invalid expression `{expr}`') from err + getlogger().warning(f'error while evaluating expression `{expr}` ' + f'for test case `{case}`: {err}') + return False return _fn diff --git a/reframe/frontend/loader.py b/reframe/frontend/loader.py index 6486ebc77d..c05f7f69fe 100644 --- a/reframe/frontend/loader.py +++ b/reframe/frontend/loader.py @@ -193,17 +193,31 @@ def load_from_file(self, filename, force=False): try: dirname = os.path.dirname(filename) - with osext.change_dir(dirname): - with util.temp_sys_path(dirname): - if os.path.exists(os.path.join(dirname, '__init__.py')): - # If the containing directory is a package, - # import it, too. - parent = util.import_module_from_file(dirname).__name__ - else: - parent = None + # Load all parent modules of test file + parents = [] + while os.path.exists(os.path.join(dirname, '__init__.py')): + parents.append(os.path.join(dirname)) + dirname = os.path.split(dirname)[0] + + parent_module = None + for pdir in reversed(parents): + with osext.change_dir(pdir): + with util.temp_sys_path(pdir): + package_path = os.path.join(pdir, '__init__.py') + parent_module = util.import_module_from_file( + package_path, parent=parent_module + ).__name__ + + # Now load the actual test file + if not parents: + pdir = dirname + + with osext.change_dir(pdir): + with util.temp_sys_path(pdir): return self.load_from_module( - util.import_module_from_file(filename, force, parent) + util.import_module_from_file(filename, force, + parent_module) ) except Exception: exc_info = sys.exc_info() diff --git a/reframe/frontend/printer.py b/reframe/frontend/printer.py index 413f271cf4..c081e65955 100644 --- a/reframe/frontend/printer.py +++ b/reframe/frontend/printer.py @@ -124,6 +124,7 @@ def _print_failure_info(rec, runid, total_runs): self.info(f" * Description: {rec['descr']}") self.info(f" * System partition: {rec['system']}") self.info(f" * Environment: {rec['environ']}") + self.info(f" * Test file: {rec['filename']}") self.info(f" * Stage directory: {rec['stagedir']}") self.info(f" * Node list: " f"{nodelist_abbrev(rec['job_nodelist'])}") @@ -275,10 +276,8 @@ def table(self, data, **kwargs): # Map our options to tabulate if table_format == 'plain': - tablefmt = 'plain' - elif table_format == 'outline': - tablefmt = 'mixed_outline' - elif table_format == 'grid': + tablefmt = 'simple' + elif table_format == 'pretty': tablefmt = 'mixed_grid' else: raise ValueError(f'invalid table format: {table_format}') diff --git a/reframe/frontend/reporting/__init__.py b/reframe/frontend/reporting/__init__.py index c1ff5745c0..48ae1a0d1f 100644 --- a/reframe/frontend/reporting/__init__.py +++ b/reframe/frontend/reporting/__init__.py @@ -17,7 +17,6 @@ import uuid from collections import UserDict from collections.abc import Hashable -from filelock import FileLock import reframe as rfm import reframe.utility.jsonext as jsonext @@ -215,12 +214,12 @@ def _restore_session(filename): except KeyError: found_ver = 'n/a' - getlogger().verbose(f'JSON validation error: {e}') + getlogger().debug(str(e)) raise ReframeError( 
f'failed to validate report {filename!r}: {e.args[0]} ' f'(check report data version: required {DATA_VERSION}, ' f'found: {found_ver})' - ) from None + ) from e return _RestoredSessionInfo(report) @@ -419,7 +418,11 @@ def update_run_stats(self, stats): 'num_skipped': self.__report['runs'][-1]['num_skipped'] }) - def _save(self, filename, compress, link_to_last): + def is_empty(self): + '''Return :obj:`True` is no test cases where run''' + return self.__report['session_info']['num_cases'] == 0 + + def save(self, filename, compress=False, link_to_last=True): filename = _expand_report_filename(filename, newfile=True) with open(filename, 'w') as fp: if compress: @@ -446,20 +449,47 @@ def _save(self, filename, compress, link_to_last): else: raise ReframeError('path exists and is not a symlink') - def is_empty(self): - '''Return :obj:`True` is no test cases where run''' - return self.__report['session_info']['num_cases'] == 0 - - def save(self, filename, compress=False, link_to_last=True): - prefix = os.path.dirname(filename) or '.' - with FileLock(os.path.join(prefix, '.report.lock')): - self._save(filename, compress, link_to_last) - def store(self): '''Store the report in the results storage.''' return StorageBackend.default().store(self, self.filename) + def report_data(self): + '''Get tabular data from this report''' + + columns = ['name', 'sysenv', 'job_nodelist', + 'pvar', 'punit', 'pval', 'result'] + data = [columns] + num_runs = len(self.__report['runs']) + for runid, runinfo in enumerate(self.__report['runs']): + for tc in map(_TCProxy, runinfo['testcases']): + if tc['result'] != 'success' and runid != num_runs - 1: + # Skip this testcase until its last retry + continue + + for pvar, reftuple in tc['perfvalues'].items(): + pvar = pvar.split(':')[-1] + pval, _, _, _, punit = reftuple + if pval is None: + # Ignore `None` performance values + # (performance tests that failed sanity) + continue + + line = [] + for c in columns: + if c == 'pvar': + line.append(pvar) + elif c == 'pval': + line.append(pval) + elif c == 'punit': + line.append(punit) + else: + line.append(tc[c]) + + data.append(line) + + return data + def generate_xml_report(self): '''Generate a JUnit report from a standard ReFrame JSON report.''' @@ -606,9 +636,12 @@ def _group_testcases(testcases, groups, columns): @time_function def _aggregate_perf(grouped_testcases, aggr_fn, cols): - if runtime().get_option('general/0/table_format') == 'csv': - # Use a csv friendly delimiter + # Update delimiter for joining unique values based on the table format + table_foramt = runtime().get_option('general/0/table_format') + if table_foramt == 'csv': delim = '|' + elif table_foramt == 'plain': + delim = ',' else: delim = '\n' diff --git a/reframe/frontend/testgenerators.py b/reframe/frontend/testgenerators.py index b5f597d48a..041e4580cf 100644 --- a/reframe/frontend/testgenerators.py +++ b/reframe/frontend/testgenerators.py @@ -77,12 +77,12 @@ def _generate_tests(testcases, gen_fn): @time_function def distribute_tests(testcases, node_map): def _rfm_pin_run_nodes(obj): - nodelist = getattr(obj, '$nid') + nodelist = getattr(obj, '.nid') if not obj.local: obj.job.pin_nodes = nodelist def _rfm_pin_build_nodes(obj): - pin_nodes = getattr(obj, '$nid') + pin_nodes = getattr(obj, '.nid') if obj.build_job and not obj.local and not obj.build_locally: obj.build_job.pin_nodes = pin_nodes @@ -99,9 +99,9 @@ def _rfm_set_valid_systems(obj): 'valid_systems': [partition.fullname], # We add a partition parameter so as to differentiate the test # in case 
another test has the same nodes in another partition - '$part': builtins.parameter([partition.fullname], + '.part': builtins.parameter([partition.fullname], loggable=False), - '$nid': builtins.parameter( + '.nid': builtins.parameter( [[n] for n in node_map[partition.fullname]], fmt=util.nodelist_abbrev, loggable=False ) @@ -113,7 +113,7 @@ def _rfm_set_valid_systems(obj): # will not be overwritten by a parent post-init hook builtins.run_after('init')(_rfm_set_valid_systems), ] - ), ['$part', '$nid'] + ), ['.part', '.nid'] return _generate_tests(testcases, _make_dist_test) @@ -127,10 +127,10 @@ def _make_repeat_test(testcase): return make_test( cls.__name__, (cls,), { - '$repeat_no': builtins.parameter(range(num_repeats), + '.repeat_no': builtins.parameter(range(num_repeats), loggable=False) } - ), ['$repeat_no'] + ), ['.repeat_no'] return _generate_tests(testcases, _make_repeat_test) @@ -164,7 +164,7 @@ def _make_parameterized_test(testcase): ) continue - body[f'${var}'] = builtins.parameter(values, loggable=False) + body[f'.{var}'] = builtins.parameter(values, loggable=False) def _set_vars(self): for var in body.keys(): diff --git a/reframe/schemas/config.json b/reframe/schemas/config.json index cb560b67f2..e18b29d494 100644 --- a/reframe/schemas/config.json +++ b/reframe/schemas/config.json @@ -589,7 +589,7 @@ "general/module_mappings": [], "general/non_default_craype": false, "general/perf_info_level": "info", - "general/perf_report_spec": "now:now/last:/+job_nodelist+result", + "general/perf_report_spec": "now-1d:now/last:/+job_nodelist+result", "general/pipeline_timeout": 3, "general/purge_environment": false, "general/remote_detect": false, @@ -599,8 +599,9 @@ "general/report_junit": null, "general/resolve_module_conflicts": true, "general/save_log_files": false, - "general/table_format": "outline", + "general/table_format": "pretty", "general/target_systems": ["*"], + "general/topology_prefix": "${HOME}/.reframe/topology", "general/timestamp_dirs": "%Y%m%dT%H%M%S%z", "general/trap_job_errors": false, "general/unload_modules": [], @@ -633,7 +634,7 @@ "logging/handlers_perflog/httpjson_debug": false, "modes/options": [], "modes/target_systems": ["*"], - "storage/enable": true, + "storage/enable": false, "storage/backend": "sqlite", "storage/sqlite_conn_timeout": 60, "storage/sqlite_db_file": "${HOME}/.reframe/reports/results.db", diff --git a/reframe/utility/__init__.py b/reframe/utility/__init__.py index 75c4008e3c..cfa38610ec 100644 --- a/reframe/utility/__init__.py +++ b/reframe/utility/__init__.py @@ -866,7 +866,7 @@ def count_digits(n): ''' num_digits = 1 - while n > 10: + while n >= 10: n /= 10 num_digits += 1 diff --git a/reframe/utility/osext.py b/reframe/utility/osext.py index 6c5ff547a5..76dd2cee98 100644 --- a/reframe/utility/osext.py +++ b/reframe/utility/osext.py @@ -7,6 +7,7 @@ # OS and shell utility functions # +import asyncio import collections.abc import errno import getpass @@ -338,6 +339,90 @@ def run_command_async(cmd, **popen_args) +async def run_command_asyncio_alone(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + timeout=None, + log=True, + serial=False, + **kwargs): + '''Spawn ``cmd`` asynchronously and return the created process handle without waiting for it to finish. + ''' + if log: + from reframe.core.logging import getlogger + getlogger().debug(f'[CMD] {cmd!r}') + + if isinstance(cmd, str) and not shell: + cmd = shlex.split(cmd) + + if shell: + # Call create_subprocess_shell + return await asyncio.create_subprocess_shell( + cmd, stdout=stdout, + stderr=stderr, + **kwargs ) + else: + # Call
create_subprocess_exec + return await asyncio.create_subprocess_exec( + cmd, stdout=stdout, + stderr=stderr, + **kwargs + ) + +async def run_command_asyncio(cmd, + check=False, + timeout=None, + shell=True, + log=True, + **kwargs): + '''Run ``cmd`` asynchronously, wait for it to finish and return a :class:`subprocess.CompletedProcess` with its decoded output. + ''' + if log: + from reframe.core.logging import getlogger + getlogger().debug(f'[CMD] {cmd!r}') + + if isinstance(cmd, str) and not shell: + cmd = shlex.split(cmd) + + try: + if shell: + # Call create_subprocess_shell + proc = await asyncio.create_subprocess_shell( + cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + **kwargs + ) + else: + # Call create_subprocess_exec + proc = await asyncio.create_subprocess_exec( + cmd, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + **kwargs + ) + proc_stdout, proc_stderr = await asyncio.wait_for( + proc.communicate(), timeout=timeout + ) + except asyncio.TimeoutError: + os.killpg(proc.pid, signal.SIGKILL) + # The captured output is not available once the process is killed + raise SpawnedProcessTimeout(cmd, '', '', timeout) from None + + completed = subprocess.CompletedProcess(cmd, + returncode=proc.returncode, + stdout=proc_stdout.decode(), + stderr=proc_stderr.decode()) + + if check and proc.returncode != 0: + raise SpawnedProcessError(completed.args, + completed.stdout, completed.stderr, + completed.returncode) + + return completed + + def run_command_async2(*args, check=False, **kwargs): '''Return a :class:`_ProcFuture` that encapsulates a command to be executed. @@ -650,6 +735,36 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): os.chdir(self._wd_save) + async def __aenter__(self): + os.chdir(self._dir_name) + + async def __aexit__(self, exc_type, exc_val, exc_tb): + os.chdir(self._wd_save) + + +class change_dir_global: + '''Context manager to temporarily change the current working directory, restoring ReFrame's globally tracked working directory on exit. + + :arg dir_name: The directory to temporarily change to.
+ ''' + + def __init__(self, dir_name): + from reframe.core.runtime import get_working_dir + self._wd_save = get_working_dir() + self._dir_name = dir_name + + def __enter__(self): + os.chdir(self._dir_name) + + def __exit__(self, exc_type, exc_val, exc_tb): + os.chdir(self._wd_save) + + async def __aenter__(self): + os.chdir(self._dir_name) + + async def __aexit__(self, exc_type, exc_val, exc_tb): + os.chdir(self._wd_save) + def is_url(s): '''Check if string is a URL.''' diff --git a/requirements.txt b/requirements.txt index b730f4ef21..c4bcad5af0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,19 @@ archspec==0.2.5 argcomplete==3.1.2; python_version < '3.8' argcomplete==3.5.1; python_version >= '3.8' +autopep8==2.0.4 filelock==3.4.1; python_version == '3.6' filelock==3.12.2; python_version == '3.7' filelock==3.16.1; python_version >= '3.8' importlib_metadata==4.0.1; python_version < '3.8' jsonschema==3.2.0 +jinja2==3.0; python_version < '3.7' +jinja2==3.1.2; python_version >= '3.7' lxml==5.2.0; python_version < '3.8' and platform_machine == 'aarch64' lxml==5.3.0; python_version >= '3.8' or platform_machine != 'aarch64' +psutil pytest==7.0.1; python_version < '3.8' -pytest==8.3.3; python_version >= '3.8' +pytest==8.3.4; python_version >= '3.8' pytest-forked==1.4.0; python_version == '3.6' pytest-forked==1.6.0; python_version >= '3.7' pytest-parallel==0.1.1 diff --git a/setup.cfg b/setup.cfg index 642a277bc0..b9885902ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,6 +16,7 @@ classifiers = Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 Programming Language :: Python :: 3.12 + Programming Language :: Python :: 3.13 License :: OSI Approved :: BSD License Operating System :: MacOS Operating System :: POSIX :: Linux @@ -29,13 +30,16 @@ install_requires = archspec >= 0.2.4 argcomplete argcomplete <= 3.1.2; python_version < '3.8' + autopep8 filelock filelock<=3.12.2; python_version == '3.7' filelock<=3.4.1; python_version == '3.6' + jinja2 jsonschema lxml==5.2.0; python_version < '3.8' and platform_machine == 'aarch64' lxml==5.3.0; python_version >= '3.8' or platform_machine != 'aarch64' PyYAML + psutil requests requests <= 2.27.1; python_version == '3.6' semver diff --git a/unittests/conftest.py b/unittests/conftest.py index 711716d174..1f638d52a3 100644 --- a/unittests/conftest.py +++ b/unittests/conftest.py @@ -106,7 +106,7 @@ def _make_loader(check_search_path, *args, **kwargs): @pytest.fixture(params=[policies.SerialExecutionPolicy, - policies.AsynchronousExecutionPolicy]) + policies.AsyncioExecutionPolicy]) def make_runner(request): '''Test runner with all the execution policies''' @@ -122,7 +122,7 @@ def _make_runner(*args, **kwargs): @pytest.fixture def make_async_runner(): def _make_runner(*args, **kwargs): - policy = policies.AsynchronousExecutionPolicy() + policy = policies.AsyncioExecutionPolicy() policy._pollctl.SLEEP_MIN = 0.001 return executors.Runner(policy, *args, **kwargs) diff --git a/unittests/resources/checks/frontend_checks.py b/unittests/resources/checks/frontend_checks.py index 4e0304ce0c..23de326184 100644 --- a/unittests/resources/checks/frontend_checks.py +++ b/unittests/resources/checks/frontend_checks.py @@ -7,10 +7,10 @@ # Special checks for testing the front-end # +import asyncio import os import signal import sys -import time import reframe as rfm import reframe.utility.sanity as sn @@ -107,18 +107,18 @@ def raise_before_setup(self): if self.phase == 'setup': raise KeyboardInterrupt - def run_wait(self): + async def 
run_wait(self): # We do our nasty stuff in wait() to make things more complicated if self.phase == 'wait': raise KeyboardInterrupt else: - return super().run_wait() + return await super().run_wait() class SystemExitCheck(BaseFrontendCheck, special=True): '''Simulate system exit from within a check.''' - def run_wait(self): + async def run_wait(self): # We do our nasty stuff in wait() to make things more complicated sys.exit(1) @@ -190,9 +190,9 @@ class SelfKillCheck(rfm.RunOnlyRegressionTest, special=True): executable = 'echo' sanity_patterns = sn.assert_true(1) - def run(self): - super().run() - time.sleep(0.5) + async def run(self): + await super().run() + await asyncio.sleep(0.5) os.kill(os.getpid(), signal.SIGTERM) diff --git a/unittests/resources/checks_unlisted/fixtures_complex.py b/unittests/resources/checks_unlisted/fixtures_complex.py index 53be46f92f..d200502b82 100644 --- a/unittests/resources/checks_unlisted/fixtures_complex.py +++ b/unittests/resources/checks_unlisted/fixtures_complex.py @@ -102,3 +102,17 @@ def validate_fixture_resolution(self): ParamFixture.num_variants ) ]) + + +@rfm.simple_test +class TestC(rfm.RunOnlyRegressionTest): + valid_systems = ['*'] + valid_prog_environs = ['*'] + executable = 'echo' + f0 = fixture(SimpleFixture, scope='environment', variables={'data': 10}) + f1 = fixture(SimpleFixture, scope='environment', variables={'data': 20}) + + @sanity_function + def validate_vars(self): + return sn.all([sn.assert_eq(self.f0.data, 10), + sn.assert_eq(self.f1.data, 20)]) diff --git a/unittests/resources/checks_unlisted/testlib/nested/__init__.py b/unittests/resources/checks_unlisted/testlib/nested/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/unittests/resources/checks_unlisted/testlib/nested/dummy.py b/unittests/resources/checks_unlisted/testlib/nested/dummy.py new file mode 100644 index 0000000000..41e54fa7cd --- /dev/null +++ b/unittests/resources/checks_unlisted/testlib/nested/dummy.py @@ -0,0 +1,17 @@ +# Copyright 2016-2024 Swiss National Supercomputing Centre (CSCS/ETH Zurich) +# ReFrame Project Developers. See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause + +import reframe as rfm +import reframe.utility.sanity as sn +from ..utility import dummy_fixture + + +@rfm.simple_test +class dummy_test(rfm.RunOnlyRegressionTest): + valid_systems = ['*'] + valid_prog_environs = ['*'] + executable = 'true' + sanity_patterns = sn.assert_true(1) + dummy = fixture(dummy_fixture) diff --git a/unittests/resources/checks_unlisted/testlib/simple.py b/unittests/resources/checks_unlisted/testlib/simple.py index 1316168bd5..10218115a6 100644 --- a/unittests/resources/checks_unlisted/testlib/simple.py +++ b/unittests/resources/checks_unlisted/testlib/simple.py @@ -16,7 +16,7 @@ class simple_echo_check(rfm.RunOnlyRegressionTest, pin_prefix=True): executable = 'echo' executable_opts = ['Hello'] message = variable(str, value='World') - dummy = fixture(dummy_fixture, scope='environment') + dummy = fixture(dummy_fixture) @run_before('run') def set_executable_opts(self): diff --git a/unittests/resources/config/settings.py b/unittests/resources/config/settings.py index b888e0fbfc..ae3c42502e 100644 --- a/unittests/resources/config/settings.py +++ b/unittests/resources/config/settings.py @@ -226,12 +226,6 @@ def hostname(): 'modules': ['PrgEnv-cray'], 'features': ['cxx14', 'mpi'], }, - { - 'name': 'builtin', - 'cc': 'cc', - 'cxx': '', - 'ftn': '' - }, { 'name': 'e0', 'modules': ['m0'] diff --git a/unittests/test_cli.py b/unittests/test_cli.py index 29bbb34578..799b45f9ee 100644 --- a/unittests/test_cli.py +++ b/unittests/test_cli.py @@ -420,7 +420,14 @@ def test_perflogdir_from_env(run_reframe, tmp_path, monkeypatch): 'default' / 'PerformanceFailureCheck.log') -def test_performance_report(run_reframe, run_action): +@pytest.fixture(params=['storage=yes', 'storage=no']) +def storage_enabled(request, monkeypatch): + value = request.param.split('=')[1] + monkeypatch.setenv('RFM_ENABLE_RESULTS_STORAGE', value) + return value == 'yes' + + +def test_performance_report(run_reframe, run_action, storage_enabled): returncode, stdout, stderr = run_reframe( checkpath=['unittests/resources/checks/frontend_checks.py'], more_options=['-n', '^PerformanceFailureCheck', @@ -433,6 +440,9 @@ def test_performance_report(run_reframe, run_action): else: assert returncode == 0 + if run_action != 'dry_run': + assert 'PERFORMANCE REPORT' in stdout + assert 'Traceback' not in stdout assert 'Traceback' not in stderr @@ -1066,6 +1076,13 @@ def test_reruns_with_duration(run_reframe): assert returncode == 1 +def test_exitcode_timeout(run_reframe): + assert_no_crash(*run_reframe( + more_options=['--duration=5s', '-n^HelloTest'], + checkpath=['unittests/resources/checks/hellocheck.py'] + )) + + @pytest.fixture(params=['name', 'rname', 'uid', 'ruid', 'random']) def exec_order(request): return request.param @@ -1253,11 +1270,11 @@ def test_testlib_inherit_fixture_in_different_files(run_reframe): action='run', ) assert returncode == 0 - assert 'Ran 3/3 test case(s)' in stdout + assert 'Ran 4/4 test case(s)' in stdout assert 'FAILED' not in stdout -@pytest.fixture(params=['csv', 'plain', 'grid', 'outline']) +@pytest.fixture(params=['csv', 'plain', 'pretty']) def table_format(request): return request.param @@ -1269,7 +1286,8 @@ def assert_no_crash(returncode, stdout, stderr, exitcode=0): return returncode, stdout, stderr -def test_storage_options(run_reframe, tmp_path, table_format): +def test_storage_options(run_reframe, tmp_path, table_format, monkeypatch): + monkeypatch.setenv('RFM_ENABLE_RESULTS_STORAGE', 'yes') run_reframe2 = functools.partial( run_reframe, 
checkpath=['unittests/resources/checks/frontend_checks.py'], @@ -1335,8 +1353,7 @@ def test_storage_options(run_reframe, tmp_path, table_format): '--describe-stored-testcases=now-1d:now', '--list-stored-sessions', '--list-stored-testcases=now-1d:now/mean:/', - '--performance-compare=now-1d:now/now-1d/mean:/', - '--performance-report=now-1d:now/mean:/' + '--performance-compare=now-1d:now/now-1d/mean:/' ]) def storage_option(request): return request.param @@ -1359,7 +1376,8 @@ def test_disabled_results_storage(run_reframe, storage_option, monkeypatch): assert 'requires results storage' in stdout -def test_session_annotations(run_reframe): +def test_session_annotations(run_reframe, monkeypatch): + monkeypatch.setenv('RFM_ENABLE_RESULTS_STORAGE', 'yes') assert_no_crash(*run_reframe( checkpath=['unittests/resources/checks/frontend_checks.py'], action='-r', @@ -1373,13 +1391,14 @@ def test_session_annotations(run_reframe): assert text in stdout -def test_performance_compare(run_reframe, table_format): +def test_performance_compare(run_reframe, table_format, monkeypatch): def assert_no_crash(returncode, stdout, stderr, exitcode=0): assert returncode == exitcode assert 'Traceback' not in stdout assert 'Traceback' not in stderr return returncode, stdout, stderr + monkeypatch.setenv('RFM_ENABLE_RESULTS_STORAGE', 'yes') run_reframe2 = functools.partial( run_reframe, checkpath=['unittests/resources/checks/frontend_checks.py'], diff --git a/unittests/test_filters.py b/unittests/test_filters.py index 02493d6564..73ed6cc2ad 100644 --- a/unittests/test_filters.py +++ b/unittests/test_filters.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: BSD-3-Clause import pytest +from collections import namedtuple import reframe as rfm import reframe.core.exceptions as errors @@ -11,7 +12,6 @@ import reframe.frontend.filters as filters import reframe.utility.sanity as sn import unittests.utility as test_util -from reframe.core.exceptions import ReframeError def count_checks(filter_fn, checks): @@ -19,8 +19,10 @@ def count_checks(filter_fn, checks): def make_case(*args, **kwargs): + _P = namedtuple('_Partition', ['fullname']) + _E = namedtuple('_Environment', ['name']) test = test_util.make_check(*args, **kwargs) - return executors.TestCase(test, None, None) + return executors.TestCase(test, _P('generic:default'), _E('builtin')) @pytest.fixture @@ -156,15 +158,13 @@ def test_validates_expr_invalid(sample_cases): validates = filters.validates # undefined variables - with pytest.raises(ReframeError): - assert count_checks(validates('foo == 3'), sample_cases) + assert count_checks(validates('foo == 3'), sample_cases) == 0 - # invalid syntax - with pytest.raises(ReframeError): - assert count_checks(validates('num_tasks = 2'), sample_cases) + # assignments + assert count_checks(validates('num_tasks = 2'), sample_cases) == 0 - with pytest.raises(ReframeError): - assert count_checks(validates('import os'), sample_cases) + # imports + assert count_checks(validates('import os'), sample_cases) == 0 - with pytest.raises(ReframeError): - assert count_checks(validates('"foo" i tags'), sample_cases) + # invalid syntax + assert count_checks(validates('"foo" i tags'), sample_cases) == 0 diff --git a/unittests/test_loader.py b/unittests/test_loader.py index d64fec6ba9..729708bde7 100644 --- a/unittests/test_loader.py +++ b/unittests/test_loader.py @@ -154,3 +154,9 @@ def test_relative_import_outside_rfm_prefix(loader, tmp_path): ) tests = loader.load_from_file(str(tmp_path / 'testlib' / 'simple.py')) assert len(tests) == 2 + + # Test 
nested library tests + tests = loader.load_from_file( + str(tmp_path / 'testlib' / 'nested' / 'dummy.py') + ) + assert len(tests) == 2 diff --git a/unittests/test_parameters.py b/unittests/test_parameters.py index 118bba9951..1b5826a908 100644 --- a/unittests/test_parameters.py +++ b/unittests/test_parameters.py @@ -48,10 +48,13 @@ class MyTest(TwoParams): def test_abstract_param(): class MyTest(Abstract): - pass + # Add another abstract parameter + P2 = parameter() assert MyTest.param_space['P0'] == () assert MyTest.param_space['P1'] == ('b',) + assert MyTest.param_space['P2'] == () + assert MyTest.param_space.undefined_params() == ['P0', 'P2'] def test_param_override(): diff --git a/unittests/test_perflogging.py b/unittests/test_perflogging.py index afc5e26f5e..570f5862ae 100644 --- a/unittests/test_perflogging.py +++ b/unittests/test_perflogging.py @@ -174,6 +174,7 @@ def test_perf_logging(make_runner, make_exec_ctx, perf_test, ) ) ) + rt.set_working_dir() logging.configure_logging(rt.runtime().site_config) runner = make_runner() testcases = executors.generate_testcases([perf_test]) diff --git a/unittests/test_pipeline.py b/unittests/test_pipeline.py index e5010e255c..cd19af6af0 100644 --- a/unittests/test_pipeline.py +++ b/unittests/test_pipeline.py @@ -22,18 +22,32 @@ from reframe.core.meta import make_test from reframe.core.warnings import ReframeDeprecationWarning +rt.set_working_dir() + def _run(test, partition, prgenv): + test_util.asyncio_run(_runasync, test, partition, prgenv) + + +async def _runasync(test, partition, prgenv): test.setup(partition, prgenv) - test.compile() - test.compile_wait() - test.run() - test.run_wait() + await compile_wait(test) + await run_wait(test) test.check_sanity() test.check_performance() test.cleanup(remove_files=True) +async def compile_wait(test): + await test.compile() + await test.compile_wait() + + +async def run_wait(test): + await test.run() + await test.run_wait() + + @pytest.fixture def HelloTest(): from unittests.resources.checks.hellocheck import HelloTest @@ -306,9 +320,8 @@ class MyTest(rfm.CompileOnlyRegressionTest): test = MyTest() test.setup(*local_exec_ctx) - test.compile() with pytest.raises(BuildError): - test.compile_wait() + test_util.asyncio_run(compile_wait, test) def test_compile_only_warning(local_exec_ctx): @@ -556,7 +569,7 @@ def _assert_supported(valid_systems, valid_prog_environs, # Check AND in features and extras _assert_supported( - valid_systems=['+cuda +mpi %gpu_arch=v100'], + valid_systems=[r'+cuda +mpi %gpu_arch=v100'], valid_prog_environs=['*'], expected={} ) @@ -566,9 +579,18 @@ def _assert_supported(valid_systems, valid_prog_environs, expected={} ) - # Check OR in features ad extras + # Check OR in features and extras _assert_supported( - valid_systems=['+cuda +mpi', '%gpu_arch=v100'], + valid_systems=['+cuda +mpi', r'%gpu_arch=v100'], + valid_prog_environs=['*'], + expected={ + 'testsys:gpu': ['PrgEnv-gnu', 'builtin'] + } + ) + + # Check that extra keys can used as features + _assert_supported( + valid_systems=['+cuda +mpi', '+gpu_arch'], valid_prog_environs=['*'], expected={ 'testsys:gpu': ['PrgEnv-gnu', 'builtin'] @@ -640,7 +662,7 @@ def _assert_supported(valid_systems, valid_prog_environs, ) _assert_supported( valid_systems=['*'], - valid_prog_environs=['%bar=x'], + valid_prog_environs=[r'%bar=x'], expected={ 'testsys:gpu': [], 'testsys:login': ['PrgEnv-gnu'] @@ -648,7 +670,7 @@ def _assert_supported(valid_systems, valid_prog_environs, ) _assert_supported( valid_systems=['*'], - 
valid_prog_environs=['%foo=2'], + valid_prog_environs=[r'%foo=2'], expected={ 'testsys:gpu': ['PrgEnv-gnu'], 'testsys:login': [] @@ -656,7 +678,7 @@ def _assert_supported(valid_systems, valid_prog_environs, ) _assert_supported( valid_systems=['*'], - valid_prog_environs=['%foo=bar'], + valid_prog_environs=[r'%foo=bar'], expected={ 'testsys:gpu': [], 'testsys:login': [] @@ -671,6 +693,24 @@ def _assert_supported(valid_systems, valid_prog_environs, } ) + # Check that extra keys can used as features + _assert_supported( + valid_systems=['*'], + valid_prog_environs=['+foo +bar'], + expected={ + 'testsys:gpu': ['PrgEnv-gnu'], + 'testsys:login': ['PrgEnv-gnu'] + } + ) + _assert_supported( + valid_systems=['*'], + valid_prog_environs=['+foo -bar'], + expected={ + 'testsys:gpu': [], + 'testsys:login': [] + } + ) + # Check valid_systems / valid_prog_environs combinations _assert_supported( valid_systems=['testsys:login'], @@ -793,7 +833,7 @@ class MyTest(rfm.CompileOnlyRegressionTest): test.setup(*local_exec_ctx) test.sourcepath = '/usr/src' with pytest.raises(PipelineError): - test.compile() + test_util.asyncio_run(test.compile) def test_sourcepath_upref(local_exec_ctx): @@ -806,7 +846,7 @@ class MyTest(rfm.CompileOnlyRegressionTest): test.setup(*local_exec_ctx) test.sourcepath = '../hellosrc' with pytest.raises(PipelineError): - test.compile() + test_util.asyncio_run(test.compile) def test_sourcepath_non_existent(local_exec_ctx): @@ -818,9 +858,8 @@ class MyTest(rfm.CompileOnlyRegressionTest): test = MyTest() test.setup(*local_exec_ctx) test.sourcepath = 'non_existent.c' - test.compile() with pytest.raises(BuildError): - test.compile_wait() + test_util.asyncio_run(compile_wait, test) def test_extra_resources(HelloTest, testsys_exec_ctx): diff --git a/unittests/test_policies.py b/unittests/test_policies.py index 19ac559606..d868f56a03 100644 --- a/unittests/test_policies.py +++ b/unittests/test_policies.py @@ -18,6 +18,7 @@ from reframe.core.exceptions import (AbortTaskError, FailureLimitError, ForceExitError, + KeyboardError, RunSessionTimeout, TaskDependencyError) from unittests.resources.checks.hellocheck import HelloTest @@ -32,6 +33,8 @@ SystemExitCheck ) +rt.set_working_dir() + def make_kbd_check(phase='wait'): return test_util.make_check(KeyboardInterruptCheck, phase=phase) @@ -249,7 +252,7 @@ def test_force_local_execution(make_runner, make_cases, testsys_exec_ctx): def test_kbd_interrupt_within_test(make_runner, make_cases, common_exec_ctx): runner = make_runner() - with pytest.raises(KeyboardInterrupt): + with pytest.raises(KeyboardError): runner.runall(make_cases([make_kbd_check()])) stats = runner.stats @@ -449,7 +452,7 @@ def make_async_runner(): def _make_runner(): evt_monitor = _TaskEventMonitor() - ret = executors.Runner(policies.AsynchronousExecutionPolicy()) + ret = executors.Runner(policies.AsyncioExecutionPolicy()) ret.policy.keep_stage_files = True ret.policy.task_listeners.append(evt_monitor) return ret, evt_monitor @@ -593,7 +596,7 @@ def test_kbd_interrupt_in_wait_with_concurrency( ): make_exec_ctx(options=max_jobs_opts(4)) runner, _ = make_async_runner() - with pytest.raises(KeyboardInterrupt): + with pytest.raises(KeyboardError): runner.runall(make_cases([ make_kbd_check(), make_sleep_check(10), make_sleep_check(10), make_sleep_check(10) @@ -612,7 +615,7 @@ def test_kbd_interrupt_in_wait_with_limited_concurrency( # three. 
make_exec_ctx(options=max_jobs_opts(2)) runner, _ = make_async_runner() - with pytest.raises(KeyboardInterrupt): + with pytest.raises(KeyboardError): runner.runall(make_cases([ make_kbd_check(), make_sleep_check(10), make_sleep_check(10), make_sleep_check(10) @@ -626,7 +629,7 @@ def test_kbd_interrupt_in_setup_with_concurrency( ): make_exec_ctx(options=max_jobs_opts(4)) runner, _ = make_async_runner() - with pytest.raises(KeyboardInterrupt): + with pytest.raises(KeyboardError): runner.runall(make_cases([ make_sleep_check(1), make_sleep_check(1), make_sleep_check(1), make_kbd_check(phase='setup') @@ -640,7 +643,7 @@ def test_kbd_interrupt_in_setup_with_limited_concurrency( ): make_exec_ctx(options=max_jobs_opts(2)) runner, _ = make_async_runner() - with pytest.raises(KeyboardInterrupt): + with pytest.raises(KeyboardError): runner.runall(make_cases([ make_sleep_check(1), make_sleep_check(1), make_sleep_check(1), make_kbd_check(phase='setup') diff --git a/unittests/test_reporting.py b/unittests/test_reporting.py index 073356aea4..b7f80ba4a9 100644 --- a/unittests/test_reporting.py +++ b/unittests/test_reporting.py @@ -27,6 +27,7 @@ _DEFAULT_BASE_COLS = DEFAULT_GROUP_BY + DEFAULT_EXTRA_COLS +rt.set_working_dir() # NOTE: We could move this to utility class _timer: diff --git a/unittests/test_schedulers.py b/unittests/test_schedulers.py index a11223121f..1863cfa33d 100644 --- a/unittests/test_schedulers.py +++ b/unittests/test_schedulers.py @@ -144,7 +144,7 @@ def prepare_job(job, command='hostname', def submit_job(job): with rt.module_use(test_util.TEST_MODULES): - job.submit() + test_util.asyncio_run(job.submit) def assert_job_script_sanity(job): @@ -477,7 +477,7 @@ def test_submit(make_job, exec_ctx): assert minimal_job.nodelist == [] submit_job(minimal_job) assert minimal_job.jobid != [] - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) # Additional scheduler-specific checks sched_name = minimal_job.scheduler.registered_name @@ -505,7 +505,7 @@ def test_submit_timelimit(minimal_job, local_only): submit_job(minimal_job) assert minimal_job.jobid is not None with pytest.raises(JobError): - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) t_job = time.time() - t_job assert t_job >= 2 @@ -532,8 +532,8 @@ def test_submit_unqualified_hostnames(make_exec_ctx, make_job, local_only): hostname = socket.gethostname().split('.')[0] minimal_job = make_job(sched_opts={'part_name': 'login'}) minimal_job.prepare('true') - minimal_job.submit() - minimal_job.wait() + test_util.asyncio_run(minimal_job.submit) + test_util.asyncio_run(minimal_job.wait) assert minimal_job.nodelist == [hostname] @@ -542,7 +542,7 @@ def test_submit_job_array(make_job, slurm_only, exec_ctx): job.options = ['--array=0-1'] prepare_job(job, command='echo "Task id: ${SLURM_ARRAY_TASK_ID}"') submit_job(job) - job.wait() + test_util.asyncio_run(job.wait) if job.scheduler.registered_name == 'slurm': assert job.exitcode == 0 with open(job.stdout) as fp: @@ -566,7 +566,7 @@ def test_cancel(make_job, exec_ctx): # want to test here. 
time.sleep(0.01) - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) t_job = time.time() - t_job assert minimal_job.finished() assert t_job < 30 @@ -589,7 +589,7 @@ def test_cancel_before_submit(minimal_job): def test_wait_before_submit(minimal_job): prepare_job(minimal_job, 'sleep 3') with pytest.raises(JobNotStartedError): - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) def test_finished(make_job, exec_ctx): @@ -597,7 +597,7 @@ def test_finished(make_job, exec_ctx): prepare_job(minimal_job, 'sleep 2') submit_job(minimal_job) assert not minimal_job.finished() - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) def test_finished_before_submit(minimal_job): @@ -610,7 +610,7 @@ def test_finished_raises_error(make_job, exec_ctx): minimal_job = make_job(sched_access=exec_ctx.access) prepare_job(minimal_job, 'echo hello') submit_job(minimal_job) - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) # Emulate an error during polling and verify that it is raised correctly # when finished() is called @@ -690,7 +690,7 @@ def test_guess_num_tasks(minimal_job, scheduler): minimal_job._sched_flex_alloc_nodes = 'idle' prepare_job(minimal_job) submit_job(minimal_job) - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) assert minimal_job.num_tasks == 1 elif scheduler.registered_name in ('slurm', 'squeue'): minimal_job.num_tasks = 0 @@ -737,7 +737,7 @@ def state(self): submit_job(minimal_job) with pytest.raises(JobError, match='maximum pending time exceeded'): - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) def assert_process_died(pid): @@ -759,6 +759,7 @@ def _read_pid(job, attempts=3): for _ in range(attempts): try: with open(job.stdout) as fp: + # print(fp.read()) return int(fp.read()) except ValueError: time.sleep(1) @@ -783,6 +784,7 @@ def test_cancel_with_grace(minimal_job, scheduler, local_only): minimal_job.time_limit = '1m' minimal_job.scheduler.CANCEL_GRACE_PERIOD = 2 prepare_job(minimal_job, + # command='(trap '' TERM; sleep 5) &', command='sleep 5 &', pre_run=['trap -- "" TERM'], post_run=['echo $!', 'wait'], @@ -797,7 +799,7 @@ def test_cancel_with_grace(minimal_job, scheduler, local_only): t_grace = time.time() minimal_job.cancel() time.sleep(0.1) - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) t_grace = time.time() - t_grace assert t_grace >= 2 and t_grace < 5 @@ -844,7 +846,7 @@ def test_cancel_term_ignore(minimal_job, scheduler, local_only): t_grace = time.time() minimal_job.cancel() time.sleep(0.1) - minimal_job.wait() + test_util.asyncio_run(minimal_job.wait) t_grace = time.time() - t_grace assert t_grace >= 2 and t_grace < 5 diff --git a/unittests/test_testgenerators.py b/unittests/test_testgenerators.py index 0d9b876d53..12cc7b44ab 100644 --- a/unittests/test_testgenerators.py +++ b/unittests/test_testgenerators.py @@ -51,7 +51,7 @@ def sys0p0_nodes(): nodelist_iter = sys0p0_nodes() for tc in new_cases: - nodes = getattr(tc.check, '$nid') + nodes = getattr(tc.check, '.nid') if tc.partition.fullname == 'sys0:p0': assert nodes == next(nodelist_iter) else: diff --git a/unittests/test_utility.py b/unittests/test_utility.py index 97d32c5810..2cb5ac462e 100644 --- a/unittests/test_utility.py +++ b/unittests/test_utility.py @@ -2070,6 +2070,14 @@ def test_nodelist_utilities(): assert nodelist(nodes) == 'nid0[00-99]-x,nid100-y' assert expand('nid0[00-99]-x,nid100-y') == nodes + # Test edge condition when node lists jump from N to N+1 digits + # See GH issue #3338 + nodes = ['vs-std-0009', 'vs-std-0010', 
'vs-std-0099', 'vs-std-0100'] + assert nodelist(nodes) == 'vs-std-00[09-10],vs-std-0[099-100]' + assert expand('vs-std-00[09-10],vs-std-0[099-100]') == [ + 'vs-std-0009', 'vs-std-0010', 'vs-std-0099', 'vs-std-0100' + ] + # Test node duplicates assert nodelist(['nid001', 'nid001', 'nid002']) == 'nid001,nid00[1-2]' assert expand('nid001,nid00[1-2]') == ['nid001', 'nid001', 'nid002'] diff --git a/unittests/utility.py b/unittests/utility.py index 5d6414ef10..dffccb62ce 100644 --- a/unittests/utility.py +++ b/unittests/utility.py @@ -7,6 +7,7 @@ # unittests/utility.py -- Utilities used in unit tests # +import asyncio import functools import inspect import os @@ -37,6 +38,14 @@ USER_SYSTEM = None +def asyncio_run(task, *args): + loop = asyncio.get_event_loop() + if loop.is_closed(): + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop.run_until_complete(task(*args)) + + def init_runtime(): site_config = config.load_config('unittests/resources/config/settings.py') site_config.select_subconfig('generic')
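A minimal usage sketch (not part of the diff itself) of the `asyncio_run` helper added to `unittests/utility.py`: it shows how synchronous test code is meant to drive the job methods that this patch turns into coroutines, mirroring the `test_util.asyncio_run(job.submit)` calls in `unittests/test_schedulers.py`. The `job` object is a placeholder for whatever scheduler job a test prepares.

import unittests.utility as test_util

def run_job_to_completion(job):
    # submit() and wait() are coroutines after this change, so synchronous
    # test code runs them to completion through the event-loop wrapper above.
    test_util.asyncio_run(job.submit)
    test_util.asyncio_run(job.wait)
    return job.exitcode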
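Along the same lines, a rough sketch of how the new `run_command_asyncio` helper from `reframe/utility/osext.py` could be used to await several commands concurrently; the command strings and the `run_all` wrapper are illustrative only and assume the signature added in this patch (`cmd`, `check`, `timeout`, `shell`, `log`).

import asyncio

import reframe.utility.osext as osext

async def run_all(cmds):
    # Each call resolves to a subprocess.CompletedProcess once the
    # corresponding command has finished.
    return await asyncio.gather(
        *(osext.run_command_asyncio(c, check=False, timeout=30) for c in cmds)
    )

results = asyncio.run(run_all(['echo hello', 'uname -a']))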