diff --git a/.github/workflows/internal_tools.yml b/.github/workflows/internal_tools.yml new file mode 100644 index 0000000000..66a7d4c2d8 --- /dev/null +++ b/.github/workflows/internal_tools.yml @@ -0,0 +1,24 @@ +name: Install internal tools Python packages & run test suite + +on: + pull_request: + branches: [main] + push: + branches: [main] + merge_group: + types: [checks_requested] +jobs: + infra-bionemo: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + submodules: 'recursive' + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + cache: 'pip' + - run: pip install -r requirements-dev.txt -r requirements-test.txt + - run: pip install internal/infra-bionemo + - run: cd internal/infra-bionemo && pytest -v --cov=infra_bionemo --cov-report=term . diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7051c9ae8f..c9b91149ae 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: hooks: - id: license-header-check name: Run license-check script - entry: python scripts/license_check.py -c scripts -c sub-packages --license-header ./license_header --modify + entry: python internal/infra-bionemo/src/infra_bionemo/license_check.py -c scripts -c sub-packages -c docs -c internal --license-header ./license_header --modify language: python additional_dependencies: ["click==8.1.7"] pass_filenames: false diff --git a/.secrets.baseline b/.secrets.baseline index 8584dc4e50..c0824519f8 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -139,9 +139,9 @@ "filename": "pyproject.toml", "hashed_secret": "79670e9c9d1c7ea5b81a96a2053d81437712c78e", "is_verified": false, - "line_number": 44 + "line_number": 47 } ] }, - "generated_at": "2024-10-31T19:51:19Z" + "generated_at": "2024-11-01T22:26:03Z" } diff --git a/Dockerfile b/Dockerfile index 2814168015..2a74224ddf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -172,12 +172,14 @@ FROM dev AS development WORKDIR /workspace/bionemo2 
COPY --from=bionemo2-base /workspace/bionemo2/ . +COPY ./internal ./internal # because of the `rm -rf ./3rdparty` in bionemo2-base COPY ./3rdparty ./3rdparty USER root RUN < docs/CODE-REVIEW.md +├── CODEOWNERS +├── CONTRIBUTING.md -> docs/CONTRIBUTING.md +├── Dockerfile +├── LICENSE +│ ├── license.txt +│ └── third_party.txt +├── README.md +├── VERSION +├── ci +│ └── scripts +│ ├── nightly_test.sh +│ ├── pr_test.sh +│ └── static_checks.sh +├── docs +│ ├── CODE-REVIEW.md +│ ├── CONTRIBUTING.md +│ ├── Dockerfile +│ ├── README.md +│ ├── docs +│ │ ├── assets +│ │ │ ├── css +│ │ │ │ ├── color-schemes.css +│ │ │ │ ├── custom-material.css +│ │ │ │ └── fonts.css +│ │ │ └── images +│ │ │ ├── favicon.png +│ │ │ ├── logo-icon-black.svg +│ │ │ └── logo-white.svg +│ │ ├── developer-guide +│ │ │ ├── CODE-REVIEW.md +│ │ │ ├── CONTRIBUTING.md +│ │ │ └── jupyter-notebooks.ipynb +│ │ ├── index.md +│ │ └── user-guide +│ │ └── index.md +│ ├── mkdocs.yml +│ ├── requirements.txt +│ └── scripts +│ └── gen_ref_pages.py +├── launch.sh +├── license_header +├── pyproject.toml +├── requirements-cve.txt +├── requirements-dev.txt +├── requirements-test.txt +├── scripts # 🟢 Temporary scripts that demonstrate how to run some of these programs. These will be replaced. +│ ├── artifact_paths.yaml +│ ├── download_artifacts.py +│ ├── gpt-pretrain.py +│ ├── protein +│ │ └── esm2 +│ │ ├── esm2_pretrain.py +│ │ └── test_esm2_pretrain.py +│ └── singlecell +│ └── geneformer +│ ├── test_train.py +│ └── train.py +# 🟢 All work goes into `sub-packages` +# Sub-packages represent individually installable subsets of the bionemo codebase. We recommend that you +# create new sub-packages to track your experiments and save any updated models or utilities that you need. 
+├── sub-packages +│ ├── bionemo-core # 🟢 bionemo-core and bionemo-llm are top-level sub-packages that do not depend on others +│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src # 🟢 All sub-packages have a `src` and a `tests` sub-directory. +│ │ │ └── bionemo +│ │ │ └── core +│ │ │ ├── __init__.py +│ │ │ ├── api.py +│ │ │ ├── model +│ │ │ │ ├── __init__.py +│ │ │ │ └── config.py +│ │ │ └── utils +│ │ │ ├── __init__.py +│ │ │ ├── batching_utils.py +│ │ │ ├── dtypes.py +│ │ │ └── random_utils.py +│ │ └── tests # 🟢 Test files should mirror the `src` files, each named after its source file with a `test_` prefix (`test_[file_name].py`) +│ │ └── bionemo +│ │ ├── core +│ │ └── pytorch +│ │ └── utils +│ │ └── test_dtypes.py +│ ├── bionemo-esm2 # 🟢 The ESM2 model sub-package. This stores models and dataloaders necessary for pretraining and some example fine-tuning. +│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── _requirements-test.txt +│ │ ├── _requirements.txt +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src +│ │ │ └── bionemo +│ │ │ └── esm2 +│ │ │ ├── __init__.py +│ │ │ ├── api.py +│ │ │ └── model +│ │ │ ├── __init__.py +│ │ │ ├── attention.py +│ │ │ ├── embedding.py +│ │ │ ├── lr_scheduler.py +│ │ │ └── model.py +│ │ └── tests +│ │ └── bionemo +│ │ └── esm2 +│ │ ├── __init__.py +│ │ ├── conftest.py +│ │ └── model +│ │ ├── __init__.py +│ │ ├── test_attention.py +│ │ ├── test_embedding.py +│ │ ├── test_lr_scheduler.py +│ │ └── test_model.py +│ ├── bionemo-example_model # 🟢 a small example model that demonstrates how to write a megatron model from scratch and train on MNIST +│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── _requirements.txt +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src +│ │ │ └── bionemo +│ │ │ └── example_model +│ │ │ ├── __init__.py +│ │ │ └── lightning_basic.py +│ │ └── tests +│ │ └── bionemo +│ │ └── example_model +│ │ └── 
test_lightning_basic.py +│ ├── bionemo-fw # 🟢 a meta-package that pulls together all other packages. A user can install this and get all of bionemo. +│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── _requirements-test.txt +│ │ ├── _requirements.txt +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src +│ │ │ └── bionemo +│ │ │ └── fw +│ │ │ └── __init__.py +│ │ └── tests +│ │ ├── __init__.py +│ │ └── bionemo +│ │ └── fw +│ │ └── test_sub_package_imports.py +│ ├── bionemo-geneformer # 🟢 geneformer sub-module +│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── _requirements-test.txt +│ │ ├── _requirements.txt +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src +│ │ │ └── bionemo +│ │ │ └── geneformer +│ │ │ ├── __init__.py +│ │ │ ├── api.py +│ │ │ ├── model +│ │ │ │ ├── __init__.py +│ │ │ │ └── finetune_token_regressor.py +│ │ │ └── tokenizer +│ │ │ ├── __init__.py +│ │ │ ├── gene_tokenizer.py +│ │ │ └── label2id_tokenizer.py +│ │ └── tests +│ │ └── bionemo +│ │ └── geneformer +│ │ ├── __init__.py +│ │ ├── test_model.py +│ │ ├── test_stop_and_go.py +│ │ └── test_transformer_specs.py +│ ├── bionemo-llm # 🟢 shared model code for LLM style models, eg BERT variants, transformer variants, etc. 
+│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── _requirements-test.txt +│ │ ├── _requirements.txt +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src +│ │ │ └── bionemo +│ │ │ └── llm +│ │ │ ├── __init__.py +│ │ │ ├── lightning.py +│ │ │ ├── model +│ │ │ │ ├── __init__.py +│ │ │ │ ├── biobert +│ │ │ │ │ ├── __init__.py +│ │ │ │ │ ├── lightning.py +│ │ │ │ │ ├── model.py +│ │ │ │ │ ├── testing_utils.py +│ │ │ │ │ └── transformer_specs.py +│ │ │ │ ├── config.py +│ │ │ │ ├── layers.py +│ │ │ │ └── loss.py +│ │ │ └── utils +│ │ │ ├── __init__.py +│ │ │ ├── datamodule_utils.py +│ │ │ ├── iomixin_utils.py +│ │ │ ├── logger_utils.py +│ │ │ ├── remote.py +│ │ │ └── weight_utils.py +│ │ └── tests +│ │ ├── __init__.py +│ │ └── bionemo +│ │ └── llm +│ │ ├── __init__.py +│ │ ├── model +│ │ │ ├── biobert +│ │ │ │ └── test_transformer_specs.py +│ │ │ └── test_loss.py +│ │ ├── test_lightning.py +│ │ └── utils +│ │ ├── __init__.py +│ │ ├── test_datamodule_utils.py +│ │ ├── test_iomixin_utils.py +│ │ └── test_logger_utils.py +│ ├── bionemo-scdl # 🟢 +│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── examples +│ │ │ └── example_notebook.ipynb +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src +│ │ │ └── bionemo +│ │ │ └── scdl +│ │ │ ├── __init__.py +│ │ │ ├── api +│ │ │ │ ├── __init__.py +│ │ │ │ └── single_cell_row_dataset.py +│ │ │ ├── index +│ │ │ │ ├── __init__.py +│ │ │ │ └── row_feature_index.py +│ │ │ ├── io +│ │ │ │ ├── __init__.py +│ │ │ │ ├── single_cell_collection.py +│ │ │ │ └── single_cell_memmap_dataset.py +│ │ │ ├── scripts +│ │ │ │ ├── __init__.py +│ │ │ │ └── convert_h5ad_to_scdl.py +│ │ │ └── util +│ │ │ ├── __init__.py +│ │ │ ├── async_worker_queue.py +│ │ │ └── torch_dataloader_utils.py +│ │ └── tests +│ │ └── bionemo +│ │ └── scdl +│ │ ├── conftest.py +│ │ ├── index +│ │ │ └── test_row_feature_index.py +│ │ ├── io +│ │ │ ├── test_single_cell_collection.py +│ │ │ └── test_single_cell_memmap_dataset.py +│ │ └── 
util +│ │ ├── test_async_worker_queue.py +│ │ └── test_torch_dataloader_utils.py +│ ├── bionemo-testing +│ │ ├── LICENSE +│ │ ├── README.md +│ │ ├── _requirements.txt +│ │ ├── pyproject.toml +│ │ ├── requirements.txt +│ │ ├── setup.py +│ │ ├── src +│ │ │ └── bionemo +│ │ │ └── testing +│ │ │ ├── __init__.py +│ │ │ ├── callbacks.py +│ │ │ ├── harnesses +│ │ │ │ ├── __init__.py +│ │ │ │ └── stop_and_go.py +│ │ │ ├── megatron_parallel_state_utils.py +│ │ │ ├── testing_callbacks.py +│ │ │ └── utils.py +│ │ └── tests +│ │ └── bionemo +│ │ └── testing +│ │ └── test_megatron_parallel_state_utils.py +│ └── bionemo-webdatamodule +│ ├── LICENSE +│ ├── README.md +│ ├── pyproject.toml +│ ├── requirements.txt +│ ├── setup.py +│ ├── src +│ │ └── bionemo +│ │ └── webdatamodule +│ │ ├── __init__.py +│ │ ├── datamodule.py +│ │ └── utils.py +│ └── tests +│ └── bionemo +│ └── webdatamodule +│ ├── __init__.py +│ ├── conftest.py +│ └── test_datamodule.py +``` + +## Installation +### Initializing 3rd-party dependencies as git submodules + +For development, the NeMo and Megatron-LM dependencies are vendored in the bionemo-2 repository workspace as git +submodules. The pinned commits for these submodules represent the "last-known-good" versions of these packages that are +confirmed to be working with bionemo2 (and those that are tested in CI). 
+ +To initialize these sub-modules when cloning the repo, add the `--recursive` flag to the git clone command: + +```bash +git clone --recursive git@github.com:NVIDIA/bionemo-fw-ea.git +``` + +To download the pinned versions of these submodules within an existing git repository, run + +```bash +git submodule update --init --recursive +``` diff --git a/internal/infra-bionemo/LICENSE b/internal/infra-bionemo/LICENSE new file mode 100644 index 0000000000..d645695673 --- /dev/null +++ b/internal/infra-bionemo/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/internal/infra-bionemo/README.md b/internal/infra-bionemo/README.md new file mode 100644 index 0000000000..b29c1052b8 --- /dev/null +++ b/internal/infra-bionemo/README.md @@ -0,0 +1,39 @@ +# infra-bionemo +Tools maintained by the BioNeMo Framework Infrastructure team. + + +## Development +All code must be formatted & linted using `ruff` and type checked using `mypy`. + +To type check, run: +```bash +mypy --install-types --non-interactive --ignore-missing --check-untyped-defs . +``` + +All code must have type annotations. In special circumstances, such as private or helper functions, contributors may elide static +type annotations if it is beneficial. + +To run unit tests, use `pytest -v`. Unit tests must cover all features and known bug cases. + +To get code coverage reports, run: +```bash +pytest -v --cov=infra_bionemo --cov-report=term --cov-report=html . +``` +Then, open the [HTML coverage report](htmlcov/index.html) in your browser. Note that file coverage stats are reported +in the terminal too. + +### First Time Setup +For first time setup, be sure to install the development and test dependencies of the entire bionemo repository. +These are defined at the repository's top-level [`pyproject.toml`](../../pyproject.toml) file. Follow the instructions +outlined in the [top-level README](../../README.md). 
Once you have your local virtual environment ready, you may +install this project's code by running the following: +```bash +uv pip install --editable . +``` + +NOTE: you may have to re-install the project if there are new or removed packages. If the requirements haven't changed, + you can re-install without re-installing the dependencies by doing: `uv pip install --no-deps -e .` + +### Versioning +This project uses [Semantic Versioning 2.0](https://semver.org/). Contributors *MUST* update the `version` in +`pyproject.toml` correctly in their MRs. The CI will reject MRs that do not increment the version number correctly. diff --git a/internal/infra-bionemo/pyproject.toml b/internal/infra-bionemo/pyproject.toml new file mode 100644 index 0000000000..a114dc0bef --- /dev/null +++ b/internal/infra-bionemo/pyproject.toml @@ -0,0 +1,36 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" + +# For guidance, see: https://packaging.python.org/en/latest/guides/writing-pyproject-toml/ +[project] +name = "infra-bionemo" +version = "0.1.0" +authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }] +description = "Internal library of utilities and programs for BioNeMo-related infrastructure." 
+readme = "README.md" +requires-python = ">=3.10" +keywords = [] +license = {file = "LICENSE"} +classifiers = [ + "Programming Language :: Python :: 3.10", + "Private :: Do Not Upload", +] +dependencies = [ + "click>=8.1.7,<9.0.0", + "tomli>=2.0.2", + "tomli_w>=1.1.0", +] + +[project.scripts] +license-check = "infra_bionemo.license_check:entrypoint" +create-bionemo-project = "infra_bionemo.new_project.exe.bionemo_subpackage:entrypoint" +create-py-project = "infra_bionemo.new_project.exe.simple:entrypoint" +create-namespaced-project = "infra_bionemo.new_project.exe.namespace:entrypoint" + +[tool.pytest.ini_options] +testpaths = ["tests"] +filterwarnings = [ "ignore::DeprecationWarning",] + +[tool.coverage.run] +source = ["infra_bionemo"] diff --git a/internal/infra-bionemo/setup.py b/internal/infra-bionemo/setup.py new file mode 100644 index 0000000000..7c961a11bf --- /dev/null +++ b/internal/infra-bionemo/setup.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from setuptools import setup + + +if __name__ == "__main__": + setup() diff --git a/internal/infra-bionemo/src/infra_bionemo/__init__.py b/internal/infra-bionemo/src/infra_bionemo/__init__.py new file mode 100644 index 0000000000..25e6abfbc5 --- /dev/null +++ b/internal/infra-bionemo/src/infra_bionemo/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/scripts/license_check.py b/internal/infra-bionemo/src/infra_bionemo/license_check.py similarity index 94% rename from scripts/license_check.py rename to internal/infra-bionemo/src/infra_bionemo/license_check.py index 631285cce6..32f30d4790 100644 --- a/scripts/license_check.py +++ b/internal/infra-bionemo/src/infra_bionemo/license_check.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import ast import sys from dataclasses import dataclass @@ -44,15 +45,20 @@ ) LICENSE_HEADER: str = """ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual -# property and proprietary rights in and to this material, related -# documentation and any modifications thereto. Any use, reproduction, -# disclosure or distribution of this material and related documentation -# without an express license agreement from NVIDIA CORPORATION or -# its affiliates is strictly prohibited. +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """.strip() @@ -62,7 +68,7 @@ class HeaderNotFound(ValueError): pyfile: Path - def __str__(self) -> str: + def __str__(self) -> str: # noqa: D105 return f"{self.pyfile.name} does not have the license header!" @@ -79,6 +85,7 @@ def __str__(self) -> str: def license_check( pyfile: Path, *, license_header: str = LICENSE_HEADER, modify: bool, replace: bool = False ) -> Optional[LicenseCheckError]: + """Check Python file for license header, returning nothing on success or an error describing the failure.""" if not pyfile.is_file(): return IOError(f"{pyfile.name} file does not exist!") @@ -133,6 +140,7 @@ def has_header(pyfile_contents: str, *, license_header: str = LICENSE_HEADER) -> def append_license_header(pyfile_contents: str, *, license_header: str = LICENSE_HEADER, n_sep_lines: int = 2) -> str: """Appends the :param:`license_header` to the beginning of the input Python code (:param:`pyfile_contents`). 
+ Inserts :param:`n_sep_lines` newlines between the license header & Python file contents. """ spacer = "\n" * n_sep_lines @@ -421,7 +429,7 @@ def _error(noncompliant_files: Mapping[Path, LicenseCheckError], n_files_checked f"files that do not have the license header!{maybe_modify_msg}\n" ) for pyfile, error in noncompliant_files.items(): - error_message += f" {pyfile}: {error}\n" + error_message += f" {str(pyfile)}: {error}\n" return ValueError(error_message) diff --git a/internal/infra-bionemo/src/infra_bionemo/new_project/__init__.py b/internal/infra-bionemo/src/infra_bionemo/new_project/__init__.py new file mode 100644 index 0000000000..25e6abfbc5 --- /dev/null +++ b/internal/infra-bionemo/src/infra_bionemo/new_project/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/internal/infra-bionemo/src/infra_bionemo/new_project/api.py b/internal/infra-bionemo/src/infra_bionemo/new_project/api.py new file mode 100644 index 0000000000..778e4b5ece --- /dev/null +++ b/internal/infra-bionemo/src/infra_bionemo/new_project/api.py @@ -0,0 +1,412 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import string +from dataclasses import dataclass +from pathlib import Path +from typing import List, Sequence + +from infra_bionemo.license_check import is_valid_python +from infra_bionemo.new_project.templates import ( + pyproject_toml_setuptools, + pyproject_toml_subproject, + pytest_example, + readme_md, + requirements_txt, + setup_py, +) + + +__all__: Sequence[str] = ( + "File", + "Dir", + "create_on_filesystem", + "bionemo_subproject_structure", + "namespace_py_project_structure", + "py_project_structure", + "check", + "convert", +) + + +@dataclass(frozen=True) +class File: + """Virtual representation of a text file.""" + + name: str + contents: str + + +@dataclass(frozen=True) +class Dir: + """Virtual representation of a directory.""" + + name: str + contents: List["Dir" | File] + + +def create_on_filesystem(starting_location: Path, representation: Dir | File) -> None: + """Recursively constructs files and directories as specified. + + This function creates files and directories according to the supplied virtual representation. The relationship + between them is encoded using the :class:`Dir` and :class:`File` classes and specified via `contents` of a + :class:`Dir`. Files may have contents by populating the `contents` field of the :class:`File` class. Any contents + will be written to disk. 
These files and directories are written to disk from the :param:`starting_location`. Note + that as this function recursively executes, any :class:`Dir` specified in the contents of another :class:`Dir` + becomes the new :param:`starting_location`. + + Args: + starting_location: Where the contents will be written to on-disk. + representation: The virtual filesystem representation being created. + + Raises: + TypeError if the :param:`representation` is not a :class:`Dir` or :class:`File`. + """ + if not starting_location.is_dir(): + raise ValueError(f"❌ Starting location must be a directory. This is not: {starting_location}") + + match representation: + case File(name, contents): + fi = starting_location / name + fi.touch(exist_ok=False) + if len(contents) > 0: + with open(str(fi), "wt") as wt: + wt.write(contents) + + case Dir(name, contents): + d = starting_location / name + d.mkdir(exist_ok=False) + for child in contents: + create_on_filesystem(d, child) + + case _: + raise TypeError(f"😱 Expecting a {File} or a {Dir} but obtained a {type(representation)}") + + +def namespace_py_project_structure( + base_name: str, + project_module_name: str, + dependencies: List[str], + add_setup_py: bool = True, + add_test_reqs: bool = False, + add_dev_reqs: bool = False, + prefix_test_dirs: bool = False, +) -> Dir: + """Virtual representation of files and folders for a namespaced Python project. + + The returned `Dir` represents the entire directory containing a namespaced Python project. Such a project needs + things like a place to store the Python packages and modules (`src/`), a place for unit tests (`tests/`), + files to list project infrastructure (`requirements*.txt`, `pyproject.toml`, `setup.py`), and documentation + (`README.md`). + + It also needs to have the right directory setup to support PEP 420 namespace packages. Of note, the `src/` + directory will contain a directory for the namespace (`base_name`) that will **not** have a `__init__.py` file. 
+ However, its sub-package directories (first, the Python-friendly version of `project_module_name`) **will** have + `__init__.py` files like regular Python packages do. + + Note, unlike :func:`py_project_structure`, this function defaults to exclude the test & development dependencies + under `requirements-test.txt` and `requirements-dev.txt`, respectively. Additionally, this function will include the + `setup.py` file by default. + + Args: + base_name: The namespace package name. The import name for the project will follow `base_name.*`. + Note, however, that when used as a Python name, this value will have `-` converted to `_`. + project_module_name: Used in the project infrastructure & documentation files. It's also used to create the + first differentiated namespaced Python package and initial unit test file. This will be + the first sub-package created under the `base_name` namespace. Note, however, that when + used as a Python name, this value will have `-` converted to `_`. + dependencies: populates the generated `requirements.txt` file. + add_setup_py: if true, includes a `File` for `setup.py`. + add_test_reqs: If true, includes a `File` for `requirements-test.txt` populated with `pytest`. + add_dev_reqs: If true, includes a `File` for `requirements-dev.txt` populated with `ruff` & `ipdb`. + prefix_test_dirs: If present, then "test_" is prefixed to the name of each directory created under `tests/` + with "_" as the word separator. + + Returns: + Virtual representation of simple Python project on a filesystem. + + Raises: + ValueError If the :param:`base_name` or :param:`project_module_name` is not a valid Python identifier. 
def bionemo_subproject_structure(
    subproject_name: str,
    internal_dependencies: List[str],
) -> Dir:
    """Virtual representation of files and folders for a bionemo sub-project Python project.

    Very similar to :func:`namespace_py_project_structure`, but specific for creating new sub-projects in
    the bionemo framework repository. Like that function, the returned `Dir` represents the entire directory
    containing a namespaced Python project, with files and subdirectories set up for PEP 420 namespace packages.

    The generated project contains only `README.md`, `pyproject.toml`, `src/` and `tests/`: no `requirements*.txt`
    or `setup.py` files are created.

    Args:
        subproject_name: The bionemo sub-package name. Surrounding whitespace is ignored. Note the directory will
                         be `bionemo-<subproject_name>` and the Python import path will be
                         `import bionemo.<subproject_name>`.
                         When used as a Python name, this value will have `-` converted to `_`.
        internal_dependencies: Other bionemo subprojects to depend on.

    Returns:
        Virtual representation of simple Python project on a filesystem.

    Raises:
        ValueError If the :param:`subproject_name` is not acceptable to :func:`check`.
        ValueError If the :param:`internal_dependencies` are not all bionemo sub-projects.
    """
    # TODO some mild refactoring necessary to this and namespace project creation
    #      most logic is the same, but we want to have a private function to do it and then
    #      call that with the right checking from these 2 public-facing functions
    base_name = "bionemo"
    # check() validates the *stripped* name, so use the stripped name everywhere below too.
    # Otherwise " foo " passes validation but leaks whitespace into directory & package names.
    subproject_name = subproject_name.strip()
    check(base_name)
    check(subproject_name)

    project_name = f"{base_name}-{subproject_name}"

    base_module = convert(base_name)
    module_name = convert(subproject_name)

    project = Dir(
        name=project_name,
        contents=[
            File("README.md", readme_md(base_module, project_name)),
            File(
                "pyproject.toml",
                pyproject_toml_subproject(subproject_name, internal_dependencies),
            ),
            Dir(
                "src",
                contents=[
                    # namespace directory: intentionally has no __init__.py (PEP 420)
                    Dir(
                        name=base_module,
                        contents=[
                            Dir(
                                name=module_name,
                                contents=[
                                    File("__init__.py", contents=""),
                                ],
                            )
                        ],
                    )
                ],
            ),
            Dir(
                "tests",
                contents=[
                    # mirrors the src/ layout so tests sit next to the package they cover
                    Dir(
                        name=f"{base_module}",
                        contents=[
                            Dir(
                                name=f"{module_name}",
                                contents=[
                                    File(f"test_TODO_{base_module}_{module_name}.py", contents=pytest_example())
                                ],
                            )
                        ],
                    )
                ],
            ),
        ],
    )

    return project
Such a project needs + things like a place to store the Python packages and modules (`src/`), a place for unit tests (`tests/`), + files to list project infrastructure (`requirements*.txt`, `pyproject.toml`, `setup.py`), and documentation + (`README.md`). + + Note, unlike :func:`namespace_py_project_structure`, this function defaults to include the test & development + dependencies under `requirements-test.txt` and `requirements-dev.txt`, respectively. Additionally, this function + will not include the `setup.py` file by default. + + Args: + project_name: Used in the project infrastructure & documentation files. It's also used to create the first + Python package and initial unit test file. + dependencies: Populates the generated `requirements.txt` file. + add_setup_py: If true, includes a `File` for `setup.py`. + add_test_reqs: If true, includes a `File` for `requirements-test.txt` populated with `pytest`. + add_dev_reqs: If true, includes a `File` for `requirements-dev.txt` populated with `ruff` & `ipdb`. + prefix_test_dirs: If present, then "test_" is prefixed to the name of each directory created under `tests/` + with "_" as the word separator. + + Returns: + Virtual representation of simple Python project on a filesystem. + + Raises: + ValueError If the project name is not a valid Python package or module name. 
def check(project_module_name: str) -> None:
    """Checks whether or not the input is acceptable as a Python module or package name.

    The input is stripped of surrounding whitespace before it is examined. An acceptable name is non-empty, has no
    spaces, does not start with a digit, has no ASCII capital letters, uses `-` (never `_`) as its word separator
    and, once :func:`convert` has turned `-` into `_`, is a Python identifier that is not a reserved keyword.

    Args:
        project_module_name: The candidate name, written with `-` as the word separator.

    Raises:
        ValueError if the input is invalid. Error message will contain specific reason.
    """
    import keyword  # function-scope import: only this validation needs it

    project_module_name = project_module_name.strip()

    if len(project_module_name) == 0:
        raise ValueError(f"❌ Must be non-empty: {project_module_name=}")

    if " " in project_module_name:
        raise ValueError(f"❌ No empty spaces allowed in {project_module_name=}")

    if project_module_name[0] in string.digits:
        raise ValueError(f"❌ Cannot start with number: {project_module_name=}")

    for i, c in enumerate(project_module_name):
        if c in string.ascii_uppercase:
            raise ValueError(
                f'❌ Cannot have capital letters: character {i} ("{c}") is not allowed for {project_module_name=}'
            )

    if "_" in project_module_name:
        raise ValueError(f"❌ Use '-' instead of '_' as a word separator for {project_module_name=}")

    as_python = convert(project_module_name)

    # The name must be a single identifier. Only parsing "<name> = 10" is not enough: inputs such as
    # "foo.bar", "a,b" or "x[0]" are all valid assignment *targets* but are not valid module names.
    if not as_python.isidentifier():
        raise ValueError(
            f"❌ {project_module_name=} is invalid as a python module name ({as_python=}): "
            "it contains one or more not allowed characters"
        )
    if keyword.iskeyword(as_python):
        raise ValueError(
            f"❌ {project_module_name=} is invalid as a python module name ({as_python=}): "
            "it is a reserved Python keyword"
        )

    # Retained as a final parse-level safety net: make the name equal to an int in a small Python
    # program and try to parse it.
    if is_valid_python(f"{as_python} = 10") is not None:
        raise ValueError(
            f"❌ {project_module_name=} is invalid as a python module name ({as_python=}): "
            "it contains one or more not allowed characters"
        )


def convert(project_module_name: str) -> str:
    """Replaces hyphens with underscores and removes surrounding whitespace."""
    return project_module_name.strip().replace("-", "_")
+# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/internal/infra-bionemo/src/infra_bionemo/new_project/exe/bionemo_subpackage.py b/internal/infra-bionemo/src/infra_bionemo/new_project/exe/bionemo_subpackage.py new file mode 100644 index 0000000000..bda9f310a8 --- /dev/null +++ b/internal/infra-bionemo/src/infra_bionemo/new_project/exe/bionemo_subpackage.py @@ -0,0 +1,139 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def main(*, project_name: str, loc_sub_pack: str, relax_name_check: bool) -> None:
    """Creates a new `bionemo-<project_name>` sub-package and registers it as a bionemo-fw dependency.

    Args:
        project_name: Name of the new sub-package. A leading `bionemo-` is accepted and removed.
        loc_sub_pack: Path to the `sub-packages/` directory that will contain the new project.
        relax_name_check: If true, :param:`loc_sub_pack` does not need to be named exactly `sub-packages`.

    Raises:
        ValueError if :param:`loc_sub_pack` is not a directory, is not named `sub-packages` (unless relaxed),
            does not contain `bionemo-fw/pyproject.toml`, or if :param:`project_name` is rejected by `check`.
        FileExistsError if the sub-package directory already exists. Nothing is modified in that case.
    """
    location_sub_packages = Path(loc_sub_pack)

    project_name = project_name.removeprefix("bionemo-")

    full_project_name = f"bionemo-{project_name}"
    project_dir = location_sub_packages / full_project_name
    print(f"🔨 Creating {location_sub_packages}/{full_project_name}")

    if not location_sub_packages.is_dir():
        raise ValueError(
            f"❌ Need to specify location of sub-packages/ with --loc-sub-pack. Does not exist: {location_sub_packages}"
        )

    if not relax_name_check and location_sub_packages.name != "sub-packages":
        raise ValueError(
            f"❌ Must specify sub-packages/ as --loc-sub-pack, not: {location_sub_packages} "
            f"Otherwise, specify --relax-name-check to skip this check."
        )

    bionemo_fw = location_sub_packages / "bionemo-fw"
    if not bionemo_fw.is_dir():
        raise ValueError(
            "❌ bionemo-fw is missing from sub-packages! "
            f"Check that this exists: {location_sub_packages / 'bionemo-fw'}"
        )
    bionemo_fw_pyproject_toml = bionemo_fw / "pyproject.toml"
    if not bionemo_fw_pyproject_toml.is_file():
        raise ValueError(
            f"❌ bionemo-fw is missing its pyproject.toml file. Cannot add {full_project_name} as a dependency!"
        )

    check(project_name)

    # Refuse up-front when the target already exists. Without this guard the creation step fails on the
    # existing directory and the cleanup handler below would then delete that pre-existing project.
    if project_dir.exists():
        raise FileExistsError(f"❌ Refusing to overwrite, already exists: {project_dir}")

    internal_deps = []
    # UPDATE THIS LIST WITH NEW bionemo-* COMPONENT LIBRARIES!
    for component in ["bionemo-llm"]:
        if ask_yes_or_no(f"🤔 Do you want to depend on {component} ?"):
            internal_deps.append(component)

    new_project_representation = bionemo_subproject_structure(
        subproject_name=project_name,
        internal_dependencies=internal_deps,
    )

    print("🔨 Creating new project on file system.")
    try:
        # create the bionemo subpackage project
        create_on_filesystem(location_sub_packages, new_project_representation)

        # add to bionemo-fw's requirements
        _add_dependency(bionemo_fw_pyproject_toml, full_project_name)

    except Exception:  # pragma: no cover
        print("❌ ERROR: failed to create! Cleaning up.")
        # project_dir did not exist before this call (checked above), so this only removes what we
        # created. ignore_errors keeps a cleanup problem from masking the original exception.
        shutil.rmtree(str(project_dir), ignore_errors=True)
        raise

    print(f"✅ Created {full_project_name} and added as a dependency to the bionemo-fw package 🎉")
The project.dependencies section is not a list, it is a " + f'{type(fw_toml["project"]["dependencies"])=}, found in: ' + f"{bionemo_fw_pyproject_toml}" + ) + fw_toml["project"]["dependencies"].append(full_project_name) + + fw_toml_s = tomli_w.dumps(fw_toml) + with open(str(bionemo_fw_pyproject_toml), "wt") as wt: + wt.write(fw_toml_s) + + +if __name__ == "__main__": + entrypoint() # pragma: no cover diff --git a/internal/infra-bionemo/src/infra_bionemo/new_project/exe/namespace.py b/internal/infra-bionemo/src/infra_bionemo/new_project/exe/namespace.py new file mode 100644 index 0000000000..e17abb444d --- /dev/null +++ b/internal/infra-bionemo/src/infra_bionemo/new_project/exe/namespace.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def main(
    *,
    namespace: str,
    module: str,
    location: str,
    no_test_append: bool,
) -> None:
    """Creates a new namespaced Python project named `<namespace>-<module>` under :param:`location`.

    Args:
        namespace: Name of the Python base namespace.
        module: Name of the first sub-package in the namespace.
        location: Directory in which the project is created. Created, with parents, if it does not exist.
        no_test_append: If true, directories under `tests/` are not prefixed with `test_`.

    Raises:
        ValueError if :param:`location` is a file, or if either name is rejected by `check`.
        FileExistsError if the project directory already exists. Nothing is modified in that case.
    """
    loc = Path(location)
    project_name = f"{namespace}-{module}"
    project_dir = loc / project_name
    print(f"🔨 Creating namespaced project {loc}/{project_name}")

    if loc.is_file():
        raise ValueError("❌ --location is a file!")

    # validate the names before touching the filesystem so a bad name leaves nothing behind
    check(namespace)
    check(module)

    # Refuse up-front when the target already exists. Without this guard the creation step fails on the
    # existing directory and the cleanup handler below would then delete that pre-existing project.
    if project_dir.exists():
        raise FileExistsError(f"❌ Refusing to overwrite, already exists: {project_dir}")

    loc.mkdir(parents=True, exist_ok=True)

    new_project_representation = namespace_py_project_structure(
        base_name=namespace,
        project_module_name=module,
        dependencies=[],
        add_setup_py=False,
        add_test_reqs=True,
        add_dev_reqs=True,
        prefix_test_dirs=not no_test_append,
    )

    print("🔨 Creating new namespace Python project on file system.")
    try:
        # create the Python project
        create_on_filesystem(loc, new_project_representation)

    except Exception:  # pragma: no cover
        print("❌ ERROR: failed to create! Cleaning up.")
        # project_dir did not exist before this call (checked above), so this only removes what we
        # created. ignore_errors keeps a cleanup problem from masking the original exception.
        shutil.rmtree(str(project_dir), ignore_errors=True)
        raise

    print(f"✅ Created namespaced {project_name} in {loc} 🎉")
def main(*, project_name: str, location: str) -> None:
    """Creates a new, non-namespaced Python project named :param:`project_name` under :param:`location`.

    Args:
        project_name: Name of the new Python project & module.
        location: Directory in which the project is created. Created, with parents, if it does not exist.

    Raises:
        ValueError if :param:`location` is a file, or if :param:`project_name` is rejected by `check`.
        FileExistsError if the project directory already exists. Nothing is modified in that case.
    """
    loc = Path(location)
    project_dir = loc / project_name
    print(f"🔨 Creating {loc}/{project_name}")

    if loc.is_file():
        raise ValueError("❌ --location is a file!")

    # validate the name before touching the filesystem so a bad name leaves nothing behind
    check(project_name)

    # Refuse up-front when the target already exists. Without this guard the creation step fails on the
    # existing directory and the cleanup handler below would then delete that pre-existing project.
    if project_dir.exists():
        raise FileExistsError(f"❌ Refusing to overwrite, already exists: {project_dir}")

    loc.mkdir(parents=True, exist_ok=True)

    new_project_representation = py_project_structure(
        project_name=project_name,
        dependencies=[],
        add_setup_py=False,
        add_test_reqs=True,
        add_dev_reqs=True,
        prefix_test_dirs=True,
    )

    print("🔨 Creating new project on file system.")
    try:
        # create the Python project
        create_on_filesystem(loc, new_project_representation)

    except Exception:  # pragma: no cover
        print("❌ ERROR: failed to create! Cleaning up.")
        # project_dir did not exist before this call (checked above), so this only removes what we
        # created. ignore_errors keeps a cleanup problem from masking the original exception.
        shutil.rmtree(str(project_dir), ignore_errors=True)
        raise

    print(f"✅ Created {project_name} in {loc} 🎉")
def pyproject_toml_subproject(subproject_name: str, internal_dependencies: Sequence[str]) -> str:
    """A pyproject.toml suitable as a bionemo sub-project.

    Each accepted internal dependency is written as a quoted TOML string after the always-present
    `'bionemo-core'` entry of the `dependencies` array. A redundant `bionemo-core` entry is skipped,
    with a note printed to STDERR.

    Args:
        subproject_name: name of the project's Python package, not the top-level namespaced one.
        internal_dependencies: list of other bionemo sub-projects to depend on. Each one is stripped of
                               surrounding whitespace and must start with `bionemo-`.

    Returns:
        pyproject.toml contents that configure all aspects of the Python project. Uses setuptools and uv.

    Raises:
        ValueError wrapping any encountered exception.
        ValueError if providing a non-bionemo internal dependency.
    """
    ok_internal_deps = []
    for x in internal_dependencies:
        x = x.strip()
        # a single quote cannot appear inside the TOML literal string each dependency is written as
        if len(x) == 0 or not x.startswith("bionemo-") or "'" in x:
            raise ValueError(f"Invalid internal dependency: {x}")
        if x == "bionemo-core":
            print("bionemo-core is always a dependency, ignoring redundant inclusion", file=sys.stderr)
        else:
            ok_internal_deps.append(x)

    # Array elements must be quoted strings: emitting the bare names produces invalid TOML.
    # An empty result is fine too, since TOML allows the trailing comma left after 'bionemo-core'.
    quoted_internal_deps = ", ".join(f"'{dep}'" for dep in ok_internal_deps)

    try:
        return Template(_pyproject_toml_subproject).substitute(
            subproject_name=subproject_name,
            internal_deps=quoted_internal_deps,
        )
    except Exception as e:  # pragma: no cover
        raise ValueError("😱 Creation of pyproject.toml for bionemo sub-project failed!") from e
# Template for a bionemo sub-project's pyproject.toml. Placeholders: ${subproject_name}, ${internal_deps}.
# NOTE: the version is read from the VERSION file, so it must be declared as `dynamic` and configured
#       under [tool.setuptools.dynamic]. Inside [project], `version` is only allowed to be a plain string.
_pyproject_toml_subproject: str = """
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "bionemo-${subproject_name}"
readme = "README.md"
description = ""
authors = [{ name = "BioNeMo Team", email = "bionemofeedback@nvidia.com" }]
requires-python = ">=3.10"
license = { file = "LICENSE" }
dynamic = ["version"]
dependencies = [
    # internal
    'bionemo-core', ${internal_deps}
    # external
]

[tool.setuptools.packages.find]
where = ["src"]
include = ["bionemo.*"]
namespaces = true
exclude = ["test*."]

[tool.setuptools.dynamic]
version = { file = "VERSION" }

[tool.uv]
cache-keys = [{ git = true }]
"""
+lint.select = ["C", "E", "F", "I", "W",] +line-length = 120 + +# Ignore import violations in all `__init__.py` files. +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401", "F403", "F811",] + +[tool.ruff.lint.isort] +lines-after-imports = 2 +known-first-party = ["${package_name}"] + +[tool.ruff.lint.pydocstyle] +convention = "google" + +""".strip() + +_setup_py: str = """ +from setuptools import setup + + +if __name__ == "__main__": + setup() +""".strip() + + +_readme_md: str = """ +# ${project_name} + +To install, execute the following: +```bash +pip install -e . +``` + +To run unit tests, execute: +```bash +pytest -v . +``` + +""".strip() + + +_pytest_example: str = """ +import pytest +from pytest import fixture, raises, mark + + +def test_todo() -> None: + raise ValueError(f"Implement tests! Make use of {fixture} for data, {raises} to check for " + f"exceptional cases, and {mark} as needed") + +""".strip() diff --git a/internal/infra-bionemo/src/infra_bionemo/new_project/utils.py b/internal/infra-bionemo/src/infra_bionemo/new_project/utils.py new file mode 100644 index 0000000000..de101e2167 --- /dev/null +++ b/internal/infra-bionemo/src/infra_bionemo/new_project/utils.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def ask_yes_or_no(message: str) -> bool:
    """Ask the user a yes/no question on STDOUT and read the answer from STDIN.

    The answer is stripped of surrounding whitespace and lowercased before it is interpreted. The prompt
    advertises 'y' and 'n', but the long forms 'yes' and 'no' are understood as well. Any other answer
    prints an error and asks again: the loop only ends once an unambiguous answer has been read.

    Args:
        message: The question, shown to the user in front of the ' [y/n]' hint.

    Returns:
        True for an affirmative answer ('y' or 'yes'). False for a negative answer ('n' or 'no').

    Raises:
        ValueError iff message is the empty string or only consists of whitespace.
    """
    if not message.strip():
        raise ValueError("Must supply non-empty message for STDOUT user prompt.")

    # every accepted spelling, mapped to the boolean it stands for
    recognized = {"y": True, "yes": True, "n": False, "no": False}
    prompt = f"{message} [y/n]\n>> "

    while True:
        print(prompt, end="")
        reply = input().strip().lower()
        if reply in recognized:
            return recognized[reply]
        print(f'😱 ERROR: must supply "y" or "n", not "{reply}". Try again!\n')
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/internal/infra-bionemo/tests/test_infra_bionemo/test_license_check.py b/internal/infra-bionemo/tests/test_infra_bionemo/test_license_check.py new file mode 100644 index 0000000000..870f23bd29 --- /dev/null +++ b/internal/infra-bionemo/tests/test_infra_bionemo/test_license_check.py @@ -0,0 +1,235 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from pathlib import Path
from typing import Tuple

from pytest import fixture, raises

from infra_bionemo.license_check import (
    Checked,
    HeaderNotFound,
    append_license_header,
    check_license_project_files,
    ensure_license_starts_with_pound,
    has_header,
    is_valid_python,
    license_check,
    remove_existing_license_header,
)
from infra_bionemo.license_check import (
    main as license_check_cli_main,
)


@fixture(scope="module")
def valid() -> str:
    """Source of a small, syntactically valid Python module that carries no license header."""
    return """
def double(x: int) -> int:
    return x * 2
print(f"{double(10)=}")
    """.strip()


@fixture(scope="module")
def invalid() -> str:
    """Source that is not valid Python 3 (Python 2 style print statement)."""
    return """
print "nope, this is no longer ok!"
    """.strip()


@fixture(scope="module")
def license_text() -> str:
    """A stand-in license header in which every line is already a '#' comment."""
    return """
# The license would be here.
# And continue on.
# For maybe a few more
# lines.
    """.strip()


@fixture(scope="module")
def license_text_not_escaped() -> str:
    """The same stand-in license header, but without the leading '#' on each line."""
    return """
The license would be here.
And continue on.
For maybe a few more
lines.
    """.strip()


@fixture(scope="module")
def full(license_text: str, valid: str) -> str:
    """The valid module source preceded by the license header and one blank line."""
    return f"{license_text}\n\n{valid}"


def test_is_valid_python(valid: str, invalid: str):
    """Valid source yields None; invalid source yields the SyntaxError as a value."""
    assert is_valid_python(valid) is None
    outcome = is_valid_python(invalid)
    assert isinstance(outcome, SyntaxError)


def test_has_header(full: str, license_text: str):
    """A file that starts with the header is recognized as having it."""
    assert has_header(full, license_header=license_text) is True


def test_append_license_header(valid: str, license_text: str, full: str):
    """Adding the header with two separator lines reproduces the `full` fixture."""
    with_header = append_license_header(valid, license_header=license_text, n_sep_lines=2)
    assert with_header == full


def test_license_check(valid: str, invalid: str, full: str, license_text: str, tmp_path: Path):
    """Walk a single file through every outcome that `license_check` reports."""
    target = tmp_path / "_testing_pyfile_89712652015.py"

    # the file has not been created yet
    assert isinstance(license_check(target, modify=False), IOError)

    # the file exists but does not parse
    target.write_text(invalid)
    assert isinstance(license_check(target, modify=False), SyntaxError)

    # the file parses but has no header
    target.write_text(valid)
    assert isinstance(license_check(target, modify=False), HeaderNotFound)

    # letting the check modify the file repairs it, and a re-check then passes
    assert license_check(target, modify=True) is None
    assert license_check(target, modify=False) is None

    # a file that already has the header passes as-is
    target.write_text(full)
    assert license_check(target, modify=False, license_header=license_text) is None


def test_check_license_project_files(valid: str, invalid: str, full: str, license_text: str, tmp_path: Path):
    """Check a whole directory tree, first read-only and then with modification enabled."""
    project_dir = tmp_path / "python_package_for_testing_1245"

    def run_check(*, modify: bool) -> Checked:
        return check_license_project_files(
            project_dir, modify=modify, replace=False, license_header=license_text
        )

    # the directory does not exist yet
    with raises(AssertionError):
        run_check(modify=False)

    # build the directory tree and its .py files
    _, invalid_fi, _ = _create_py_project_and_files(project_dir, valid, invalid, full)

    # read-only check: the unparsable file and the header-less file are both non-compliant
    report: Checked = run_check(modify=False)
    assert report.n_files == 3
    assert len(report.noncompliant_files) == 2

    # drop the unparsable file
    invalid_fi.unlink()
    # with modification enabled, no non-compliant file is reported for the remaining two
    report = run_check(modify=True)
    assert report.n_files == 2
    assert len(report.noncompliant_files) == 0


def test_ensure_license_starts_with_pound(license_text: str, license_text_not_escaped: str):
    """Un-commented license text is turned into the '#'-prefixed form."""
    escaped = ensure_license_starts_with_pound(license_text_not_escaped)
    assert escaped == license_text


def test_remove_existing_license_header(valid: str, license_text: str):
    """Stripping the header leaves nothing of a header-only file and restores the original code."""
    header_only_result = remove_existing_license_header(license_text)
    assert len(header_only_result) == 0, "Removing from a header-only file should result in an empty string."

    with_header = append_license_header(valid, license_header=license_text, n_sep_lines=1)
    assert remove_existing_license_header(with_header) == valid


def test_main(valid: str, invalid: str, full: str, license_text: str, tmp_path: Path):
    """Exercise the CLI entry point's argument validation and its modify mode."""
    project_dir = tmp_path / "different_python_package_for_testing_1245"
    _, invalid_fi, _ = _create_py_project_and_files(project_dir, valid, invalid, full)

    full_fi_2 = tmp_path / "another_full.py"
    full_fi_2.write_text(full)

    invalid_fi_2 = tmp_path / "another_invalid.py"
    invalid_fi_2.write_text(invalid)

    # at least one file or directory has to be supplied
    with raises(ValueError):
        license_check_cli_main(
            modify=False,
            replace=False,
            license_header_contents=license_text,
            files=[],
            directories=[],
        )

    # the license contents must not be empty
    with raises(ValueError):
        license_check_cli_main(
            modify=False,
            replace=False,
            license_header_contents="",
            files=[full_fi_2],
            directories=[project_dir],
        )

    # an unparsable file plus a valid file that lacks the header
    with raises(ValueError):
        license_check_cli_main(
            modify=False,
            replace=False,
            license_header_contents=license_text,
            files=[full_fi_2, invalid_fi_2],
            directories=[project_dir],
        )

    # once the unparsable file is gone, modify=True can repair the rest
    invalid_fi.unlink()
    n_checked: int = license_check_cli_main(
        modify=True,
        replace=False,
        license_header_contents=license_text,
        files=[full_fi_2],
        directories=[project_dir],
    )
    assert n_checked == 3


def _create_py_project_and_files(project_dir: Path, valid: str, invalid: str, full: str) -> Tuple[Path, Path, Path]:
    """Create a project directory holding the valid, invalid, and full sources as .py files.

    The three files sit at different nesting depths so that recursive directory traversal is exercised.

    Returns:
        The (valid, invalid, full) file paths, in that order.
    """
    project_dir.mkdir()

    valid_fi = project_dir / "valid.py"
    invalid_fi = project_dir / "another_package" / "invalid.py"
    full_fi = project_dir / "different" / "nested" / "packages" / "full.py"

    # valid.py lives directly in project_dir; the other two need their parent packages created
    invalid_fi.parent.mkdir()
    full_fi.parent.mkdir(parents=True)

    for path, source in ((valid_fi, valid), (invalid_fi, invalid), (full_fi, full)):
        path.write_text(source)
    return valid_fi, invalid_fi, full_fi
+# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from pathlib import Path + +from pytest import mark, raises + +from infra_bionemo.new_project.api import ( + Dir, + File, + bionemo_subproject_structure, + check, + convert, + create_on_filesystem, + namespace_py_project_structure, + py_project_structure, +) + + +def test_simple_file_and_dir(): + f = File("start.txt", contents="Hello world!") + d = Dir("location", contents=[f]) + assert f in d.contents + + +def test_create_on_filesystem_errors(tmpdir): + with raises(ValueError): + create_on_filesystem(Path("./.12341251820y67451-does-not-exist"), File("_", contents="")) + with raises(TypeError): + create_on_filesystem(Path(tmpdir), None) # type:ignore + + +def test_check_errors(): + with raises(ValueError): + check("") + + with raises(ValueError): + check("1not-ok") + + with raises(ValueError): + check("NOT-OK") + + with raises(ValueError): + check("not_ok") + + with raises(ValueError): + check("not$ok") + + +@mark.parametrize( + "project_name", + [ + "bionemo", + "bionemo-geneformer", + "infra-bionemo", + "super-awesome-tools", + ], +) +def test_project_name_check(project_name): + check(project_name) # raises error + with raises(ValueError): + check("not a valid project name") + + +@mark.parametrize( + "input_,expected", + [ + ("bionemo", "bionemo"), + (" bionemo-geneformer ", "bionemo_geneformer"), + ("infra-bionemo", "infra_bionemo"), + ("super-awesome-tools", 
"super_awesome_tools"), + ], +) +def test_module_name_convert(input_, expected): + actual = convert(input_) + assert actual == expected, f"{input_=} did not convert into {expected=}, instead was {actual=}" + + +def test_bionemo_subproject(): + d = bionemo_subproject_structure("geneformer-extras", ["bionemo-llm", "bionemo-geometric"]) + _assert_has_core_toplevel(d) + assert not _has_file_by_name("setup.py", d, descend=False) + assert not _has_file_by_name("requirements.txt", d, descend=False) + assert not _has_file_by_name("requirements-test.txt", d, descend=False) + assert not _has_file_by_name("requirements-dev.txt", d, descend=False) + assert _has_file_by_name("__init__.py", d, descend=True) + assert _has_file_by_name("test_TODO_bionemo_geneformer_extras.py", d, descend=True) + assert _has_dir_by_name("bionemo", d, descend=True) + assert _has_dir_by_name("geneformer_extras", d, descend=True) + + +def test_namespace_project(): + d = namespace_py_project_structure("bionemo", "geneformer-extras", ["nemo", "megatron"]) + _assert_has_core_toplevel(d) + assert _has_file_by_name("requirements.txt", d, descend=False) + assert _has_file_by_name("setup.py", d, descend=False) + assert not _has_file_by_name("requirements-test.txt", d, descend=False) + assert not _has_file_by_name("requirements-dev.txt", d, descend=False) + assert _has_file_by_name("__init__.py", d, descend=True) + assert _has_file_by_name("test_TODO_bionemo_geneformer_extras.py", d, descend=True) + assert _has_dir_by_name("bionemo", d, descend=True) + assert _has_dir_by_name("geneformer_extras", d, descend=True) + + d = namespace_py_project_structure( + "bionemo", + "geneformer-extras", + ["nemo", "megatron"], + add_test_reqs=True, + add_dev_reqs=True, + prefix_test_dirs=True, + ) + assert _has_file_by_name("requirements-test.txt", d, descend=False) + assert _has_file_by_name("requirements-dev.txt", d, descend=False) + assert _has_dir_by_name("test_bionemo", d, descend=True) + assert 
_has_dir_by_name("test_geneformer_extras", d, descend=True) + + +def test_simple_project(): + d = py_project_structure("infra-bionemo", ["nltk"]) + _assert_has_core_toplevel(d) + assert _has_file_by_name("requirements.txt", d, descend=False) + assert not _has_file_by_name("setup.py", d, descend=False) + assert _has_file_by_name("requirements-test.txt", d, descend=False) + assert _has_file_by_name("requirements-dev.txt", d, descend=False) + assert _has_file_by_name("__init__.py", d, descend=True) + assert _has_file_by_name("test_TODO_infra_bionemo.py", d, descend=True) + assert _has_dir_by_name("infra_bionemo", d, descend=True) + + d = py_project_structure( + "infra-bionemo", + ["nltk"], + add_setup_py=True, + prefix_test_dirs=True, + ) + assert _has_file_by_name("setup.py", d, descend=False) + assert _has_dir_by_name("test_infra_bionemo", d, descend=True) + + +def _assert_has_core_toplevel(x: Dir): + assert _has_file_by_name("README.md", x, descend=False) + assert _has_file_by_name("pyproject.toml", x, descend=False) + assert _has_dir_by_name("src", x, descend=False) + assert _has_dir_by_name("tests", x, descend=False) + + +def _has_file_by_name(f_or_name: File | str, x: Dir, descend: bool) -> bool: + match f_or_name: + case File(name, _): + filename: str = name + case str(): + filename = f_or_name + case _: + raise TypeError(f"Expecting f_or_name to be File or str, not {type(f_or_name)}, {f_or_name=}") + + for c in x.contents: + if isinstance(c, File): + if c.name == filename: + return True + if descend and isinstance(c, Dir): + found = _has_file_by_name(filename, c, descend=True) + if found: + return True + return False + + +def _has_dir_by_name(d_or_name: Dir | str, x: Dir, descend: bool) -> bool: + match d_or_name: + case Dir(name, _): + dirname: str = name + case str(): + dirname = d_or_name + case _: + raise TypeError(f"Expecting d_or_name to be Dir or str, not {type(d_or_name)}, {d_or_name=}") + + for c in x.contents: + if isinstance(c, Dir): + if c.name == 
dirname: + return True + if descend: + found = _has_dir_by_name(dirname, c, descend=True) + if found: + return True + return False diff --git a/internal/infra-bionemo/tests/test_infra_bionemo/test_new_project/test_cli_tools.py b/internal/infra-bionemo/tests/test_infra_bionemo/test_new_project/test_cli_tools.py new file mode 100644 index 0000000000..e0bb2e28ea --- /dev/null +++ b/internal/infra-bionemo/tests/test_infra_bionemo/test_new_project/test_cli_tools.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-Apache2 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import io +from pathlib import Path + +import tomli_w +from pytest import raises + +from infra_bionemo.new_project.exe.bionemo_subpackage import main as main_bionemo_sub +from infra_bionemo.new_project.exe.namespace import main as main_namespace +from infra_bionemo.new_project.exe.simple import main as main_simple + + +def test_create_namespace_cli(tmpdir): + (Path(tmpdir) / "file").touch() + # not a dir! 
+ with raises(ValueError): + main_namespace(namespace="acme", module="rocket", location=f"{str(tmpdir)}/file", no_test_append=False) + (Path(tmpdir) / "file").unlink() + + main_namespace(namespace="acme", module="rocket", location=str(tmpdir), no_test_append=False) + + location = Path(str(tmpdir)) / "acme-rocket" + assert location.is_dir() + assert (location / "src").is_dir() + assert (location / "src" / "acme").is_dir() + assert not (location / "src" / "acme" / "__init__.py").exists() + assert (location / "src" / "acme" / "rocket").is_dir() + assert (location / "src" / "acme" / "rocket" / "__init__.py").is_file() + assert (location / "tests").is_dir() + assert (location / "tests" / "test_acme").is_dir() + assert (location / "tests" / "test_acme" / "test_rocket").is_dir() + assert (location / "tests" / "test_acme" / "test_rocket" / "test_TODO_acme_rocket.py").is_file() + assert (location / "README.md").is_file() + assert not (location / "setup.py").exists() + assert (location / "pyproject.toml").is_file() + assert (location / "requirements.txt").is_file() + assert (location / "requirements-test.txt").is_file() + assert (location / "requirements-dev.txt").is_file() + + +def test_create_simple_cli(tmpdir): + (Path(tmpdir) / "file").touch() + # not a dir! 
+ with raises(ValueError): + main_simple(project_name="simple", location=f"{str(tmpdir)}/file") + (Path(tmpdir) / "file").unlink() + + main_simple(project_name="simple", location=str(tmpdir)) + + location = Path(str(tmpdir)) / "simple" + assert location.is_dir() + assert (location / "src").is_dir() + assert (location / "src" / "simple").is_dir() + assert (location / "src" / "simple" / "__init__.py").is_file() + assert (location / "tests").is_dir() + assert (location / "tests" / "test_simple").is_dir() + assert (location / "tests" / "test_simple" / "test_TODO_simple.py").is_file() + assert (location / "README.md").is_file() + assert not (location / "setup.py").exists() + assert (location / "pyproject.toml").is_file() + assert (location / "requirements.txt").is_file() + assert (location / "requirements-test.txt").is_file() + assert (location / "requirements-dev.txt").is_file() + + +def test_create_bionemo_cli(tmpdir, monkeypatch): + # not a dir! + with raises(ValueError): + main_bionemo_sub( + project_name="bionemo-supermodel", + loc_sub_pack=f"{str(tmpdir)}/file", + relax_name_check=False, + ) + + # no sub-packages dir! + with raises(ValueError): + main_bionemo_sub( + project_name="bionemo-supermodel", + loc_sub_pack=str(tmpdir), + relax_name_check=False, + ) + + sub_packages = Path(tmpdir) / "sub-packages" + sub_packages.mkdir(parents=True, exist_ok=True) + + # no bionemo-fw dir! + with raises(ValueError): + main_bionemo_sub( + project_name="bionemo-supermodel", + loc_sub_pack=str(sub_packages), + relax_name_check=False, + ) + + (sub_packages / "bionemo-fw").mkdir(parents=True, exist_ok=True) + + # no pyproject.toml in bionemo-fw dir! 
+ with raises(ValueError): + main_bionemo_sub( + project_name="bionemo-supermodel", + loc_sub_pack=str(sub_packages), + relax_name_check=False, + ) + + # create & add basic pyproject.toml structure that's checked + bionemo_fw_pyproject_toml = sub_packages / "bionemo-fw" / "pyproject.toml" + bionemo_fw_pyproject_toml.touch(exist_ok=True) + with open(str(bionemo_fw_pyproject_toml), "wt") as wt: + wt.write(tomli_w.dumps({"project": {"dependencies": []}})) + + with monkeypatch.context() as ctx: + ctx.setattr("sys.stdin", io.StringIO("y")) + main_bionemo_sub( + project_name="bionemo-supermodel", + loc_sub_pack=str(sub_packages), + relax_name_check=False, + ) + + location = sub_packages / "bionemo-supermodel" + assert location.is_dir() + assert (location / "src").is_dir() + assert (location / "src" / "bionemo").is_dir() + assert not (location / "src" / "bionemo" / "__init__.py").exists() + assert (location / "src" / "bionemo" / "supermodel").is_dir() + assert (location / "src" / "bionemo" / "supermodel" / "__init__.py").is_file() + assert (location / "tests").is_dir() + assert (location / "tests" / "bionemo").is_dir() + assert (location / "tests" / "bionemo" / "supermodel").is_dir() + assert (location / "tests" / "bionemo" / "supermodel" / "test_TODO_bionemo_supermodel.py").is_file() + assert (location / "README.md").is_file() + assert not (location / "setup.py").exists() + assert (location / "pyproject.toml").is_file() + assert not (location / "requirements.txt").exists() + assert not (location / "requirements-test.txt").exists() + assert not (location / "requirements-dev.txt").exists() diff --git a/internal/infra-bionemo/tests/test_infra_bionemo/test_new_project/test_utils.py b/internal/infra-bionemo/tests/test_infra_bionemo/test_new_project/test_utils.py new file mode 100644 index 0000000000..2be994408e --- /dev/null +++ b/internal/infra-bionemo/tests/test_infra_bionemo/test_new_project/test_utils.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 
# SPDX-License-Identifier: LicenseRef-Apache2
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import io

from pytest import raises

from infra_bionemo.new_project.utils import ask_yes_or_no


def test_ask_yes_or_no(monkeypatch):
    """An empty prompt is rejected; 'y' and 'n' map to True and False; an invalid answer re-prompts."""
    with raises(ValueError):
        ask_yes_or_no("")

    # (text fed to STDIN, expected truthiness of the answer)
    scenarios = (
        ("y", True),
        ("n", False),
        ("loop once\ny", True),  # first line is not an answer, so the prompt repeats once
    )
    for typed, expected in scenarios:
        with monkeypatch.context() as ctx:
            ctx.setattr("sys.stdin", io.StringIO(typed))
            assert bool(ask_yes_or_no("hello world?")) is expected
+known-first-party = ["bionemo", "infra_bionemo"] [tool.ruff.lint.pydocstyle] convention = "google" @@ -111,7 +115,7 @@ norecursedirs = ["3rdparty"] addopts = ["--ignore=3rdparty"] [tool.pyright] -include = ["./scripts/", "./sub-packages/"] +include = ["./scripts/", "./sub-packages/", "./internal/"] exclude = ["*/tests/"] executionEnvironments = [ { "root" = ".", pythonVersion = "3.10", extraPaths = [ @@ -131,5 +135,6 @@ executionEnvironments = [ './sub-packages/bionemo-testing/src', './sub-packages/bionemo-webdatamodule/src', # internal + './internal/infra-bionemo/src', ] }, ]