From 215a9a2d4f3ddfa00a24931b86012e9fb1da18ed Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 30 May 2023 14:40:47 +0530 Subject: [PATCH 01/39] Initial commit: project generated by prefect-collection-template --- .cruft.json | 20 + .gitattributes | 1 + .github/CODEOWNERS | 0 .github/ISSUE_TEMPLATE.md | 7 + .github/PULL_REQUEST_TEMPLATE.md | 28 + .github/codeql-config.yml | 4 + .github/dependabot.yml | 12 + .github/workflows/add-to-project.yml | 24 + .github/workflows/codeql-analysis.yml | 35 + .github/workflows/nightly-dev-tests.yml | 38 + .github/workflows/publish-docs.yml | 31 + .github/workflows/release.yml | 79 + .github/workflows/static_analysis.yml | 27 + .github/workflows/template-sync.yml | 46 + .github/workflows/tests.yml | 41 + .github/workflows/windows-tests.yml | 34 + .gitignore | 138 ++ .pre-commit-config.yaml | 29 + CHANGELOG.md | 28 + LICENSE | 202 ++ MAINTAINERS.md | 114 + MANIFEST.in | 14 + README.md | 121 + docs/flows.md | 6 + docs/gen_blocks_catalog.py | 103 + docs/gen_examples_catalog.py | 120 + docs/gen_home_page.py | 21 + docs/img/favicon.ico | Bin 0 -> 15406 bytes .../img/prefect-logo-mark-solid-white-500.png | Bin 0 -> 16294 bytes docs/img/prefect-logo-white.png | Bin 0 -> 2214 bytes .../integrations/analytics/custom.html | 16 + docs/stylesheets/extra.css | 114 + docs/tasks.md | 6 + mkdocs.yml | 82 + prefect_datahub/__init__.py | 4 + prefect_datahub/_version.py | 677 ++++++ prefect_datahub/blocks.py | 35 + prefect_datahub/datahub_emitter.py | 172 ++ prefect_datahub/flows.py | 26 + prefect_datahub/tasks.py | 24 + requirements-dev.txt | 16 + requirements.txt | 1 + setup.cfg | 39 + setup.py | 47 + tests/conftest.py | 22 + tests/test_block_standards.py | 22 + tests/test_flows.py | 6 + tests/test_tasks.py | 24 + versioneer.py | 2163 +++++++++++++++++ 49 files changed, 4819 insertions(+) create mode 100644 .cruft.json create mode 100644 .gitattributes create mode 100644 .github/CODEOWNERS create mode 100644 .github/ISSUE_TEMPLATE.md 
create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/codeql-config.yml create mode 100644 .github/dependabot.yml create mode 100644 .github/workflows/add-to-project.yml create mode 100644 .github/workflows/codeql-analysis.yml create mode 100644 .github/workflows/nightly-dev-tests.yml create mode 100644 .github/workflows/publish-docs.yml create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/static_analysis.yml create mode 100644 .github/workflows/template-sync.yml create mode 100644 .github/workflows/tests.yml create mode 100644 .github/workflows/windows-tests.yml create mode 100644 .gitignore create mode 100644 .pre-commit-config.yaml create mode 100644 CHANGELOG.md create mode 100644 LICENSE create mode 100644 MAINTAINERS.md create mode 100644 MANIFEST.in create mode 100644 README.md create mode 100644 docs/flows.md create mode 100644 docs/gen_blocks_catalog.py create mode 100644 docs/gen_examples_catalog.py create mode 100644 docs/gen_home_page.py create mode 100644 docs/img/favicon.ico create mode 100644 docs/img/prefect-logo-mark-solid-white-500.png create mode 100644 docs/img/prefect-logo-white.png create mode 100644 docs/overrides/partials/integrations/analytics/custom.html create mode 100644 docs/stylesheets/extra.css create mode 100644 docs/tasks.md create mode 100644 mkdocs.yml create mode 100644 prefect_datahub/__init__.py create mode 100644 prefect_datahub/_version.py create mode 100644 prefect_datahub/blocks.py create mode 100644 prefect_datahub/datahub_emitter.py create mode 100644 prefect_datahub/flows.py create mode 100644 prefect_datahub/tasks.py create mode 100644 requirements-dev.txt create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 tests/conftest.py create mode 100644 tests/test_block_standards.py create mode 100644 tests/test_flows.py create mode 100644 tests/test_tasks.py create mode 100644 versioneer.py diff --git 
a/.cruft.json b/.cruft.json new file mode 100644 index 0000000..565337a --- /dev/null +++ b/.cruft.json @@ -0,0 +1,20 @@ +{ + "template": "https://github.com/PrefectHQ/prefect-collection-template", + "commit": "e11a3be195f24f60ed3f564dfccb40170ee7c3fa", + "checkout": null, + "context": { + "cookiecutter": { + "full_name": "Shubham Jagtap", + "email": "shubham.jagtap@gslab.com", + "github_organization": "shubhamjagtap639", + "collection_name": "prefect-datahub", + "collection_slug": "prefect_datahub", + "collection_short_description": "Block used to emit prefect task and flow related metadata to Datahub REST", + "_copy_without_render": [ + ".github/workflows/*.yml" + ], + "_template": "https://github.com/PrefectHQ/prefect-collection-template" + } + }, + "directory": null +} diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..fc49a45 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +prefect_datahub/_version.py export-subst diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..e69de29 diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..f0ac91d --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,7 @@ + + +# Expectation / Proposal + +# Traceback / Example + +- [ ] I would like to [help contribute](https://shubhamjagtap639.github.io/prefect-datahub/#contributing) a pull request to resolve this! diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..9230c80 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,28 @@ + + + + +Closes + +### Example + + +### Screenshots + + +### Checklist + + +- [ ] References any related issue by including "Closes #" or "Closes ". + - If no issue exists and your change is not a small fix, please [create an issue](https://github.com/shubhamjagtap639/prefect-datahub/issues/new/choose) first. +- [ ] Includes tests or only affects documentation. 
+- [ ] Passes `pre-commit` checks. + - Run `pre-commit install && pre-commit run --all` locally for formatting and linting. +- [ ] Includes screenshots of documentation updates. + - Run `mkdocs serve` to view documentation locally. +- [ ] Summarizes PR's changes in [CHANGELOG.md](https://github.com/shubhamjagtap639/prefect-datahub/blob/main/CHANGELOG.md) diff --git a/.github/codeql-config.yml b/.github/codeql-config.yml new file mode 100644 index 0000000..d64f1cc --- /dev/null +++ b/.github/codeql-config.yml @@ -0,0 +1,4 @@ +paths-ignore: + - tests/**/test_*.py + - versioneer.py + - prefect_datahub/_version.py \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..653b05d --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,12 @@ +version: 2 +updates: + + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "daily" \ No newline at end of file diff --git a/.github/workflows/add-to-project.yml b/.github/workflows/add-to-project.yml new file mode 100644 index 0000000..08c201d --- /dev/null +++ b/.github/workflows/add-to-project.yml @@ -0,0 +1,24 @@ +name: Add issues to integrations board + +on: + issues: + types: + - opened + +jobs: + + add-to-project: + name: Add issue to project + runs-on: ubuntu-latest + steps: + - uses: tibdex/github-app-token@v1 + id: generate-token + name: Generate GitHub token + with: + app_id: ${{ secrets.SYNC_APP_ID }} + private_key: ${{ secrets.SYNC_APP_PRIVATE_KEY }} + + - uses: actions/add-to-project@v0.4.0 + with: + project-url: ${{ secrets.ADD_TO_PROJECT_URL }} + github-token: ${{ steps.generate-token.outputs.token }} diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml new file mode 100644 index 0000000..fde50b6 --- /dev/null +++ b/.github/workflows/codeql-analysis.yml @@ -0,0 +1,35 @@ +name: CodeQL + +on: + push: + branches: + 
- main + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: + - python + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + config-file: ./.github/codeql-config.yml + queries: security-and-quality + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 diff --git a/.github/workflows/nightly-dev-tests.yml b/.github/workflows/nightly-dev-tests.yml new file mode 100644 index 0000000..d471925 --- /dev/null +++ b/.github/workflows/nightly-dev-tests.yml @@ -0,0 +1,38 @@ +name: Nightly tests against Prefect's main branch +on: + schedule: + - cron: "0 6 * * *" + workflow_dispatch: + +jobs: + submit-update-pr: + name: Run tests against Prefect's main branch + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.7" + - "3.8" + - "3.9" + - "3.10" + fail-fast: false + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: requirements*.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade --upgrade-strategy eager -e ".[dev]" "prefect @ git+https://github.com/PrefectHQ/prefect.git@main" + + - name: Run tests + env: + PREFECT_API_DATABASE_CONNECTION_URL: "sqlite+aiosqlite:///./collection-tests.db" + run: | + pytest tests -vv \ No newline at end of file diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml new file mode 100644 index 0000000..10166bd --- /dev/null +++ b/.github/workflows/publish-docs.yml @@ -0,0 +1,31 @@ +name: Publish docs + +on: + workflow_dispatch + +jobs: + build-and-publish-docs: + name: Build and publish docs 
+ runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: "3.10" + cache: pip + cache-dependency-path: requirements*.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade --upgrade-strategy eager -e ".[dev]" + mkdocs build + + - name: Publish docs + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + branch: docs + folder: site diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..4181077 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,79 @@ +name: Build & Release + +on: + push: + tags: + - "v*" + +jobs: + build-release: + name: Build Release + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.7 + + - name: Install packages + run: | + python -m pip install --upgrade pip build + python -m pip install --upgrade --upgrade-strategy eager -e .[dev] + + - name: Build a binary wheel and a source tarball + run: | + python -m build --sdist --wheel --outdir dist/ + + - name: Publish build artifacts + uses: actions/upload-artifact@v3 + with: + name: built-package + path: "./dist" + + publish-release: + name: Publish release to PyPI + needs: [build-release] + environment: "prod" + runs-on: ubuntu-latest + + steps: + - name: Download build artifacts + uses: actions/download-artifact@v3 + with: + name: built-package + path: "./dist" + + - name: Publish distribution to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true + + build-and-publish-docs: + name: Build and publish docs + needs: [build-release, publish-release] + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + + - name: Set up Python 3.10 + uses: actions/setup-python@v4 + with: + python-version: "3.10" + 
cache: pip + cache-dependency-path: requirements*.txt + + - name: Build docs + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade --upgrade-strategy eager -e .[dev] + mkdocs build + + - name: Publish docs + uses: JamesIves/github-pages-deploy-action@v4.4.1 + with: + branch: docs + folder: site diff --git a/.github/workflows/static_analysis.yml b/.github/workflows/static_analysis.yml new file mode 100644 index 0000000..58fb933 --- /dev/null +++ b/.github/workflows/static_analysis.yml @@ -0,0 +1,27 @@ +name: Static analysis + +on: [pull_request] + +jobs: + pre-commit-checks: + name: Pre-commit checks + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + with: + persist-credentials: false + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Install pre-commit + run: | + python -m pip install --upgrade pip + pip install pre-commit + + - name: Run pre-commit + run: | + pre-commit run --show-diff-on-failure --color=always --all-files diff --git a/.github/workflows/template-sync.yml b/.github/workflows/template-sync.yml new file mode 100644 index 0000000..aec8fdb --- /dev/null +++ b/.github/workflows/template-sync.yml @@ -0,0 +1,46 @@ +name: Template Synchronization +on: + schedule: + - cron: "0 6 * * *" + workflow_dispatch: + +jobs: + submit-update-pr: + name: Submit update PR + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + + - name: Install cruft + run: pip install "cookiecutter>=1.7.3,<2.0.0" cruft + + - name: Perform updates + run: cruft update -y + + - uses: tibdex/github-app-token@v1 + id: generate-token + name: Generate GitHub token + with: + app_id: ${{ secrets.SYNC_APP_ID }} + private_key: ${{ secrets.SYNC_APP_PRIVATE_KEY }} + + - name: Submit PR + uses: peter-evans/create-pull-request@v4 + with: + commit-message: Updating collection with changes to 
prefect-collection-template + token: ${{ steps.generate-token.outputs.token }} + branch: sync-with-template + delete-branch: true + title: Sync Collection with changes to prefect-collection-template + body: | + Automated PR created to propagate changes from prefect-collection-template to this collection + + Feel free to make any necessary changes to this PR before merging. + labels: | + template sync + automated pr diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..6e7a5d4 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,41 @@ +name: Tests + +on: [pull_request] + +jobs: + run-tests: + name: Run Tests + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.7" + - "3.8" + - "3.9" + - "3.10" + fail-fast: false + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: requirements*.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade --upgrade-strategy eager -e ".[dev]" + + - name: Run tests + env: + PREFECT_SERVER_DATABASE_CONNECTION_URL: "sqlite+aiosqlite:///./collection-tests.db" + run: | + coverage run --branch -m pytest tests -vv + coverage report + + - name: Run mkdocs build + run: | + mkdocs build --verbose --clean diff --git a/.github/workflows/windows-tests.yml b/.github/workflows/windows-tests.yml new file mode 100644 index 0000000..c5d9813 --- /dev/null +++ b/.github/workflows/windows-tests.yml @@ -0,0 +1,34 @@ + +name: Windows Tests + +on: [pull_request] + +jobs: + run-tests: + name: Run Tests + runs-on: windows-latest + strategy: + matrix: + # Prefect only tests 3.9 on Windows + python-version: + - "3.9" + fail-fast: false + steps: + - uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: 
+ python-version: ${{ matrix.python-version }} + cache: pip + cache-dependency-path: requirements*.txt + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install --upgrade --upgrade-strategy eager -e ".[dev]" + - name: Run tests + env: + PREFECT_SERVER_DATABASE_CONNECTION_URL: "sqlite+aiosqlite:///./collection-tests.db" + run: | + pytest tests -vv diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b96a3be --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# OS files +.DS_Store + +# VS Code +.vscode + +# Jupyter notebook +*.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2f089e8 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,29 @@ +repos: + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + language_version: python3 + - repo: https://github.com/psf/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3 + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + - repo: https://github.com/econchick/interrogate + rev: 1.5.0 + hooks: + - id: interrogate + args: [-vv] + pass_filenames: false + - repo: https://github.com/fsouza/autoflake8 + rev: v0.3.2 + hooks: + - id: autoflake8 + language_version: python3 + args: [ + '--in-place', + ] diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..04e9176 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,28 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## Unreleased + +### Added + +### Changed + +### Deprecated + +### Removed + +### Fixed + +### Security + +## 0.1.0 + +Released on ????? ?th, 20??. 
+ +### Added + +- `task_name` task - [#1](https://github.com/shubhamjagtap639/prefect-datahub/pull/1) diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..53c2097 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2021 Prefect Technologies, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/MAINTAINERS.md b/MAINTAINERS.md new file mode 100644 index 0000000..b58c764 --- /dev/null +++ b/MAINTAINERS.md @@ -0,0 +1,114 @@ +# prefect-datahub + +## Getting Started + +Now that you've bootstrapped a project, follow the steps below to get started developing your Prefect Collection! + +### Python setup + +Requires an installation of Python 3.7+ + +We recommend using a Python virtual environment manager such as pipenv, conda or virtualenv. + +### GitHub setup + +Create a Git repository for the newly generated collection and create the first commit: + +```bash +git init +git add . 
+git commit -m "Initial commit: project generated by prefect-collection-template" +``` + +Then, create a new repo following the prompts at: +https://github.com/organizations/shubhamjagtap639/repositories/new + +Upon creation, push the repository to GitHub: +```bash +git remote add origin https://github.com/shubhamjagtap639/prefect-datahub.git +git branch -M main +git push -u origin main +``` + +It's recommended to set up some protection rules for main at: +https://github.com/shubhamjagtap639/prefect-datahub/settings/branches + +- Require a pull request before merging +- Require approvals + +Lastly, [code owners](https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners) for the repository can be set, like this [example here](https://github.com/PrefectHQ/prefect/blob/master/.github/CODEOWNERS). + +### Project setup + +To set up your project, run the following: + +```bash +# Create an editable install of your project +pip install -e ".[dev]" + +# Configure pre-commit hooks +pre-commit install +``` + +To verify the setup was successful you can run the following: + +- Run the tests for tasks and flows in the collection: + ```bash + pytest tests + ``` +- Serve the docs with `mkdocs`: + ```bash + mkdocs serve + ``` + +## Developing tasks and flows + +For information about the use and development of tasks and flows, check out the [flows](https://docs.prefect.io/concepts/flows/) and [tasks](https://docs.prefect.io/concepts/tasks/) concepts docs in the Prefect docs. + +## Writing documentation + +This collection has been set up with [mkdocs](https://www.mkdocs.org/) for automatically generated documentation. The signatures and docstrings of your tasks and flows will be used to generate documentation for the users of this collection. You can make changes to the structure of the generated documentation by editing the `mkdocs.yml` file in this project. 
+ +To add a new page for a module in your collection, create a new markdown file in the `docs` directory and add that file to the `nav` section of `mkdocs.yml`. If you want to automatically generate documentation based on the docstrings and signatures of the contents of the module with `mkdocstrings`, add a line to the new markdown file in the following format: + +```markdown +::: prefect_datahub.{module_name} +``` + +You can also refer to the `flows.md` and `tasks.md` files included in your generated project as examples. + +Once you have working code, replace the default "Write and run a flow" example in `README.md` to match your collection. + +## Development lifecycle + +### CI Pipeline + +This collection comes with [GitHub Actions](https://docs.github.com/en/actions) for testing and linting. To add additional actions, you can add jobs in the `.github/workflows` folder. Upon a pull request, the pipeline will run linting via [`black`](https://black.readthedocs.io/en/stable/), [`flake8`](https://flake8.pycqa.org/en/latest/), [`interrogate`](https://interrogate.readthedocs.io/en/latest/), and unit tests via `pytest` alongside `coverage`. + +`interrogate` will tell you which methods, functions, classes, and modules have docstrings, and which do not--the job has a fail threshold of 95%, meaning that it will fail if more than 5% of the codebase is undocumented. We recommend following the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) for docstring format. + +Similarly, `coverage` ensures that the codebase includes tests--the job has a fail threshold of 80%, meaning that it will fail if more than 20% of the codebase is missing tests. + +### Track Issues on Project Board + +To automatically add issues to a GitHub Project Board, you'll need a [secret added](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-an-environment) to the repository. 
Specifically, a secret named `ADD_TO_PROJECT_URL`, formatted like `https://github.com/orgs/<ORG>/projects/<PROJECT_NUMBER>`. + +### Package and Publish + +GitHub Actions will handle packaging and publishing of your collection to [PyPI](https://pypi.org/) so other Prefect users can use your collection in their flows. + +To publish to PyPI, you'll need a PyPI account and to generate an API token to authenticate with PyPI when publishing new versions of your collection. The [PyPI documentation](https://pypi.org/help/#apitoken) outlines the steps needed to get an API token. + +Once you've obtained a PyPI API token, [create a GitHub secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-a-repository) named `PYPI_API_TOKEN`. + +To publish a new version of your collection, [create a new GitHub release](https://docs.github.com/en/repositories/releasing-projects-on-github/managing-releases-in-a-repository#creating-a-release) and tag it with the version that you want to deploy (e.g. v0.3.2). This will trigger a workflow to publish the new version on PyPI and deploy the updated docs to GitHub Pages. + +Upon publishing, a `docs` branch is automatically created. To hook this up to GitHub Pages, simply head over to https://github.com/shubhamjagtap639/prefect-datahub/settings/pages, select `docs` under the dropdown menu, keep the default `/root` folder, `Save`, and upon refresh, you should see a prompt stating "Your site is published at https://shubhamjagtap639.github.io/prefect-datahub". Don't forget to add this link to the repo's "About" section, under "Website" so users can access the docs easily. + +Feel free to [submit your collection](https://docs.prefect.io/collections/overview/#listing-in-the-collections-catalog) to the Prefect [Collections Catalog](https://docs.prefect.io/collections/catalog/)! 
+ +## Further guidance + +If you run into any issues during the bootstrapping process, feel free to open an issue in the [prefect-collection-template](https://github.com/PrefectHQ/prefect-collection-template) repository. + +If you have any questions or issues while developing your collection, you can find help in either the [Prefect Discourse forum](https://discourse.prefect.io/) or the [Prefect Slack community](https://prefect.io/slack). diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..9e3fb02 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,14 @@ +# Things to always exclude +global-exclude .git* +global-exclude .ipynb_checkpoints +global-exclude *.py[co] +global-exclude __pycache__/** + +# Top-level Config +include versioneer.py +include prefect_datahub/_version.py +include LICENSE +include MANIFEST.in +include setup.cfg +include requirements.txt +include requirements-dev.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..4838c18 --- /dev/null +++ b/README.md @@ -0,0 +1,121 @@ +# prefect-datahub + +

+ + + + PyPI + + + + + + +
+ + + + +

+ +Visit the full docs [here](https://shubhamjagtap639.github.io/prefect-datahub) to see additional examples and the API reference. + +Block used to emit prefect task and flow related metadata to Datahub REST + + + + +## Resources + +For more tips on how to use tasks and flows in a Collection, check out [Using Collections](https://docs.prefect.io/collections/usage/)! + +### Installation + +Install `prefect-datahub` with `pip`: + +```bash +pip install prefect-datahub +``` + +Requires an installation of Python 3.7+. + +We recommend using a Python virtual environment manager such as pipenv, conda or virtualenv. + +These tasks are designed to work with Prefect 2.0. For more information about how to use Prefect, please refer to the [Prefect documentation](https://docs.prefect.io/). + + + +### Feedback + +If you encounter any bugs while using `prefect-datahub`, feel free to open an issue in the [prefect-datahub](https://github.com/shubhamjagtap639/prefect-datahub) repository. + +If you have any questions or issues while using `prefect-datahub`, you can find help in either the [Prefect Discourse forum](https://discourse.prefect.io/) or the [Prefect Slack community](https://prefect.io/slack). + +Feel free to star or watch [`prefect-datahub`](https://github.com/shubhamjagtap639/prefect-datahub) for updates too! + +### Contributing + +If you'd like to help contribute to fix an issue or add a feature to `prefect-datahub`, please [propose changes through a pull request from a fork of the repository](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork). + +Here are the steps: + +1. [Fork the repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo#forking-a-repository) +2. [Clone the forked repository](https://docs.github.com/en/get-started/quickstart/fork-a-repo#cloning-your-forked-repository) +3. 
Install the repository and its dependencies: +``` +pip install -e ".[dev]" +``` +4. Make desired changes +5. Add tests +6. Insert an entry to [CHANGELOG.md](https://github.com/shubhamjagtap639/prefect-datahub/blob/main/CHANGELOG.md) +7. Install `pre-commit` to perform quality checks prior to commit: +``` +pre-commit install +``` +8. `git commit`, `git push`, and create a pull request diff --git a/docs/flows.md b/docs/flows.md new file mode 100644 index 0000000..d8621f0 --- /dev/null +++ b/docs/flows.md @@ -0,0 +1,6 @@ +--- +description: +notes: This documentation page is generated from source file docstrings. +--- + +::: prefect_datahub.flows \ No newline at end of file diff --git a/docs/gen_blocks_catalog.py b/docs/gen_blocks_catalog.py new file mode 100644 index 0000000..7e40612 --- /dev/null +++ b/docs/gen_blocks_catalog.py @@ -0,0 +1,103 @@ +""" +Discovers all blocks and generates a list of them in the docs +under the Blocks Catalog heading. +""" + +from pathlib import Path +from textwrap import dedent + +import mkdocs_gen_files +from prefect.blocks.core import Block +from prefect.utilities.dispatch import get_registry_for_type +from prefect.utilities.importtools import from_qualified_name, to_qualified_name + +COLLECTION_SLUG = "prefect_datahub" + + +def find_module_blocks(): + blocks = get_registry_for_type(Block) + collection_blocks = [ + block + for block in blocks.values() + if to_qualified_name(block).startswith(COLLECTION_SLUG) + ] + module_blocks = {} + for block in collection_blocks: + block_name = block.__name__ + module_nesting = tuple(to_qualified_name(block).split(".")[1:-1]) + if module_nesting not in module_blocks: + module_blocks[module_nesting] = [] + module_blocks[module_nesting].append(block_name) + return module_blocks + + +def insert_blocks_catalog(generated_file): + module_blocks = find_module_blocks() + if len(module_blocks) == 0: + return + generated_file.write( + dedent( + f""" + Below is a list of Blocks available for registration in + 
`prefect-datahub`. + + To register blocks in this module to + [view and edit them](https://docs.prefect.io/ui/blocks/) + on Prefect Cloud, first [install the required packages]( + https://shubhamjagtap639.github.io/prefect-datahub/#installation), + then + ```bash + prefect block register -m {COLLECTION_SLUG} + ``` + """ # noqa + ) + ) + generated_file.write( + "Note, to use the `load` method on Blocks, you must already have a block document " # noqa + "[saved through code](https://docs.prefect.io/concepts/blocks/#saving-blocks) " # noqa + "or [saved through the UI](https://docs.prefect.io/ui/blocks/).\n" + ) + for module_nesting, block_names in module_blocks.items(): + module_path = f"{COLLECTION_SLUG}." + " ".join(module_nesting) + module_title = ( + module_path.replace(COLLECTION_SLUG, "") + .lstrip(".") + .replace("_", " ") + .title() + ) + generated_file.write(f"## [{module_title} Module][{module_path}]\n") + for block_name in block_names: + block_obj = from_qualified_name(f"{module_path}.{block_name}") + block_description = block_obj.get_description() + if not block_description.endswith("."): + block_description += "." 
+ generated_file.write( + f"[{block_name}][{module_path}.{block_name}]\n\n{block_description}\n\n" + ) + generated_file.write( + dedent( + f""" + To load the {block_name}: + ```python + from prefect import flow + from {module_path} import {block_name} + + @flow + def my_flow(): + my_block = {block_name}.load("MY_BLOCK_NAME") + + my_flow() + ``` + """ + ) + ) + generated_file.write( + f"For additional examples, check out the [{module_title} Module]" + f"(../examples_catalog/#{module_nesting[-1]}-module) " + f"under Examples Catalog.\n" + ) + + +blocks_catalog_path = Path("blocks_catalog.md") +with mkdocs_gen_files.open(blocks_catalog_path, "w") as generated_file: + insert_blocks_catalog(generated_file) diff --git a/docs/gen_examples_catalog.py b/docs/gen_examples_catalog.py new file mode 100644 index 0000000..c8f8261 --- /dev/null +++ b/docs/gen_examples_catalog.py @@ -0,0 +1,120 @@ +""" +Locates all the examples in the Collection and puts them in a single page. +""" + +import re +from collections import defaultdict +from inspect import getmembers, isclass, isfunction +from pathlib import Path +from pkgutil import iter_modules +from textwrap import dedent +from types import ModuleType +from typing import Callable, Set, Union + +import mkdocs_gen_files +from griffe.dataclasses import Docstring +from griffe.docstrings.dataclasses import DocstringSectionKind +from griffe.docstrings.parsers import Parser, parse +from prefect.logging.loggers import disable_logger +from prefect.utilities.importtools import load_module, to_qualified_name + +import prefect_datahub + +COLLECTION_SLUG = "prefect_datahub" + + +def skip_parsing(name: str, obj: Union[ModuleType, Callable], module_nesting: str): + """ + Skips parsing the object if it's a private object or if it's not in the + module nesting, preventing imports from other libraries from being added to the + examples catalog. 
+ """ + try: + wrong_module = not to_qualified_name(obj).startswith(module_nesting) + except AttributeError: + wrong_module = False + return obj.__doc__ is None or name.startswith("_") or wrong_module + + +def skip_block_load_code_example(code_example: str) -> bool: + """ + Skips the code example if it's just showing how to load a Block. + """ + return re.search(r'\.load\("BLOCK_NAME"\)\s*$', code_example.rstrip("`")) + + +def get_code_examples(obj: Union[ModuleType, Callable]) -> Set[str]: + """ + Gathers all the code examples within an object. + """ + code_examples = set() + with disable_logger("griffe.docstrings.google"): + with disable_logger("griffe.agents.nodes"): + docstring = Docstring(obj.__doc__) + parsed_sections = parse(docstring, Parser.google) + + for section in parsed_sections: + if section.kind == DocstringSectionKind.examples: + code_example = "\n".join( + (part[1] for part in section.as_dict().get("value", [])) + ) + if not skip_block_load_code_example(code_example): + code_examples.add(code_example) + if section.kind == DocstringSectionKind.admonition: + value = section.as_dict().get("value", {}) + if value.get("annotation") == "example": + code_example = value.get("description") + if not skip_block_load_code_example(code_example): + code_examples.add(code_example) + + return code_examples + + +code_examples_grouping = defaultdict(set) +for _, module_name, ispkg in iter_modules(prefect_datahub.__path__): + + module_nesting = f"{COLLECTION_SLUG}.{module_name}" + module_obj = load_module(module_nesting) + + # find all module examples + if skip_parsing(module_name, module_obj, module_nesting): + continue + code_examples_grouping[module_name] |= get_code_examples(module_obj) + + # find all class and method examples + for class_name, class_obj in getmembers(module_obj, isclass): + if skip_parsing(class_name, class_obj, module_nesting): + continue + code_examples_grouping[module_name] |= get_code_examples(class_obj) + for method_name, method_obj in 
getmembers(class_obj, isfunction): + if skip_parsing(method_name, method_obj, module_nesting): + continue + code_examples_grouping[module_name] |= get_code_examples(method_obj) + + # find all function examples + for function_name, function_obj in getmembers(module_obj, callable): + if skip_parsing(function_name, function_obj, module_nesting): + continue + code_examples_grouping[module_name] |= get_code_examples(function_obj) + + +examples_catalog_path = Path("examples_catalog.md") +with mkdocs_gen_files.open(examples_catalog_path, "w") as generated_file: + generated_file.write( + dedent( + """ + # Examples Catalog + + Below is a list of examples for `prefect-datahub`. + """ + ) + ) + for module_name, code_examples in code_examples_grouping.items(): + if len(code_examples) == 0: + continue + module_title = module_name.replace("_", " ").title() + generated_file.write( + f"## [{module_title} Module][{COLLECTION_SLUG}.{module_name}]\n" + ) + for code_example in code_examples: + generated_file.write(code_example + "\n") diff --git a/docs/gen_home_page.py b/docs/gen_home_page.py new file mode 100644 index 0000000..3341134 --- /dev/null +++ b/docs/gen_home_page.py @@ -0,0 +1,21 @@ +""" +Copies README.md to index.md. 
+""" + +from pathlib import Path + +import mkdocs_gen_files + +# Home page + +readme_path = Path("README.md") +docs_index_path = Path("index.md") + +with open(readme_path, "r") as readme: + with mkdocs_gen_files.open(docs_index_path, "w") as generated_file: + for line in readme: + if line.startswith("Visit the full docs [here]("): + continue # prevent linking to itself + generated_file.write(line) + + mkdocs_gen_files.set_edit_path(Path(docs_index_path), readme_path) diff --git a/docs/img/favicon.ico b/docs/img/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..c4b421585b5f5cbbb793df9d0f0c7c09341d5989 GIT binary patch literal 15406 zcmeHOX>e256~0c2jl9b%wk1pMk4}DM+RU^~NR!qfl;D)rq-h{iy3Hg_r<0_ehA^Q7 z8Xyi~Cln}wn1p>Z`%c&lW-(v`wj@fRkW6WrP9Qj7AwTK8)9+mAxss4&$(C%Vt!Cal zE$`m*opZl?&b{Yc36UV~7nzv?cdK~mYeGCMgs@t@pC2#^QHk#!ddT;ov^1M& zP6p9iDbck*V)D@TK13^_!vg3~k?N#pF?bQYr2La+5A7`=TAxR>B#S)L(ly5bafF77 zAGFtnY+rBj(7__2&CubaZ1T*>(60SRXjCq*J?Q6=w)#Q)1UhdyNi`;oc0z~Mx#XFj zsoffawj*F3zRHD4;|wdq_HnFm2z00g7c|e}EUjg@(3;T4XTMJJ+}_;BL-vvzZME0G z=P{?MoI&bRuMXQ(^b%})V$O+Y`~&Tc(G=S`)d+JyQ}_G7ailkIG#cKH*kk4U~_ zKV!x91TOYZ(Y$Dp=tb*tuhKH(SiH}Orfn>qrPGkVAy1L>wKD5pv@ml5Rfz0Z^c_hb zQvRlVh1S4leF&fP7x=BY88>NKN(H?w@+0Y^GtWr*TOfZO{0;lv#qiziduOD>|E4t2 z1j9^vU6?u}BV1c4e|sN=s^G6y!3LGE!9v(zHhk|?`0qcZ5WQizLx1QsC(<|&PG0bD zQvR+2Mf%INumOA~y$>6_2OCU-4JIQ7{V|#7wWJ&LqUaUezO$dD{Jn(=ZG*qv0QoC0 z2JrhdAM$5Heg))DfDOj>j-UMfkiQc#XCuY{K3~Q@<{`(wcQFQULH;;H{Nx`jQfN;h z(H7W%dAJNV;4zp78%&1{roaZ{<0!w{q|nEZ&vCm7@;P>Y03IxW4Q9ax?=TOO=<>921_{l$ER_L&aXfJHA)i(ygzFmdKQndG`@H^zY1PP~Tn^94#pns&HU zXf4CA{uw6r+BXL53wZt4>I;mdOij~{_Y;~{?zavr7dvT? 
z+zNyf5%xE%`Fc^KRiQH$q8iwM#~_3+P^PDA=YHt74%F9!!-WU7r~-~}g%#Ask5YfT zwT0>-{{-Y8g8V%g1AkwDn54|i&@K%OAz#N)E)?KcMT6zGrwwVLzb3n76xO{qjDbc+ zFb4a4zJUF)&lf0INB@J@(Ym~#yNdO-`f8$+HszkN>deLxV=Qhb*Urrb!2ip4i!94y9*lN3pB|?_yYGrqv|df z9+r7`n4tP6g?8ZiME8}|`HE`uTVHolek%^_KWS^ub5sR;uFBQ0pL1U>6Yfoq2yNwv zMEB)jxpG}E*VmoWUze}nc^^z`b4J}=WIVU*nIz@Xs(g2VzPeof9wYsA`FtmP;iF`; z>t>K?NR8f%hm54(4Wp1eUmz> zuP(RsZv7Q0`{A&{e>CpfM?kESU;w}nmZI1f%!gin1@DyKZ7H*^sA zH{?07*Ywb`?8{UMJ(gtjL#*xn6@V`!{*A!TdsSYeFU|JQV&mtuFntrvH1vf(h$!FO zUAjvAoAaG=ZO1tm-sf=6f%7i&((ll$)a6vs`%#QXdUs(ACs*R%0{pyBIDhc|)$4Fl&fKck;_ub-9p_Z2#2 zo`LuI%puM@a^86F^9NTtj7X5kKumr82Gz+okz@{8sO)AE9X0z zLp+b~Sx@-I0_Ko67W54KCySjjXU?^L&fDu8;#jcKFBWhOgVz&0kG&k~8Td~D|1r+@ zVhp(U!egMv0$$VUv4CqJdMuchM(UK*YqtgooOtQJm*mfBi&NGNbPjRdfH|bEC)$Vw zEfZ5b$DR;pYfHsXoBDTMo!Q@75`P`=pG3`p>n@ygXAbGHKwnP;!~*zIaLDymidtPN z)U!{DCS{n|i1Q>PsZ`|n_aE=~68~A?KMnlMp~Jwx-^Zb?h-ESsfJ4jSg97&j+*ZmH1LcOU0__=1ob*ma5hwh05SkLM6*cS_26=`ZsKcQZKItag4 z2R^5C9b-~SLqr*Pl^PE&N&L1FC!MnrorVrv`)hX|!~R&)xu&M6sKcqZ2K(x8L3JSD zXB|}Na1+;7T;Gnh4w66il4gtnQ61OKeDgRYhkUU>t|yG-dN)n2eNw1*hqi$~SRbAr z!^I-j1LF~s{EVzI*-G}H?&zemz<-K4ZU_=dr#osPyg&T+`Fl`u;-o437}M zmscu{O!$Omq`Vrdv7km%7Bv?C82D}QLz=8zN)Evfas5)p0^t8oj!2%Zam`3qFAfxH zz}yYCx$iSSgPNPF{4Vh}ZO^|=^~Kd#JH<4&MD-b&R9Ey0oiNoQ7BtHmc$iq=nw_Dd z4jK|a<3$~mno+0xVnl*^?WIKJ=BnJgYNM6U{NW59&Z5N_KaVHoa{`^nFQKCaFVmsI zLx=^g_55!Q%9X`AO7jV`J23Yi zAN)Ee**0`6J&X3`K0~|v)MHJj2JZ{p^D^Da2-LcPpK-fTi?#I^wBox$+wfds%craJ zT3mI-K6?k(ZsLny*F%oUR*_8G@?N0Ld9}1Y_YUTsXrW@-R;VYD(RQU+TbTP&XOPv?2R9;Lt%@k==ZEzR|s|uW--%PJ+5-w84FT zL5_0gcrZ_baqFME&i(;o!SiDsc(o#9MAN*?x;3McTz^{yo}97pc>$a&^{e0aAo~aC z;K)v+hJwe`bLMT*pM=G!+dKH)_4cR6+*Gm`4?;b$64y1@S`E+fV9frXyAHps`!yDz zE?m3=xNk+y5r4eF@8gf3m?w_H@8L|(8sK)t2Dh$9{P2h1Nzj>f9YbgCvBxiCFZ0A^ z*$#VufwLl=;CFkv#STC16iL(oo;Z9w>5xBZ4{y-#e5OA}>mStrAB4>t;%jdoeid=7_B8#8?WpJ84_?u+a=1M7efvwIpoF NR+s-j|KD$ce*-+RrwsrA literal 0 HcmV?d00001 diff --git a/docs/img/prefect-logo-mark-solid-white-500.png b/docs/img/prefect-logo-mark-solid-white-500.png new file 
mode 100644 index 0000000000000000000000000000000000000000..f83aa6ef6a34ee4c596bd1c7c2046a2f05cb9342 GIT binary patch literal 16294 zcmeHui93{U-}jhCO)18bC1MPr#S%*PVJu}=wkQgrjY9TqO7<{Gq6|{fC=!yag=j;I z?35&z*MTDY)qN19Ds@i@vLgtHu7KVvdv|0_|Io}kKi0N zJq^D$Upjo;2mWec{jjC(g9Y!F5u!N)+~fZfEs+f zc*gfMHsIm~Pam}aP2BnjHTcYWERVyk5Ai*(iL*1cz!JQ?&tMPADaa|{w6&-U$}1|$!U$QPKu_P(0kWPx z5`P!@*E)J8L+<(t zre3aR-2(Ma`<~Ha=~z}lRd&Ck)xUK7clv)B^IvAOs8TzA^o)(Qks16%w&UbL=oI}*+^^EZ*zUu4)7d9+->mfbbgt)9wDR+@ zJbS{e#(4F%9?3tCRyN&`S3LdeZBplSkD1A{!@B#AoOOOEb#{LqbyrHB_>ItChZ0>} zc$)$j0;ej5=aw}6zfT9WeC#h_lAD4;ZkJBNCt7-$R;}uiPx$%oWYyn%TTs|!4Ps_y z<~2$*!EkPQf#UvnwRr4uyL`dw?J=(Q@2U@eAFEeP!bBiYSOgpTqaI#YE8AXRVbosB zkgn5SIKx&=>~xP4s=WSp0F662xl(V{;?rQ$Tu6a^|M%Q9bE^+w0kwOa2Xf!@qVWhM zCZbLcCHbQUR}|kf!`bM}#DuJby3@1;Uu6$(ZQLZNq01VBKt)Mn-+AuwzUb_9Hm*-7 z_!uoZingie>fe3%j^M9eH1p#aZoW$xrzZqwRR~nI-7Z#X45MhZQY&ytc(rY8tlfmU zlkt^`GmEa}L#os4{gZSXC719lE&581M9^eTfFc7CY512aya3zg5@Rai47I!Q%hk?G zWW)Ghwu$}*1<;SP^P4_*54QvJbifsvoX)~z-xoYn%h?w@zg>27@3 zm9>_ql_|YZ%x9pXZn(q0G~~?(J~aN69TFXCN51g%67uHl-y!)5q>kL0Ty~BgkvHL1 zp6x#ID{`7RKSDPC6bc5S@-IBsK*|a&2;Kj)qy3)J%o$?;mo1`1^ng-r!SJvI<4k9O z7;>B&9qut6nqjV!eF+&;)xq{7CxELwdP=yVpF?=B&Zz^)pIjHp%!?1Awn*2tExz9r z5j~h$EWt?qJk5rllZa?ry6h~Artx^ZYc{?^JLo(2t}|NdJp+qHqBH9eNKJGyi5XGa zy~mYSFztL1y>C}UTN=*!`pwOyu2d60VMYX=RzKrZL-HHtVv)5{g zohq;2yZcB+wD~0L3AH!M-X*cNcd>1pvN+Mc@t*jQ?^8O(;fPjgY-WPyJBu$P#c1qY zar5&&na%q!3-bA|2W$@PAQ=i(R~sCN9(>oyZ2R-T6{w?*2z+DS&Y^o*Go1?Jn!jpr zb}RZdklps3rQ=bUMLf34<-bGK z@x&C7Q*eQA@wHw46ryO{;^;K5Q=Nv+fdBhSllEk-raEok>h{= z=Z)Q)lS$@B?uOE_5uL$!=hT;=h?j$wC(hgJwW!VvCSJ`~?=_^6_t!ohD9eg@C9NzM zj6`cLVVb&Lvq%6rd$b=xUUUwx?0<#F)ixE3+!;b1kUl;=WdaH@`C;65(bxc1I?5HL zywt~6nKqDPFQ6_IoKb9A(~j0$z~p^qQ=CyA)Y}`(J0jty$mlDd<=ZHZB?Z4+ zPIg;X8)qa8D3qERXrWIZyeSCdq+QJSe3kOT!MW#7(pox0j~tI0G~^&S3LA_B zHv0VOKg{y(dwYZ<>qOUy8>nwg(FZx>)Yxk~Zj!>|FICyJ3q=0(r?QI)`-bu%Pde8g z7Fw+CcR9cZX~Oc)aIs}1*-&KDxPE-IdB@w+EZ5p{K_#hA=@J{t(F zXME6_X@@;Kx%6tp@mie3)0>qWYaV_Ycwn#H)MByx>|@Hn4qbDt$h%h%B(Jwg=7dwO 
zNK^0|PuCy2y+m^!iBxavPHa|*XfhO6&?L$gtF#se+f+R?-<^!mmESiaiwh@Xs_f}$ z)~be%ozr;#6p6N7j3En+l;N8k6GPh!M(!$^ICJd8tsOtGnr2m7GKNa&yBQ;OBeHJy zMdwkBL)Z>IS?jX0<7r85<`@-z$ zbc#FWRWd1yT33r{zJWhRJ1L}|GVC2+wRo*fY4f@G=1kIB0;QD{Zr*MhAHqCZ*sjQh ziNKS=JTiS~LXBcM(xM;vNe-QBHgs*u>J3V;D|I_s6l)>K*5GJS#~pP4wI_Fl%fgh^ z%4~^%Z68je2>%B`e6tzLKyo4?$)}Qc&DLkUK6iYIUv_P;=4FG`=P5-rvV^-w_0`uK zP}paN*}MJ1lN16yxy-xGF+sZ1DKA4Edxxu*4MzvkS2;AREa|@;ZNw$C-x0&b5yVE2 z$W(?#lPaT8BsWMP$8dWh<85io@}tr`{nhGrX-uHHnj;i8{Jb8#OYMs>;ggJPZOW9G zoJGl<)5=r0IS<|9TJc4K8r7B_T^4-NO}5r3-s@=Sbtl`dqgS^2cg%7?K1CqUV?5$k z&R+brqg^5-2?+?xNPz%7D(kaD{cEeYk<>-3J={{exRn{X)Yh$Wr$TNDrq#Ox~!&LyyLiP8%n! z9W{M*y#07ZxbdVB5lR}&24Y@kJyo?eF8uI-Q>EEwpCZHbI|$t@!GNC0Wy2rZi0vPi zJlh2d>_@g1b0FD~n3%-67xq-AfymELbCKb29Z1T7y^dL_2ExeDH*eG zC`Xq0WD`5vwm^NN%~n~j=r1*Sn(@_2{^rN>(NN)zP14E#!XX;4_2yYx<&nNxb!Nj$?Qf65b=wKYMZY}%=3m73$Z%b_cAF!) zox;N{-LMgD?h*ZE#_xJddS&U%vzfJz7o{n;UzPYb$e}^1*dV04-oP6Z;g~iFjtjf4 zI5Fs+?P6B5t7T)u*5l61t;g2a6Wsh~MfsQ&D_sao*x(S8drC;V;Ks3t!pt7HzYo>s%C}l zY>`^qM-MP1Na_wuYxjK&7mC-{JjeU?0>RCl+)Ekv76e(Q@(rpX6iy~Km{u*v_?6n3 zS-ci(Js?TUnMXMv;T4B16QJHn()Wv;coEYlwDh|>RmpEHY(>X9J~JmkC!1B`%fv#d zpwKPr9ByB9GOuuc#~=P+B^P%Yt=EUXy0ToN0D~VXXp6hh$L{RqMoTPlZ0n2tG#Sy!bulu$}>=b}54^5glJ#;v)74W{$mT+G3wL6o`oTKPNIexoiaT zkKZa@(xRD=KZyUCV*~2rASlc9@+`eLw}L!yr8sz|SAtT*Rhh)$*0^r85%@i@*&fo4 zXPIRQnkSBacV<#m*Lo+kABP=JJwAOE&+>)jEo_lwqXz~a?(vJw^3q3 zQIi*VSY41HBq~8G-X1r2pVJXW=zG@ru9G)j`V)kl_rT_dgy8c<=bgDyhW9a&p%}I~ zSA_zMVNIaz1zU4@jYM@-qivBqn=cGOM${SgW9)XHjA}0lJ{MJ#sDX~-i-SkG5 zQb$3lqv_J+P3)|YeKTHa8+0JvP$v7_prKbNXJ|i%Jx|ZviymAn8FQg5_g+LX{t~GO zrH7Ds|A1k(6hk4fg)r^)J+7rQ+;sb2_l>B#1zg-JRqC<*V6kjV@2(ij1Dzeyr~MUz z0y`KT`|pp;&o_(1w-vjK`o7x4hSoDgvsAU;qQ-)Wg#Zdsw=LrpDD<`p^Ff;NL49|k){rzAPr!70%MFjoNt~{rO~77em@;^c_)n$qNR~i+2-PL4Q*OKV0+J<(K6WUUE*!Wni@Y z%y*VUCToL%geRnop9TZC-w`sK=JqxWU$oP<&#W*-S&*g8IuLFhzN_~V%DzEV(J#JQ z7Ivur>23^MIRjVDYoT9;+8||}L0o%xJ3{Ab1X0X5#(OCA0ymh7Vv=g# zH3W(Qye_5xF}jaZYpC&<5vg@CSN^UnjKp-BBXXwP-|UU$9l->L&h)CZ>pLC2Jy0wy 
zdOm}T4Xt5}$kG0A)jgft4#$Z9*4Jb`-VrD5(n=G|R$q@2^1HjQGs^U7;ot$s4}}mj zUV#&EXc~+hM0o=tJ~6n^5&dj^TGF?Mu$+TmK6X~joREM11jUAKz$0>+Fznq5yMfcS zxE-HAcCIjg?rq)Ar+qfEQwYidPPWJz&I4L0>?~36W$F8lqEPRlnfws1s_BUVf?luy zfQYU^;if8wI|B+aT6NmrY2_olHn+TtLeSPSf;ZGi2Rf)+ZWjYAtTvU&-1J7$GvVsy8)XOEhQ_X`l(H}D>4 z|8!5Gfo5&P9!A;VIIH+a|G2OLlnDVr6@yjaA@Ub%Kj^sHDc|mSS4f>OEzx-OlWsQj z`_+*$Uz?6^_KjK5&bR!O0A*c?L0l#Qdl4*YV*c}DB-%Gd)0IZg31}c&oH)A&SitO~|Lym(vjhFmN!F0ydZEak&9`1@jAXngF*gZQAWD55P1`Jgy9-)2}p8@kTF6Xwh z|D#?Qed$D-KH3-h&XcYIJCP&|*lCYN)fzcnL*PiS$3|41R7Be&sg_mjGbykWfCOG7 zWbSQ!`Yt0$2dfWkHRgXFx(^(K?=jiWB|=?%KS@x{38K5B(SuQUKajtuBIDBUim%@; zgWgWgjbi@gE?ZDHanJBHc}v6o2aDU=7cV-z>r9DLxN#k?4Wbm2{(codhuY_Nz}xV5 zvT(ugP4qvO?XCHaj~8F=aTY*rL9vn@3oqD{7$F%(p@@W?GNwn8YmMl?kkdNtX(MGF zinvMCR=CBR0rO^`AIR%-KdZAI$8`*no4vLt*xKO9pT}F%zKPjrIusqJ+2(8I z#&xn2kwg&?p~(HiVsR3uNR?lCN|j~~lDtMQu1WRn`sl|DSU&x9O~1Wkl#Ycq^rXr1 z24X2VN$D@caSoVJ@V<#dKTOwd-*TVN=ysfO-x=d9ZM)bNViY*Ax#FM`S`u1nX`zEV zPVOL*asD>_d<$AuH|JlBO=olx-mV^f`q5%p%ee|cR0c9w4KK&8wqsOriGiI_Q}O3w zJ2m%`bLMLlj&mWoA(nEZh*J1$Gd$0fQrD-hfTDz|nu!;~JDs^o01oNNp5B4Nfv_aD zZZX}H7NO+Lbi8>|x$EN7gb<;2Rvh#cI@BMHmjnSx60?QQ@Y-GWsMhMbcZspM1qH2Sn}yes(?_H$udi^@J6Yzsjs4C9{uBWMFd-sD1R07xB*6^2IY>?E zTP*cTy`sii!5aHq4rKuG&M@M=lsE=GCt{!VTO`oq*Q+F1=OT0Rtc8Xx*+D=q>ss9o zh!Z>7=FzCz+;*1Ky(cklT{VxwYV~Pkjp3YtLJgHG3Y|`$fzi)DAI%bYM|RK1NL6m@ zV{#S;Lxuy(-nK1sLMPh*k<)SN4|pObxgmv2dtsI(5b8$ktncf~AWZkfzi0}E6!UK@ z-OJ`q4I}Uvhdp;Vzey$FGO_cnP-Sg&NI^&DHb3ZTEi}Lzmc^t>K`};O#Ym+!6rvm_9X3>T$qc75zo-Ng!&0d#->Fb$5v=s`Y z7L7y&VJ^KGYnOcZB>H=MYu^+~bM;EQO1_p;s8%86QEnXc<(i2E8)6*}B+I?Ce{V(v z%4X2DJULd|{I#|@-DmJp;<;lq;c$Onh({dX=BpM@TbIZBusj9k2(MYnbsw72!CE7E z>;Q2x4tdj{+^P|7PcM2bYN@$Lsy06;D783Alh3QT)`Y&px>Xh-T*f#BhR4J1cp^oVhPN$xmdyrubCvr|sHkiD0Ju*YRq{?@km(`oV-T*{>% zK=Sx$N)&ND^^d)mL*G$L%qrZ&xKSHFC-(B9HCJOwF_(zR!L*IU8*4FNMx>s8%&dz8 zyf14qVq`BvyhuX;YAGu1dA*X<*{Lgev*o>5jVunkXdm-udD)TuKlM zlr}~56&ciZ-L1K|WadYp7W~Zgo=|NNg&gjq5;gcJ;owq6B-9TmCH1aIyaZqvp}cpd zL;b9ay_Pc{u3@6y`79@D*8C#!U&MpoByMATG8BJy3n8>{RIlPmn=OrCMs|_WO3RH7 
zqk-N<(3U)ZlJkX!e6qs6&DiNbRZouONe#wjhzyv1Qv$i^Cu72ueSPS#qMR zxL;y^e|gtWug7JkHxIM0`~YnF!OO1W*dGbKt>>NIxKi0Tli~rO=qtatb1!s>VU~IU z@;e~)W?0<%j0qi`H1$jQSk~-Xu87kPkgsH=+v4cK(6|w;_L6TUX%xrYv3QXMZOSGq z&B4-KBVwH^?BJUJp^x<7q?rBid(YpZTU_QeVpbv#dd8Xo@|QaNHb$zaHwM^5i-|cJ z;1%7U_b48&(nhhZ`F`%2egSBs3uhywjhN=*8?PZhy|#FsgU=?Bl)7XieqP>!gxW1m zb1Q2^oXoSN<1pg*v*C)bSZ`+HJI}zkhjJXR#D;!M2>nd0IN#W^_U?yBu7_O;;?%bEtV1V|PfT=y_M1SDIvtYk9Ma-mLxRkE(%_76)#& z^HqlTzqJTC|1#k6c8HMM?XMU0G&iTyZ04tv1DJ%jTEAH^&A^udQsd+rW)lRteL`{O z%Mo!o3uw(K8OgSd;cOM@i#%IY8h*U@+}v_uyKkx_w>QL3aT9seGS;))R)?plwDAaC2{xBFMP%k3#nH%XYh~mUL*J z-uaxC3ENjJ)-fQHGQ355e$h`Xhq)`YD@apcw1qUqS??YdF^NC56M2#_%^dL>8@9@{ zjCQ&z^8LBj!&jD{8Dqnp=^EwSpQ^MI+GZ1{e1eAM1=_U;65+U+U9$Z&o+)D{!Ym=ewr2EwpHi`B+U@T7p%!N=YiP)aRI` z(PVl`YU{E{rS6E2h1jL6n|`6s?<&n;X9hYv!|-Kt-ikA9T_wBGPw-e*+SbM*#0_g( z=+A6_ZU5BGUvbXCAh9_&mS-EWP7A5SLMYtN-Pj#HIL?AF7}upI1fT()J|A;0L8Ey7 z^9iUMhIYhpY5NmsLZWeKO;YxYnB}aIGWGW5WKyt<{5^=VnxvOsJ`QgFm|DDv)g0#% zFgb>zAC0Kdl&maOiSp*P!=GwQ>Csm8s@~!IvEBe?VReu`X8;;9{&Q;doTBa&PHA>L zwYgq(hSz@8@BnE|^FUHU$dF)hFgk(7gb7)g&2#du-UvzmB~PHH#M-8yiTZOxo1h$* zmPp#?r@RTdfd|d1IU_852KaQ7B;RZ!sY|SJx34`OS|N@N*lvS7l_cq=ya6DDJLhGz z?g5a|i_#H2Cx8x>xq05x@C&0lkzn9ST6;FO);6?nR?L#+07U#x)X@9RYn@9bet zv;JZ^t%4h3d%`7aI*RpIH&DbZd{!;!Vw8;}bpsPWUQiSEQd}(;XMtI0V)YbNHM%+N z&clz8=vxtemA9(=OT0B}4%3-)H|4)B=2EoWhtkJSd#pr|kFoHxZ<`|%LeCY;zwcb> z`>l)gzFALQ!_B=TW%Zs;O7bdLmA6U5i;qIp$E||_kFI|)=kr&6f#w*)f<@41xw&s&v^N{m1yuD!Z(OCzR_y=?jIJNK~AXUsGTGj2x5V zm>-lfIL6da8)3fS-?_=L$m=sBB!RZt^g8SF%xE_wB~-gLsGb{35KH_`aai0PTIY1L z$tufseLsL-x=|TIOYx--$E~JrdxXE8Nxjh23{j)1Jn3Tx=YX4--pJkQ9_m-^6QOrz zP48MzOQ2>JW(zOxkZv{IhE8elbx>BYoOlfh=~rLWe)}f&6tmaQC%wpuTuI(%^s~L?d zUV2ngC~r%DGM{CAMci2Xop@|``sqx)J06m(5titA$}Kb6^4H^O1C}dS&J1TUu8OYQ z^)tdjKY=6OPha0e@)SvZiIX#+mh30QD&)z+g@>vfHxu{(BJeYPy4potjY5*gH0ZZ zdrMUd85ywWxbvC30^RT#!#0)rgBr2RnNm1Xut6nzZLqvgt--q_gl?;R-W(dh;pm8x z3mB2L-8XO2Dig>5J*+t{P^DC`q~FU#IdrQ z%)*zl-m+fSiXF#{CPAE&+DQKKvZbVja=A$-LE_sqbbq~2aov64SlZG~IQ za4<9Q+`4;C70ksen3^?NB)Si 
z-cN5n^p)XX+)AqKeK0m~b6}jEwTl*TDrWR#154X>NO1{fqpGv9y#pU=q~XkeTjZC!2;He0aobr(5Eu07 zaLfbk^~}ks@rtPk#%8ZFDeodfV%>Yx8)DharUk%PVHjL9;9J;4-yOdkzVtPwN`PQ9 zrh~{~Ml|eUiRC*Mqd4(LbE&rUhR-fK%9u&7P=(Sg)>{_mJ)6DN5G!)2{vt^h)Qyoz!5^b)sd$o>PE_bm#$MXQ zU2)dpEj0B_+3ydEW5n3p6K-8b+fi#c?C+FKc8!tYzj0Giu;vz&0ia}tuiqPpV|KE& z7Upd=uJeBJQPw+)YwgL{-Cf>mRi!*uZm zSX%uEld|~UQ{{T$5imy@v;aS4@joy7LhSoT|jVtbfcqpd1bA#QOWSjgSXz738(aN-!xdxJlHvLx0r#vW4AjHyl?5;zr< zkuO+-uXv6QgjN&O7?fLLw$-}y7`@dyJq>NIc3U_D{^)XJ>jY4c$I{%_?rCd^fH9S; zk~_88+#_8*TL`_&2pWU;&)Q_bqL4^a(vq}W2gMP4IsDy{AvF(wHl0;KMFA2LjeF+@QA2#Qvi@rF!LXdMcAZ{v39b)>$$J<7jk~&`sjP@x zYhjy|@3{N2_HWM=(p@+;UIG=Zm;}J_F#zK)FULimC=Lu)AbKsY7zh^s`!6l$H_|S?`y97@~bl0_i85%p% z7}~mB5dm<@A$SWkcbk5I0R~ZkWksW1tLsDTSvrOL*O(OeTakUZ~D??{8LaESugCHgASN0v2F-@)uF9jNEEis ztj_kr?C;~(n!_C+iD}P{vEm~}FcG0Embl9SoWcz3Y~^dopOr!3y!oK<8ln+q;l=ID z3aH75M0BL~OX9&MH~4&YGej9xqj|?^S(DJc#|FN9!!gIzS+;-?B;)7ng~L8v;D&&8 zon?;&zxWq2H(KykbmT$^mH-7x5YLogWFjJ_jzEC=8BFB_qt>R@7(;w4Y}^pji-%>> z@$&?cK$Qz%*Ud4ojF{!XzL}n0Lp`$PgHfZSU{FVY-Y>@7xgOCPUuj2OT?x2T7M2DT z!_?Q@_2Brf+4LInpL59)L-YfKY&mbP+;IE$lJ5h*5)V`PS96T0{8}s>6Cp0KsZ~be zaj1R6mlAt%wTARhJWTbz)u?7wb59vQ^hY>XX+>w9Bw(G9{DFe@wdSp@w$+x3%<zGj>tH;PC>##clIGCKEU3|^?}_cHpRB}n%>F8L%mV>T0~9sgaP>AG zyAk+_Y$PW!K`xbefY%3(s%if9jik$&(wqgPw8zNWC zNDzJYpDPtYO7NhMGUev(C4}gGzpw*I0E~w4#nl1R92xPDe8hHSBPve(2ZzqzBYa36 zhws5mjR<8(=8$i&r{Ea(7d&QtP6oQkI+#%~5lCrZ);?Q-PSz_$FfS!E?j&mzyu$h> zvfi^I%|=kbMx6&7>kA;*B3t&aPX%Q~KqZ5X@r7e<)+{*h+4y%=q!2JJgd)lPvo=Pp z>F=s|>;?##*RWsM|E`VD9Txm&LsuZi9i#W^pS7Xf*zk8l;#P2-tyd2V{IfPX)8&7e z`CrYj4CsH&?0I>$Pg{&0{fv6izVr+5YjfL`W9=%B zU)?*j0Cy}S@QJW=qUA<|8(UbD&gd+J!P)P;<1N=49&^W|8C=4Na@7(T{H?OZw$l zi!E%%-Rb4KJkys>EL@?a9sc3;B{#a8Q~_BgSrxVlj2LCz0)_rCD5z}AVb@G_-}u3quG(ypm&iI<%Azi#u*?P&^WlML&NB*CaRX-)kc8Zbu+ zdYT2pGA4)Q3^U3GJA=OpU;H@54R=IhIHz<@i6FI9fPxx~$-HnC0qU=wxslWKM+?kMu-2iMWKCy?v5>eki&UajN^4=x{HxX!Z>T}1~7v6S{PGv^8+2Vz_`I#NE> zac>C^TteZ8qf`FUwf;9dTURTg{_!9CjDt&hV;pY+POjhfONCUKdZ}0uWrM@$c>=V( 
zR>%TYn71Xp*Cb8jD|rSo8z(!!&=4pa=#Ol7y4&B3twyLGrnY=5FH3-i9E>jk z5agt&+uPhtpiX}gOp4CA5*BoOnuzbfVtAq4I;QMlOhbkA_bB>*pU9!h7dVbmwpIno zdnuF7H}g>*=wP*#W~-a!ZL1A%!;aL^x2d-Y*u7AC6kPg}PK1hfb#um= z*}j=e@-ySRW*(J_w2*uPZJ-Q?))uO(l0o-j5^EJx4NAA4yth;P4Ie6LBgsuCTNw(r z(wMIU`lvU7Q-AzlS2qiM9)z#&VD}(@#_y#Vd=^BTnAG98{8eTPD+CrA*iQL_i2fR_66zYn++`{_@^gV#6YVvC*4Cqu>_G z8v*-_r;pk(Bj(s%g~pQh`pe-g<%==ge63ZC`%~X}uFLo)4pY;4^PKXH>vdaDSDQnh zEL~NQs}ZQsUu|+k<2s%z`8yqo0x{#Eomy{Oe~@9A>@?PUQoo;9v;ca(Q6OftF@axR zs!eLfVUBxJzvhnc6qtT2cv0-8TXiT4L=`R6!TRwfG06@68j#QR6}@`v8%eg>kG z`l|nCicUdV_j#$s{V6 zw3esn&7tK=b+DI`$Wm0K0ey$B;^${f?iVX7sw4T!l)Td}?E?{{Xv_<+Xg7k$F=8a< zdqr&Yv$R_MY+{UcEQ5F>0VOw)9oz_csOrEWVlrvT-Th~0C83qs7k{Ei zeCG9Zi4N$^9r@Gvz_8Yr@&&NqmS%yz_};EX@iQZiNj@yp9!l(IaFeKJi|qH4sdx+o zvR`9^!K7f16*GC|NqteNw+phYC;=TN)Y?UecciBJxX}*-YS7 z57^MlR_Zeag;%U5jNO3bL1n&becv3KaGB%Sx{qN5p|=s^_I!KSGp?73t2Lt^ja_DA z3jdAk5p{s=Ec!n2={$Cf`qPo>mD()kw`rF*SB2O4bfIxoy?a_f$UBJt?;_8Ak2|RF zDnMk>;+=2ycMzxuck`>IKxnFN_*C62!6o_$E-U_XdDgZFOyhQM({=q-RqKKHWvSTZ zJE8=GmMh=4U0e4lxH+PE|J29MTM9&35w0vt2RR#hb+aZFP&VTF0B7V+4vj&nFQwd| zJi5c3u%jOYjttiBWE}ql-q_j^VeViB)0pGumIC>#wq@*meape}DHkhT^f}KuFH1i)~hG4?d>!>Q=4c zA^&g%-c4<;5U4o4-Xj7Y9`ahcJ0znz0{?P0-39Qy$||RJCeGi@6Yg{S)2Ev28xGa# zvn&chY=E57fc&0Yp+mhLfLVAN^baU`;pG37Pq4{yP!@R7GGx7YE8Gi$zrS)XQqk{U z)&SHQ)=5?XIfwt*H)v{fjN0-t{1|Mk26HF(|Ql!&Xri z1P%7=!<&3(E&ZHrg`roK8Qnowoi0N9&lgyou~^k9v&gT%k{A)7on2KI6I}Ub7KWsUMYqL|4P*$Xxetf}(u;>0|AzXf(TuXXgm-OW!-1JzX zR-R;Il1mK`ZW@*(*{ZYz>jT22!4$Nv=?| z31|O>zQTB|ms9U2X@d2kNjNLn8OCd^%qE$S7U3XUGiy38XLCyCqd_<~1_57il0D;g zPNd)J0kVCe^41(#q*()^*Wn=BJvA-=*z@s_!vS?6RXE6YFWNT8QbtN73Rk*p{Ggj>nBt=En~M+Z{hVJ36i3lxb%yAf(a2$%cB`LEnT#>h5NV27WksKtA-FN zoOJs3eyuD&kLZ4ZYy;^Nu2%ANmXkm!p3>MZv_0yi)UJ=Q^a-aS-Btsye0oexk$vs$ zn-crkV$m*KOq1w~j^&dD{XQ5-mvBo3lZ8!W`}UKT7Wh{Qj;%<#gp;0bb)u`j?AY%o zZ}~*UEf?mZs~=AK&O++zJKgdLW~33cCEQB*&cb4%0X<}kq)Rx-lyXh0I5Y~k)OueI z4KxY25MEBvM#Y#W;gD`%GPDVYbYsq;aFyWOEFj$ka44MQje1;fApkAH)!0avffnH~ 
zb`ydxoYe7~rVD)3;|nKsvarIIlN@~Eq?S`W*Ip3Pg~Di&$N)ws=qY*IWCIwvTn!jU#kT4S?bK9Yn( zx+y?XN!9)w{34JvX@GDFkW^A}8wUkQ5f1651PBMRB1Je2&-yAJgMQET*q-B4_74F`Bb}4 zO&!0VnEmBq+iiNRXs_wU-N{qUr%5<&Wfj6rhcDWBjt@L6sfJ;~VObtR@mqvu5zWyi9H(*Ip5Z?LDfps&)44GRO=CDi z1NBccF9PrBB-#J2jWJJWJQis*_M&m1O*k{7a69_(Qx?5?VO&tjWG9-VPq>`y=tMOw zgS7Tnf&UoPA~Yut?(XioKK_)-*_3pN=F@qgg(5~YM_0lbO1Y_o3l+_Ag+0~(5ht1> zM7ZmO_h`E%(LRjWLD&^8Ty!rQ2l|9F^cTI2Jl0#1>_f!Pflb4gk;fsC$vQXYU~1t~ zh~^(hNvIOd(Kl&u44pir6wRlEiGxbf93cs3C|yE?yC%tY6K%TN18@#vHFep$R4CkRof~ryT2dW$EVuDGf^&)HfBs*YK5J@<63sDGI1X6JA4xlC zk0M2LNW5)6+kL_wX75GgfJEWmF*t@Hu<1qPK z^vDURk3Qm9n*M-j{Yadp8X9>2&EPnURO63Gd-r-AyESkZnbxshlIkp6()sTP5K(ca+xCh3q|QXMJxR#o?+aX`Z( z$Wu%nu8?SX(2#I3lZT=^H|9V?!i||cWWO&OxJQFYq#oG`}x3oR;Ny^${$Q~&?~07*qoM6N<$f?w7gpa1{> literal 0 HcmV?d00001 diff --git a/docs/overrides/partials/integrations/analytics/custom.html b/docs/overrides/partials/integrations/analytics/custom.html new file mode 100644 index 0000000..96a2301 --- /dev/null +++ b/docs/overrides/partials/integrations/analytics/custom.html @@ -0,0 +1,16 @@ + + + + + diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css new file mode 100644 index 0000000..11a0209 --- /dev/null +++ b/docs/stylesheets/extra.css @@ -0,0 +1,114 @@ +/* theme */ +:root > * { + /* theme */ + --md-primary-fg-color: #115AF4; + --md-primary-fg-color--light: #115AF4; + --md-primary-fg-color--dark: #115AF4; +} + +/* Table formatting */ +.md-typeset table:not([class]) td { + padding: 0.5em 1.25em; +} +.md-typeset table:not([class]) th { + padding: 0.5em 1.25em; +} + +/* convenience class to keep lines from breaking +useful for wrapping table cell text in a span +to force column width */ +.no-wrap { + white-space: nowrap; +} + +/* badge formatting */ +.badge::before { + background-color: #1860F2; + color: white; + font-size: 0.8rem; + font-weight: normal; + padding: 4px 8px; + margin-left: 0.5rem; + vertical-align: super; + text-align: center; + border-radius: 5px; +} + +.badge-api::before { + 
background-color: #1860F2; + color: white; + font-size: 0.8rem; + font-weight: normal; + padding: 4px 8px; + text-align: center; + border-radius: 5px; +} + +.experimental::before { + background-color: #FCD14E; + content: "Experimental"; +} + +.cloud::before { + background-color: #799AF7; + content: "Prefect Cloud"; +} + +.deprecated::before { + background-color: #FA1C2F; + content: "Deprecated"; +} + +.new::before { + background-color: #2AC769; + content: "New"; +} + +.expert::before { + background-color: #726576; + content: "Advanced"; +} + +/* dark mode slate theme */ +/* dark mode code overrides */ +[data-md-color-scheme="slate"] { + --md-code-bg-color: #252a33; + --md-code-fg-color: #eee; + --md-code-hl-color: #3b3d54; + --md-code-hl-name-color: #eee; +} + +/* dark mode link overrides */ +[data-md-color-scheme="slate"] .md-typeset a { + color: var(--blue); +} + +[data-md-color-scheme="slate"] .md-typeset a:hover { + font-weight: bold; +} + +/* dark mode nav overrides */ +[data-md-color-scheme="slate"] .md-nav--primary .md-nav__item--active>.md-nav__link { + color: var(--blue); + font-weight: bold; +} + +[data-md-color-scheme="slate"] .md-nav--primary .md-nav__link--active { + color: var(--blue); + font-weight: bold; +} + +/* dark mode collection catalog overrides */ +[data-md-color-scheme="slate"] .collection-item { + background-color: #3b3d54; +} + +/* dark mode recipe collection overrides */ +[data-md-color-scheme="slate"] .recipe-item { + background-color: #3b3d54; +} + +/* dark mode API doc overrides */ +[data-md-color-scheme="slate"] .prefect-table th { + background-color: #3b3d54; +} \ No newline at end of file diff --git a/docs/tasks.md b/docs/tasks.md new file mode 100644 index 0000000..90b35df --- /dev/null +++ b/docs/tasks.md @@ -0,0 +1,6 @@ +--- +description: +notes: This documentation page is generated from source file docstrings. 
+--- + +::: prefect_datahub.tasks \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..327cd06 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,82 @@ +site_name: prefect-datahub +site_url: https://shubhamjagtap639.github.io/prefect-datahub +repo_url: https://github.com/shubhamjagtap639/prefect-datahub +edit_uri: edit/main/docs/ +theme: + name: material + custom_dir: docs/overrides + favicon: img/favicon.ico + palette: + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to light mode + - media: "(prefers-color-scheme: light)" + accent: blue + primary: blue + scheme: default + toggle: + icon: material/weather-sunny + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + accent: blue + primary: blue + scheme: slate + toggle: + icon: material/weather-night + name: Switch to light mode + icon: + repo: fontawesome/brands/github + logo: + img/prefect-logo-mark-solid-white-500.png + font: + text: Inter + code: Source Code Pro + features: + - content.code.copy + - content.code.annotate +extra_css: + - stylesheets/extra.css +markdown_extensions: + - admonition + - attr_list + - codehilite + - md_in_html + - meta + - pymdownx.highlight: + use_pygments: true + - pymdownx.superfences + - pymdownx.tabbed + - pymdownx.inlinehilite + - pymdownx.snippets + +plugins: + - search + - gen-files: + scripts: + - docs/gen_home_page.py + - docs/gen_examples_catalog.py + - docs/gen_blocks_catalog.py + - mkdocstrings: + handlers: + python: + options: + show_root_heading: True + show_object_full_path: False + show_category_heading: True + show_bases: True + show_signature: False + heading_level: 1 +watch: + - prefect_datahub/ + - README.md + +nav: + - Home: index.md + - Blocks Catalog: blocks_catalog.md + - Examples Catalog: examples_catalog.md + - API Reference: + - Tasks: tasks.md + - Flows: flows.md + + diff --git a/prefect_datahub/__init__.py b/prefect_datahub/__init__.py new file mode 
100644 index 0000000..185a4c9 --- /dev/null +++ b/prefect_datahub/__init__.py @@ -0,0 +1,4 @@ +from . import _version +from .blocks import DatahubBlock # noqa + +__version__ = _version.get_versions()["version"] diff --git a/prefect_datahub/_version.py b/prefect_datahub/_version.py new file mode 100644 index 0000000..04b1004 --- /dev/null +++ b/prefect_datahub/_version.py @@ -0,0 +1,677 @@ +# This file helps to compute a version number in source trees obtained from +# git-archive tarball (such as those provided by githubs download-from-tag +# feature). Distribution tarballs (built by setup.py sdist) and build +# directories (produced by setup.py build) will contain a much shorter file +# that just contains the computed version number. + +# This file is released into the public domain. Generated by +# versioneer-0.21 (https://github.com/python-versioneer/python-versioneer) + +"""Git implementation of _version.py.""" + +import errno +import os +import re +import subprocess +import sys +from typing import Callable, Dict + + +def get_keywords(): + """Get the keywords needed to look up the version information.""" + # these strings will be replaced by git during git-archive. + # setup.py/versioneer.py will grep for the variable names, so they must + # each be defined on a line of their own. _version.py will just call + # get_keywords(). 
class VersioneerConfig:
    """Bare attribute holder for the settings assembled by get_config()."""


def get_config():
    """Return a VersioneerConfig populated with this project's settings."""
    # These literals are filled in when 'setup.py versioneer' creates
    # _version.py.
    settings = (
        ("VCS", "git"),
        ("style", "pep440"),
        ("tag_prefix", ""),
        ("parentdir_prefix", ""),
        ("versionfile_source", "prefect_datahub/_version.py"),
        ("verbose", False),
    )
    cfg = VersioneerConfig()
    for attribute, value in settings:
        setattr(cfg, attribute, value)
    return cfg


class NotThisMethod(Exception):
    """Raised when a version-lookup strategy does not apply here."""


LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}


def register_vcs_handler(vcs, method):  # decorator
    """Build a decorator that records a function as HANDLERS[vcs][method]."""

    def decorate(f):
        """Register f under HANDLERS[vcs][method] and hand it back unchanged."""
        HANDLERS.setdefault(vcs, {})[method] = f
        return f

    return decorate


def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
    """Run the first launchable executable from ``commands`` with ``args``.

    Returns:
        ``(stdout, returncode)`` on success, ``(None, returncode)`` when the
        process exits non-zero, and ``(None, None)`` when nothing could be
        started.
    """
    assert isinstance(commands, list)
    stderr_target = subprocess.PIPE if hide_stderr else None
    process = None
    for candidate in commands:
        argv = [candidate] + args
        dispcmd = str(argv)
        try:
            # shell=False is in effect, so on Windows the caller has to pass
            # git.cmd / git.exe rather than plain git.
            process = subprocess.Popen(
                argv,
                cwd=cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=stderr_target,
            )
        except OSError as exc:
            if exc.errno == errno.ENOENT:
                # This spelling of the executable does not exist; try the next.
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(exc)
            return None, None
        break
    else:
        # The loop ran out of candidates without launching anything.
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None

    output = process.communicate()[0].strip().decode()
    exit_code = process.returncode
    if exit_code != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % output)
        return None, exit_code
    return output, exit_code


def versions_from_parentdir(parentdir_prefix, root, verbose):
    """Derive the version from the name of ``root`` or one of its parents.

    ``root`` itself and the two directories above it are inspected. The first
    whose base name starts with ``parentdir_prefix`` supplies the version (the
    remainder of the name after the prefix).

    Raises:
        NotThisMethod: if none of the three directory names match.
    """
    tried = []
    current = root
    levels_left = 3
    while levels_left:
        levels_left -= 1
        name = os.path.basename(current)
        if name.startswith(parentdir_prefix):
            return {
                "version": name[len(parentdir_prefix) :],
                "full-revisionid": None,
                "dirty": False,
                "error": None,
                "date": None,
            }
        tried.append(current)
        current = os.path.dirname(current)  # climb one level

    if verbose:
        print(
            "Tried directories %s but none started with prefix %s"
            % (str(tried), parentdir_prefix)
        )
    raise NotThisMethod("rootdir doesn't start with parentdir_prefix")


@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Scrape the git keyword assignments out of a _version.py file.

    The file is read as text and matched with a regular expression instead of
    being imported. An unreadable file yields an empty dict.
    """
    wanted = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    found = {}
    try:
        with open(versionfile_abs, "r") as fobj:
            for raw_line in fobj:
                stripped = raw_line.strip()
                for prefix, key in wanted:
                    if not stripped.startswith(prefix):
                        continue
                    match = re.search(r'=\s*"(.*)"', raw_line)
                    if match:
                        found[key] = match.group(1)
    except OSError:
        pass
    return found
The old git %d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r"\d", r)} + if verbose: + print("discarding '%s', no digits" % ",".join(refs - tags)) + if verbose: + print("likely tags: %s" % ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix) :] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r"\d", r): + continue + if verbose: + print("picking %s" % r) + return { + "version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": None, + "date": date, + } + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return { + "version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, + "error": "no suitable tags", + "date": None, + } + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + TAG_PREFIX_REGEX = "*" + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + TAG_PREFIX_REGEX = r"\*" + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %s not under git control" % root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner( + GITS, + [ + "describe", + "--tags", + "--dirty", + "--always", + "--long", + "--match", + "%s%s" % (tag_prefix, TAG_PREFIX_REGEX), + ], + cwd=root, + ) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. 
+ branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[: git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%s' doesn't start with prefix '%s'" + print(fmt % (full_tag, tag_prefix)) + pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( + full_tag, + tag_prefix, + ) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix) :] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. 
def plus_or_dot(pieces):
    """Return "." if the closest tag already contains a "+", else "+".

    A missing or ``None`` "closest-tag" is treated as an empty tag, so the
    function is safe to call on an untagged tree.
    """
    closest_tag = pieces.get("closest-tag") or ""
    return "." if "+" in closest_tag else "+"


def render_pep440(pieces):
    """Build up version string, with post-release "local version identifier".

    Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you
    get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty

    Exceptions:
    1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def render_pep440_branch(pieces):
    """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] .

    The ".dev0" means not master branch. Note that .dev0 sorts backwards
    (a feature branch will appear "older" than the master branch).

    Exceptions:
    1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "%d.g%s" % (pieces["distance"], pieces["short"])
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0"
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+untagged.%d.g%s" % (pieces["distance"], pieces["short"])
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def pep440_split_post(ver):
    """Split pep440 version string at the post-release segment.

    Returns:
        A ``(release, post)`` tuple. ``release`` is everything before the
        first ".post". ``post`` is the post-release number as an int (0 when
        ".post" carries no digits), or ``None`` when the string does not
        contain exactly one ".post" separator.
    """
    vc = str.split(ver, ".post")
    return vc[0], int(vc[1] or 0) if len(vc) == 2 else None


def render_pep440_pre(pieces):
    """TAG[.postN.devDISTANCE] -- No -dirty.

    Exceptions:
    1: no tags. 0.post0.devDISTANCE
    """
    if pieces["closest-tag"]:
        if pieces["distance"]:
            # update the post release segment
            tag_version, post_version = pep440_split_post(pieces["closest-tag"])
            rendered = tag_version
            if post_version is not None:
                rendered += ".post%d.dev%d" % (post_version + 1, pieces["distance"])
            else:
                rendered += ".post0.dev%d" % (pieces["distance"])
        else:
            # no commits, use the tag as the version
            rendered = pieces["closest-tag"]
    else:
        # exception #1
        rendered = "0.post0.dev%d" % pieces["distance"]
    return rendered


def render_pep440_post(pieces):
    """TAG[.postDISTANCE[.dev0]+gHEX] .

    The ".dev0" means dirty. Note that .dev0 sorts backwards
    (a dirty tree will appear "older" than the corresponding clean one),
    but you shouldn't be releasing software with -dirty anyways.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "g%s" % pieces["short"]
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
        rendered += "+g%s" % pieces["short"]
    return rendered


def render_pep440_post_branch(pieces):
    """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] .

    The ".dev0" means not master branch.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]+gHEX[.dirty]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["branch"] != "master":
                rendered += ".dev0"
            rendered += plus_or_dot(pieces)
            rendered += "g%s" % pieces["short"]
            if pieces["dirty"]:
                rendered += ".dirty"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["branch"] != "master":
            rendered += ".dev0"
        rendered += "+g%s" % pieces["short"]
        if pieces["dirty"]:
            rendered += ".dirty"
    return rendered


def render_pep440_old(pieces):
    """TAG[.postDISTANCE[.dev0]] .

    The ".dev0" means dirty.

    Exceptions:
    1: no tags. 0.postDISTANCE[.dev0]
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"] or pieces["dirty"]:
            rendered += ".post%d" % pieces["distance"]
            if pieces["dirty"]:
                rendered += ".dev0"
    else:
        # exception #1
        rendered = "0.post%d" % pieces["distance"]
        if pieces["dirty"]:
            rendered += ".dev0"
    return rendered


def render_git_describe(pieces):
    """TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.

    Exceptions:
    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        if pieces["distance"]:
            rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered


def render_git_describe_long(pieces):
    """TAG-DISTANCE-gHEX[-dirty].

    Like 'git describe --tags --dirty --always --long'.
    The distance/hash is unconditional.

    Exceptions:
    1: no tags. HEX[-dirty]  (note: no 'g' prefix)
    """
    if pieces["closest-tag"]:
        rendered = pieces["closest-tag"]
        rendered += "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exception #1
        rendered = pieces["short"]
    if pieces["dirty"]:
        rendered += "-dirty"
    return rendered


# Style name -> renderer. "default" and empty styles are mapped to "pep440"
# inside render() before this table is consulted.
_RENDERERS = {
    "pep440": render_pep440,
    "pep440-branch": render_pep440_branch,
    "pep440-pre": render_pep440_pre,
    "pep440-post": render_pep440_post,
    "pep440-post-branch": render_pep440_post_branch,
    "pep440-old": render_pep440_old,
    "git-describe": render_git_describe,
    "git-describe-long": render_git_describe_long,
}


def render(pieces, style):
    """Render the given version pieces into the requested style.

    Args:
        pieces: dict with at least "error"; when "error" is falsy it must
            also hold "long", "dirty" and the keys the chosen renderer reads.
        style: one of the keys of ``_RENDERERS``, or "default"/empty for
            "pep440".

    Returns:
        dict with "version", "full-revisionid", "dirty", "error" and "date".

    Raises:
        ValueError: if ``style`` is not a known style name.
    """
    if pieces["error"]:
        # An upstream failure wins over everything, including a bad style.
        return {
            "version": "unknown",
            "full-revisionid": pieces.get("long"),
            "dirty": None,
            "error": pieces["error"],
            "date": None,
        }

    if not style or style == "default":
        style = "pep440"  # the default

    try:
        renderer = _RENDERERS[style]
    except (KeyError, TypeError):
        # TypeError covers an unhashable style, which must still be reported
        # as an unknown style rather than leak a lookup error.
        raise ValueError("unknown style '%s'" % style) from None

    return {
        "version": renderer(pieces),
        "full-revisionid": pieces["long"],
        "dirty": pieces["dirty"],
        "error": None,
        "date": pieces.get("date"),
    }
+ + cfg = get_config() + verbose = cfg.verbose + + try: + return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, verbose) + except NotThisMethod: + pass + + try: + root = os.path.realpath(__file__) + # versionfile_source is the relative path from the top of the source + # tree (where the .git directory might live) to this file. Invert + # this to find the root from __file__. + for _ in cfg.versionfile_source.split("/"): + root = os.path.dirname(root) + except NameError: + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to find root of source tree", + "date": None, + } + + try: + pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) + return render(pieces, cfg.style) + except NotThisMethod: + pass + + try: + if cfg.parentdir_prefix: + return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) + except NotThisMethod: + pass + + return { + "version": "0+unknown", + "full-revisionid": None, + "dirty": None, + "error": "unable to compute version", + "date": None, + } diff --git a/prefect_datahub/blocks.py b/prefect_datahub/blocks.py new file mode 100644 index 0000000..4064c1d --- /dev/null +++ b/prefect_datahub/blocks.py @@ -0,0 +1,35 @@ +"""This is an example blocks module""" + +from prefect.blocks.core import Block +from pydantic import Field + + +class DatahubBlock(Block): + """ + A sample block that holds a value. + + Attributes: + value (str): The value to store. 
"""Emit Prefect flow and task metadata to DataHub over its REST API."""

import asyncio
from typing import Dict, List, Optional, Tuple

from datahub.api.entities.datajob import DataFlow, DataJob
from datahub.api.entities.dataprocess.dataprocess_instance import (
    DataProcessInstance,
    InstanceRunResult,
)
from datahub.emitter.rest_emitter import DataHubRestEmitter
from datahub.utilities.urns.data_flow_urn import DataFlowUrn
from datahub.utilities.urns.data_job_urn import DataJobUrn
from datahub.utilities.urns.dataset_urn import DatasetUrn
from datahub_provider.entities import Dataset, _Entity
from prefect.blocks.core import Block
from prefect.client.orchestration import get_client
from prefect.context import FlowRunContext, TaskRunContext
from pydantic import Field


def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]:
    """Convert inlet/outlet entities into DataHub dataset URNs.

    NOTE(review): assumes every entity exposes a ``urn`` string the way
    ``datahub_provider.entities.Dataset`` does -- confirm against the
    installed DataHub version.
    """
    return [DatasetUrn.create_from_string(let.urn) for let in iolets]


class DatahubEmitter(Block):
    """
    Block used to emit prefect task and flow related metadata to Datahub REST

    Attributes:
        datahub_rest_url (str): Datahub gms rest url.
        cluster (str): Name of the prefect cluster.
        capture_tags_info (boolean): If true, task tags are captured as
            DataHub tags.

    Example:
        Load a stored value:
        ```python
        from prefect_datahub.datahub_emitter import DatahubEmitter
        block = DatahubEmitter.load("BLOCK_NAME")
        ```
    """

    _block_type_name = "datahub emitter"
    # replace this with a relevant logo; defaults to Prefect logo
    _logo_url = "https://images.ctfassets.net/gm98wzqotmnx/08yCE6xpJMX9Kjl5VArDS/c2ede674c20f90b9b6edeab71feffac9/prefect-200x200.png?h=250"  # noqa
    # The previous host ("GS lab.github.io") contained a space and was not a
    # valid URL; this uses the host the project's docs are published under.
    _documentation_url = "https://shubhamjagtap639.github.io/prefect-datahub/blocks/#prefect-datahub.blocks.DatahubBlock"  # noqa

    datahub_rest_url: Optional[str] = Field(
        default="http://localhost:8080",
        title="Datahub rest url",
        description="Datahub gms rest url.",
    )

    cluster: Optional[str] = Field(
        default="prod",
        title="Cluster",
        description="Name of the prefect cluster.",
    )

    capture_tags_info: Optional[bool] = Field(
        default=True,
        title="Capture tags infor",
        description="If true, the tags field of the task and flow will be captured as DataHub tags.",
    )

    @classmethod
    def seed_value_for_example(cls):
        """
        Save a block built from the field defaults as "sample-block".

        The block declares no ``value`` field, so nothing is passed to the
        constructor.
        """
        block = cls()
        block.save("sample-block", overwrite=True)

    def _get_config(self) -> Tuple[str, Optional[str], Optional[int]]:
        """Return ``(host, password, timeout_sec)`` for the REST emitter.

        The host is the configured ``datahub_rest_url``; the local default is
        used only when the field was explicitly cleared.
        """
        host = self.datahub_rest_url or "http://localhost:8080"
        password = None
        timeout_sec = None
        return (host, password, timeout_sec)

    def make_emitter(self) -> "DatahubRestEmitter":
        """Create a DataHub REST emitter from this block's configuration."""
        import datahub.emitter.rest_emitter

        return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config())

    async def _get_flow_run_graph(self, flow_run_id):
        """Fetch ``/flow_runs/{flow_run_id}/graph`` and return the JSON body."""
        response = await get_client()._client.get(f"/flow_runs/{flow_run_id}/graph")
        return response.json()

    async def _get_task_run(self, task_run_id):
        """Read a single task run from the Prefect API."""
        return await get_client().read_task_run(task_run_id)

    def ingest_task(
        self, inputs: Optional[List] = None, outputs: Optional[List] = None
    ) -> None:
        """Emit the currently running task to DataHub as a DataJob.

        Args:
            inputs: entities the task reads from, emitted as inlets.
            outputs: entities the task writes to, emitted as outlets.

        Raises:
            RuntimeError: if no flow-run or task-run context is active.
        """
        flow_run_ctx = FlowRunContext.get()
        task_run_ctx = TaskRunContext.get()
        if flow_run_ctx is None or task_run_ctx is None:
            raise RuntimeError(
                "ingest_task must be called from inside a running Prefect task"
            )

        emitter = self.make_emitter()

        dataflow_urn = DataFlowUrn.create_from_ids(
            orchestrator="prefect",
            env=self.cluster or "prod",
            flow_id=flow_run_ctx.flow.name,
        )

        datajob = DataJob(
            id=task_run_ctx.task.task_key,
            flow_urn=dataflow_urn,
            name=task_run_ctx.task.name,
        )
        datajob.description = task_run_ctx.task.description
        if self.capture_tags_info:
            datajob.tags = task_run_ctx.task.tags
        if inputs is not None:
            datajob.inlets.extend(_entities_to_urn_list(inputs))
        if outputs is not None:
            datajob.outlets.extend(_entities_to_urn_list(outputs))

        task_inputs = task_run_ctx.task_run.task_inputs
        if task_inputs:
            task_run_key_map: Dict[str, str] = {
                str(prefect_future.task_run.id): prefect_future.task_run.task_key
                for prefect_future in flow_run_ctx.task_run_futures
            }
            seen_keys = set()
            # Walk every parameter's inputs instead of one hard-coded
            # parameter name, so any task signature works.
            for parameter_inputs in task_inputs.values():
                for task_input in parameter_inputs:
                    # NOTE(review): only inputs produced by an upstream task
                    # run are expected to carry an ``id`` -- confirm against
                    # Prefect's task-input schema.
                    upstream_id = getattr(task_input, "id", None)
                    if upstream_id is None:
                        continue
                    upstream_key = task_run_key_map.get(str(upstream_id))
                    if upstream_key is None or upstream_key in seen_keys:
                        continue
                    seen_keys.add(upstream_key)
                    datajob.upstream_urns.append(
                        DataJobUrn.create_from_ids(
                            data_flow_urn=str(dataflow_urn), job_id=upstream_key
                        )
                    )
        datajob.emit(emitter)

    def ingest_flow(self) -> None:
        """Emit the running flow and the task runs it has submitted so far.

        Raises:
            RuntimeError: if no flow-run context is active.
            Exception: if a task run is in a state other than Completed,
                Failed or Cancelled.
        """
        flow_run_ctx = FlowRunContext.get()
        if flow_run_ctx is None:
            raise RuntimeError(
                "ingest_flow must be called from inside a running Prefect flow"
            )

        emitter = self.make_emitter()

        dataflow = DataFlow(
            cluster=self.cluster or "prod",
            id=flow_run_ctx.flow.name,
            orchestrator="prefect",
        )
        dataflow.description = flow_run_ctx.flow.description
        dataflow.emit(emitter)

        flow_start_millis = int(flow_run_ctx.flow_run.start_time.timestamp() * 1000)
        dpi = DataProcessInstance.from_dataflow(
            dataflow=dataflow, id=flow_run_ctx.flow_run.name
        )
        dpi.emit_process_start(
            emitter=emitter, start_timestamp_millis=flow_start_millis
        )
        # NOTE(review): the end time is a fixed start+5s with a SUCCESS
        # result, presumably a placeholder because the flow is still running
        # at this point -- confirm the intended behaviour.
        dpi.emit_process_end(
            emitter=emitter,
            end_timestamp_millis=flow_start_millis + 5000,
            result=InstanceRunResult.SUCCESS,
            result_type="prefect",
        )

        result_by_state = {
            "Completed": InstanceRunResult.SUCCESS,
            "Failed": InstanceRunResult.FAILURE,
            "Cancelled": InstanceRunResult.SKIPPED,
        }

        for prefect_future in flow_run_ctx.task_run_futures:
            task_run = asyncio.run(self._get_task_run(prefect_future.task_run.id))
            datajob = DataJob(id=task_run.task_key, flow_urn=dataflow.urn)

            if task_run.state_name not in result_by_state:
                # Report the offending state of this task run.
                raise Exception(
                    f"Result should be either success or failure and it was {task_run.state_name}"
                )
            result = result_by_state[task_run.state_name]

            dpi = DataProcessInstance.from_datajob(
                datajob=datajob,
                id=f"{flow_run_ctx.flow_run.name}.{task_run.name}",
                clone_inlets=True,
                clone_outlets=True,
            )
            dpi.emit_process_start(
                emitter=emitter,
                start_timestamp_millis=int(task_run.start_time.timestamp() * 1000),
                emit_template=False,
            )
            dpi.emit_process_end(
                emitter=emitter,
                end_timestamp_millis=int(task_run.end_time.timestamp() * 1000),
                result=result,
                result_type="prefect",
            )
+ """ + DatahubBlock.seed_value_for_example() + block = DatahubBlock.load("sample-block") + + print(hello_prefect_datahub()) + print(f"The block's value: {block.value}") + print(goodbye_prefect_datahub()) + return "Done" + + +if __name__ == "__main__": + hello_and_goodbye() diff --git a/prefect_datahub/tasks.py b/prefect_datahub/tasks.py new file mode 100644 index 0000000..ba5ca98 --- /dev/null +++ b/prefect_datahub/tasks.py @@ -0,0 +1,24 @@ +"""This is an example tasks module""" +from prefect import task + + +@task +def hello_prefect_datahub() -> str: + """ + Sample task that says hello! + + Returns: + A greeting for your collection + """ + return "Hello, prefect-datahub!" + + +@task +def goodbye_prefect_datahub() -> str: + """ + Sample task that says goodbye! + + Returns: + A farewell for your collection + """ + return "Goodbye, prefect-datahub!" diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..1c84ddc --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,16 @@ +pytest +black +flake8 +mypy +mkdocs +mkdocs-material +mkdocstrings[python] +isort +pre-commit +pytest-asyncio +mock; python_version < '3.8' +mkdocs-gen-files +interrogate +coverage +pillow +acryl-datahub[datahub-rest] \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4ec3de6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +prefect>=2.0.0 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..17d7e84 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,39 @@ +[flake8] +exclude = .git,__pycache__,build,dist +per-file-ignores = + setup.py:E501 +# Match black line-length +max-line-length = 88 +extend-ignore = + E203, + +[isort] +skip = __init__.py +profile = black +skip_gitignore = True +multi_line_output = 3 + +[versioneer] +VCS = git +style = pep440 +versionfile_source = prefect_datahub/_version.py +versionfile_build = prefect_datahub/_version.py +tag_prefix = v +parentdir_prefix = + 
+[tool:interrogate] +ignore-init-module = True +ignore_init_method = True +exclude = prefect_datahub/_version.py, tests, setup.py, versioneer.py, docs, site +fail-under = 95 +omit-covered-files = True + +[coverage:run] +omit = tests/*, prefect_datahub/_version.py + +[coverage:report] +fail_under = 80 +show_missing = True + +[tool:pytest] +asyncio_mode = auto diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8852888 --- /dev/null +++ b/setup.py @@ -0,0 +1,47 @@ +from setuptools import find_packages, setup + +import versioneer + +with open("requirements.txt") as install_requires_file: + install_requires = install_requires_file.read().strip().split("\n") + +with open("requirements-dev.txt") as dev_requires_file: + dev_requires = dev_requires_file.read().strip().split("\n") + +with open("README.md") as readme_file: + readme = readme_file.read() + +setup( + name="prefect-datahub", + description="Block used to emit prefect task and flow related metadata to Datahub REST", + license="Apache License 2.0", + author="Shubham Jagtap", + author_email="shubham.jagtap@gslab.com", + keywords="prefect", + url="https://github.com/shubhamjagtap639/prefect-datahub", + long_description=readme, + long_description_content_type="text/markdown", + version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), + packages=find_packages(exclude=("tests", "docs")), + python_requires=">=3.7", + install_requires=install_requires, + extras_require={"dev": dev_requires}, + entry_points={ + "prefect.collections": [ + "prefect_datahub = prefect_datahub", + ] + }, + classifiers=[ + "Natural Language :: English", + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + 
"Topic :: Software Development :: Libraries", + ], +) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..ca7cae7 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,22 @@ +import pytest +from prefect.testing.utilities import prefect_test_harness + + +@pytest.fixture(scope="session", autouse=True) +def prefect_db(): + """ + Sets up test harness for temporary DB during test runs. + """ + with prefect_test_harness(): + yield + + +@pytest.fixture(autouse=True) +def reset_object_registry(): + """ + Ensures each test has a clean object registry. + """ + from prefect.context import PrefectObjectRegistry + + with PrefectObjectRegistry(): + yield diff --git a/tests/test_block_standards.py b/tests/test_block_standards.py new file mode 100644 index 0000000..496c128 --- /dev/null +++ b/tests/test_block_standards.py @@ -0,0 +1,22 @@ +import pytest +from prefect.blocks.core import Block +from prefect.testing.standard_test_suites import BlockStandardTestSuite +from prefect.utilities.dispatch import get_registry_for_type +from prefect.utilities.importtools import to_qualified_name + + +def find_module_blocks(): + blocks = get_registry_for_type(Block) + module_blocks = [ + block + for block in blocks.values() + if to_qualified_name(block).startswith("prefect_datahub") + ] + return module_blocks + + +@pytest.mark.parametrize("block", find_module_blocks()) +class TestAllBlocksAdhereToStandards(BlockStandardTestSuite): + @pytest.fixture + def block(self, block): + return block diff --git a/tests/test_flows.py b/tests/test_flows.py new file mode 100644 index 0000000..e5d8021 --- /dev/null +++ b/tests/test_flows.py @@ -0,0 +1,6 @@ +from prefect_datahub.flows import hello_and_goodbye + + +def test_hello_and_goodbye_flow(): + result = hello_and_goodbye() + assert result == "Done" diff --git a/tests/test_tasks.py b/tests/test_tasks.py new file mode 100644 index 0000000..71e5b46 --- /dev/null +++ b/tests/test_tasks.py @@ -0,0 +1,24 @@ +from prefect import 
def test_hello_prefect_datahub():
    """The hello task returns its greeting when called from a flow."""

    @flow
    def test_flow():
        return hello_prefect_datahub()

    result = test_flow()
    assert result == "Hello, prefect-datahub!"


def test_goodbye_prefect_datahub():
    """The goodbye task returns its farewell when called from a flow.

    This function was previously named ``goodbye_hello_prefect_datahub``;
    without the ``test_`` prefix pytest's default discovery never collected
    it, so the assertion below was never executed.
    """

    @flow
    def test_flow():
        return goodbye_prefect_datahub()

    result = test_flow()
    assert result == "Goodbye, prefect-datahub!"
+ + +## Quick Install + +* `pip install versioneer` to somewhere in your $PATH +* add a `[versioneer]` section to your setup.cfg (see [Install](INSTALL.md)) +* run `versioneer install` in your source tree, commit the results +* Verify version information with `python setup.py version` + +## Version Identifiers + +Source trees come from a variety of places: + +* a version-control system checkout (mostly used by developers) +* a nightly tarball, produced by build automation +* a snapshot tarball, produced by a web-based VCS browser, like github's + "tarball from tag" feature +* a release tarball, produced by "setup.py sdist", distributed through PyPI + +Within each source tree, the version identifier (either a string or a number, +this tool is format-agnostic) can come from a variety of places: + +* ask the VCS tool itself, e.g. "git describe" (for checkouts), which knows + about recent "tags" and an absolute revision-id +* the name of the directory into which the tarball was unpacked +* an expanded VCS keyword ($Id$, etc) +* a `_version.py` created by some earlier build step + +For released software, the version identifier is closely related to a VCS +tag. Some projects use tag names that include more than just the version +string (e.g. "myproject-1.2" instead of just "1.2"), in which case the tool +needs to strip the tag prefix to extract the version identifier. For +unreleased software (between tags), the version identifier should provide +enough information to help developers recreate the same tree, while also +giving them an idea of roughly how old the tree is (after version 1.2, before +version 1.3). Many VCS systems can report a description that captures this, +for example `git describe --tags --dirty --always` reports things like +"0.7-1-g574ab98-dirty" to indicate that the checkout is one revision past the +0.7 tag, has a unique revision id of "574ab98", and is "dirty" (it has +uncommitted changes). 
+ +The version identifier is used for multiple purposes: + +* to allow the module to self-identify its version: `myproject.__version__` +* to choose a name and prefix for a 'setup.py sdist' tarball + +## Theory of Operation + +Versioneer works by adding a special `_version.py` file into your source +tree, where your `__init__.py` can import it. This `_version.py` knows how to +dynamically ask the VCS tool for version information at import time. + +`_version.py` also contains `$Revision$` markers, and the installation +process marks `_version.py` to have this marker rewritten with a tag name +during the `git archive` command. As a result, generated tarballs will +contain enough information to get the proper version. + +To allow `setup.py` to compute a version too, a `versioneer.py` is added to +the top level of your source tree, next to `setup.py` and the `setup.cfg` +that configures it. This overrides several distutils/setuptools commands to +compute the version when invoked, and changes `setup.py build` and `setup.py +sdist` to replace `_version.py` with a small static file that contains just +the generated version data. + +## Installation + +See [INSTALL.md](./INSTALL.md) for detailed installation instructions. + +## Version-String Flavors + +Code which uses Versioneer can learn about its version string at runtime by +importing `_version` from your main `__init__.py` file and running the +`get_versions()` function. From the "outside" (e.g. in `setup.py`), you can +import the top-level `versioneer.py` and run `get_versions()`. + +Both functions return a dictionary with different flavors of version +information: + +* `['version']`: A condensed version string, rendered using the selected + style. This is the most commonly used value for the project's version + string. The default "pep440" style yields strings like `0.11`, + `0.11+2.g1076c97`, or `0.11+2.g1076c97.dirty`. See the "Styles" section + below for alternative styles. 
+ +* `['full-revisionid']`: detailed revision identifier. For Git, this is the + full SHA1 commit id, e.g. "1076c978a8d3cfc70f408fe5974aa6c092c949ac". + +* `['date']`: Date and time of the latest `HEAD` commit. For Git, it is the + commit date in ISO 8601 format. This will be None if the date is not + available. + +* `['dirty']`: a boolean, True if the tree has uncommitted changes. Note that + this is only accurate if run in a VCS checkout, otherwise it is likely to + be False or None + +* `['error']`: if the version string could not be computed, this will be set + to a string describing the problem, otherwise it will be None. It may be + useful to throw an exception in setup.py if this is set, to avoid e.g. + creating tarballs with a version string of "unknown". + +Some variants are more useful than others. Including `full-revisionid` in a +bug report should allow developers to reconstruct the exact code being tested +(or indicate the presence of local changes that should be shared with the +developers). `version` is suitable for display in an "about" box or a CLI +`--version` output: it can be easily compared against release notes and lists +of bugs fixed in various releases. + +The installer adds the following text to your `__init__.py` to place a basic +version in `YOURPROJECT.__version__`: + + from ._version import get_versions + __version__ = get_versions()['version'] + del get_versions + +## Styles + +The setup.cfg `style=` configuration controls how the VCS information is +rendered into a version string. + +The default style, "pep440", produces a PEP440-compliant string, equal to the +un-prefixed tag name for actual releases, and containing an additional "local +version" section with more detail for in-between builds. For Git, this is +TAG[+DISTANCE.gHEX[.dirty]] , using information from `git describe --tags +--dirty --always`. 
For example "0.11+2.g1076c97.dirty" indicates that the +tree is like the "1076c97" commit but has uncommitted changes (".dirty"), and +that this commit is two revisions ("+2") beyond the "0.11" tag. For released +software (exactly equal to a known tag), the identifier will only contain the +stripped tag, e.g. "0.11". + +Other styles are available. See [details.md](details.md) in the Versioneer +source tree for descriptions. + +## Debugging + +Versioneer tries to avoid fatal errors: if something goes wrong, it will tend +to return a version of "0+unknown". To investigate the problem, run `setup.py +version`, which will run the version-lookup code in a verbose mode, and will +display the full contents of `get_versions()` (including the `error` string, +which may help identify what went wrong). + +## Known Limitations + +Some situations are known to cause problems for Versioneer. This details the +most significant ones. More can be found on Github +[issues page](https://github.com/python-versioneer/python-versioneer/issues). + +### Subprojects + +Versioneer has limited support for source trees in which `setup.py` is not in +the root directory (e.g. `setup.py` and `.git/` are *not* siblings). There are +two common reasons why `setup.py` might not be in the root: + +* Source trees which contain multiple subprojects, such as + [Buildbot](https://github.com/buildbot/buildbot), which contains both + "master" and "slave" subprojects, each with their own `setup.py`, + `setup.cfg`, and `tox.ini`. Projects like these produce multiple PyPI + distributions (and upload multiple independently-installable tarballs). +* Source trees whose main purpose is to contain a C library, but which also + provide bindings to Python (and perhaps other languages) in subdirectories. + +Versioneer will look for `.git` in parent directories, and most operations +should get the right version string.
However `pip` and `setuptools` have bugs +and implementation details which frequently cause `pip install .` from a +subproject directory to fail to find a correct version string (so it usually +defaults to `0+unknown`). + +`pip install --editable .` should work correctly. `setup.py install` might +work too. + +Pip-8.1.1 is known to have this problem, but hopefully it will get fixed in +some later version. + +[Bug #38](https://github.com/python-versioneer/python-versioneer/issues/38) is tracking +this issue. The discussion in +[PR #61](https://github.com/python-versioneer/python-versioneer/pull/61) describes the +issue from the Versioneer side in more detail. +[pip PR#3176](https://github.com/pypa/pip/pull/3176) and +[pip PR#3615](https://github.com/pypa/pip/pull/3615) contain work to improve +pip to let Versioneer work correctly. + +Versioneer-0.16 and earlier only looked for a `.git` directory next to the +`setup.cfg`, so subprojects were completely unsupported with those releases. + +### Editable installs with setuptools <= 18.5 + +`setup.py develop` and `pip install --editable .` allow you to install a +project into a virtualenv once, then continue editing the source code (and +test) without re-installing after every change. + +"Entry-point scripts" (`setup(entry_points={"console_scripts": ..})`) are a +convenient way to specify executable scripts that should be installed along +with the python package. + +These both work as expected when using modern setuptools. When using +setuptools-18.5 or earlier, however, certain operations will cause +`pkg_resources.DistributionNotFound` errors when running the entrypoint +script, which must be resolved by re-installing the package. This happens +when the install happens with one version, then the egg_info data is +regenerated while a different version is checked out. Many setup.py commands +cause egg_info to be rebuilt (including `sdist`, `wheel`, and installing into +a different virtualenv), so this can be surprising. 
+ +[Bug #83](https://github.com/python-versioneer/python-versioneer/issues/83) describes +this one, but upgrading to a newer version of setuptools should probably +resolve it. + + +## Updating Versioneer + +To upgrade your project to a new release of Versioneer, do the following: + +* install the new Versioneer (`pip install -U versioneer` or equivalent) +* edit `setup.cfg`, if necessary, to include any new configuration settings + indicated by the release notes. See [UPGRADING](./UPGRADING.md) for details. +* re-run `versioneer install` in your source tree, to replace + `SRC/_version.py` +* commit any changed files + +## Future Directions + +This tool is designed to be easily extended to other version-control +systems: all VCS-specific components are in separate directories like +src/git/ . The top-level `versioneer.py` script is assembled from these +components by running make-versioneer.py . In the future, make-versioneer.py +will take a VCS name as an argument, and will construct a version of +`versioneer.py` that is specific to the given VCS. It might also take the +configuration arguments that are currently provided manually during +installation by editing setup.py . Alternatively, it might go the other +direction and include code from all supported VCS systems, reducing the +number of intermediate scripts. + +## Similar projects + +* [setuptools_scm](https://github.com/pypa/setuptools_scm/) - a non-vendored build-time + dependency +* [miniver](https://github.com/jbweston/miniver) - a lightweight reimplementation of + versioneer +* [versioningit](https://github.com/jwodder/versioningit) - a PEP 518-based setuptools + plugin + +## License + +To make Versioneer easier to embed, all its code is dedicated to the public +domain. The `_version.py` that it creates is also in the public domain.
# pylint:disable=invalid-name,import-outside-toplevel,missing-function-docstring
# pylint:disable=missing-class-docstring,too-many-branches,too-many-statements
# pylint:disable=raise-missing-from,too-many-lines,too-many-locals,import-error
# pylint:disable=too-few-public-methods,redefined-outer-name,consider-using-with
# pylint:disable=attribute-defined-outside-init,too-many-arguments

import configparser
import errno
import json
import os
import re
import subprocess
import sys
from typing import Callable, Dict


class VersioneerConfig:
    """Plain attribute holder for the Versioneer settings."""


def get_root():
    """Locate the project root directory.

    The current working directory is tried first; if it holds neither
    setup.py nor versioneer.py, the directory of ``sys.argv[0]`` is tried
    (which allows 'python path/to/setup.py COMMAND').

    Returns:
        The resolved root directory path.

    Raises:
        VersioneerBadRootError: If neither candidate directory contains
            setup.py or versioneer.py.
    """

    def marker_files(base):
        # (setup.py, versioneer.py) paths underneath ``base``
        return os.path.join(base, "setup.py"), os.path.join(base, "versioneer.py")

    def any_present(paths):
        return any(os.path.exists(path) for path in paths)

    root = os.path.realpath(os.path.abspath(os.getcwd()))
    markers = marker_files(root)
    if not any_present(markers):
        # allow 'python path/to/setup.py COMMAND'
        root = os.path.dirname(os.path.realpath(os.path.abspath(sys.argv[0])))
        markers = marker_files(root)
        if not any_present(markers):
            raise VersioneerBadRootError(
                "Versioneer was unable to run the project root directory. "
                "Versioneer requires setup.py to be executed from "
                "its immediate directory (like 'python setup.py COMMAND'), "
                "or in a way that lets it use sys.argv[0] to find the root "
                "(like 'python path/to/setup.py COMMAND')."
            )
    versioneer_py = markers[1]

    # Certain runtime workflows (setup.py install/develop in a setuptools
    # tree) execute all dependencies in a single python process, so
    # "versioneer" may be imported multiple times, and python's shared
    # module-import table will cache the first one. So os.path.dirname(__file__)
    # cannot be trusted to name the root; it is only used to warn on mismatch.
    try:
        this_file = os.path.realpath(os.path.abspath(__file__))
    except NameError:
        return root

    running_from = os.path.normcase(os.path.splitext(this_file)[0])
    expected = os.path.normcase(os.path.splitext(versioneer_py)[0])
    if running_from != expected:
        print(
            "Warning: build in %s is using versioneer.py from %s"
            % (os.path.dirname(this_file), versioneer_py)
        )
    return root


def get_config_from_root(root):
    """Build a VersioneerConfig from the ``[versioneer]`` section of setup.cfg.

    Args:
        root: Directory that contains setup.cfg.

    Returns:
        A VersioneerConfig whose attributes mirror the section's options;
        optional entries that are absent become None (``style`` becomes "").

    Raises:
        OSError: If setup.cfg cannot be opened.
        configparser.NoSectionError: If there is no ``[versioneer]`` section.
        configparser.NoOptionError: If the mandatory ``VCS`` option is missing.
    """
    parser = configparser.ConfigParser()
    with open(os.path.join(root, "setup.cfg"), "r") as handle:
        parser.read_file(handle)

    vcs_name = parser.get("versioneer", "VCS")  # mandatory
    options = parser["versioneer"]

    settings = VersioneerConfig()
    settings.VCS = vcs_name
    settings.style = options.get("style", "")
    for key in (
        "versionfile_source",
        "versionfile_build",
        "tag_prefix",
        "parentdir_prefix",
        "verbose",
    ):
        setattr(settings, key, options.get(key))
    # A literally quoted empty string in setup.cfg means "no prefix".
    if settings.tag_prefix in ("''", '""'):
        settings.tag_prefix = ""
    return settings


class NotThisMethod(Exception):
    """Signals that a version-lookup method does not apply in this scenario."""


# these dictionaries contain VCS-specific tools
LONG_VERSION_PY: Dict[str, str] = {}
HANDLERS: Dict[str, Dict[str, Callable]] = {}


def register_vcs_handler(vcs, method):  # decorator
    """Return a decorator that records a function in HANDLERS[vcs][method]."""

    def decorate(func):
        """Register func under HANDLERS[vcs][method] and hand it back."""
        if vcs not in HANDLERS:
            HANDLERS[vcs] = {}
        HANDLERS[vcs][method] = func
        return func

    return decorate


def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, env=None):
    """Run the first launchable executable from ``commands`` with ``args``.

    Args:
        commands: List of candidate executable names, tried in order.
        args: List of arguments appended to the chosen executable.
        cwd: Working directory for the child process.
        verbose: Print diagnostics when something goes wrong.
        hide_stderr: Capture the child's stderr instead of inheriting it.
        env: Environment mapping for the child process.

    Returns:
        ``(stdout, returncode)`` on success; ``(None, returncode)`` when the
        process exits non-zero; ``(None, None)`` when nothing could be started.
    """
    assert isinstance(commands, list)
    stderr_target = subprocess.PIPE if hide_stderr else None
    process = None
    for command in commands:
        argv = [command] + args
        dispcmd = str(argv)
        try:
            # remember shell=False, so use git.cmd on windows, not just git
            process = subprocess.Popen(
                argv,
                cwd=cwd,
                env=env,
                stdout=subprocess.PIPE,
                stderr=stderr_target,
            )
        except OSError as exc:
            if exc.errno == errno.ENOENT:
                # executable not found: move on to the next candidate
                continue
            if verbose:
                print("unable to run %s" % dispcmd)
                print(exc)
            return None, None
        break

    if process is None:
        if verbose:
            print("unable to find command, tried %s" % (commands,))
        return None, None

    stdout = process.communicate()[0].strip().decode()
    exit_code = process.returncode
    if exit_code != 0:
        if verbose:
            print("unable to run %s (error)" % dispcmd)
            print("stdout was %s" % stdout)
        return None, exit_code
    return stdout, exit_code
+ git_refnames = "%(DOLLAR)sFormat:%%d%(DOLLAR)s" + git_full = "%(DOLLAR)sFormat:%%H%(DOLLAR)s" + git_date = "%(DOLLAR)sFormat:%%ci%(DOLLAR)s" + keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} + return keywords + + +class VersioneerConfig: + """Container for Versioneer configuration parameters.""" + + +def get_config(): + """Create, populate and return the VersioneerConfig() object.""" + # these strings are filled in when 'setup.py versioneer' creates + # _version.py + cfg = VersioneerConfig() + cfg.VCS = "git" + cfg.style = "%(STYLE)s" + cfg.tag_prefix = "%(TAG_PREFIX)s" + cfg.parentdir_prefix = "%(PARENTDIR_PREFIX)s" + cfg.versionfile_source = "%(VERSIONFILE_SOURCE)s" + cfg.verbose = False + return cfg + + +class NotThisMethod(Exception): + """Exception raised if a method is not valid for the current scenario.""" + + +LONG_VERSION_PY: Dict[str, str] = {} +HANDLERS: Dict[str, Dict[str, Callable]] = {} + + +def register_vcs_handler(vcs, method): # decorator + """Create decorator to mark a method as the handler of a VCS.""" + def decorate(f): + """Store f in HANDLERS[vcs][method].""" + if vcs not in HANDLERS: + HANDLERS[vcs] = {} + HANDLERS[vcs][method] = f + return f + return decorate + + +def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, + env=None): + """Call the given command(s).""" + assert isinstance(commands, list) + process = None + for command in commands: + try: + dispcmd = str([command] + args) + # remember shell=False, so use git.cmd on windows, not just git + process = subprocess.Popen([command] + args, cwd=cwd, env=env, + stdout=subprocess.PIPE, + stderr=(subprocess.PIPE if hide_stderr + else None)) + break + except OSError: + e = sys.exc_info()[1] + if e.errno == errno.ENOENT: + continue + if verbose: + print("unable to run %%s" %% dispcmd) + print(e) + return None, None + else: + if verbose: + print("unable to find command, tried %%s" %% (commands,)) + return None, None + stdout = 
process.communicate()[0].strip().decode() + if process.returncode != 0: + if verbose: + print("unable to run %%s (error)" %% dispcmd) + print("stdout was %%s" %% stdout) + return None, process.returncode + return stdout, process.returncode + + +def versions_from_parentdir(parentdir_prefix, root, verbose): + """Try to determine the version from the parent directory name. + + Source tarballs conventionally unpack into a directory that includes both + the project name and a version string. We will also support searching up + two directory levels for an appropriately named parent directory + """ + rootdirs = [] + + for _ in range(3): + dirname = os.path.basename(root) + if dirname.startswith(parentdir_prefix): + return {"version": dirname[len(parentdir_prefix):], + "full-revisionid": None, + "dirty": False, "error": None, "date": None} + rootdirs.append(root) + root = os.path.dirname(root) # up a level + + if verbose: + print("Tried directories %%s but none started with prefix %%s" %% + (str(rootdirs), parentdir_prefix)) + raise NotThisMethod("rootdir doesn't start with parentdir_prefix") + + +@register_vcs_handler("git", "get_keywords") +def git_get_keywords(versionfile_abs): + """Extract version information from the given file.""" + # the code embedded in _version.py can just fetch the value of these + # keywords. When used from setup.py, we don't want to import _version.py, + # so we do it with a regexp instead. This function is not used from + # _version.py. 
+ keywords = {} + try: + with open(versionfile_abs, "r") as fobj: + for line in fobj: + if line.strip().startswith("git_refnames ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["refnames"] = mo.group(1) + if line.strip().startswith("git_full ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["full"] = mo.group(1) + if line.strip().startswith("git_date ="): + mo = re.search(r'=\s*"(.*)"', line) + if mo: + keywords["date"] = mo.group(1) + except OSError: + pass + return keywords + + +@register_vcs_handler("git", "keywords") +def git_versions_from_keywords(keywords, tag_prefix, verbose): + """Get version information from git keywords.""" + if "refnames" not in keywords: + raise NotThisMethod("Short version file found") + date = keywords.get("date") + if date is not None: + # Use only the last line. Previous lines may contain GPG signature + # information. + date = date.splitlines()[-1] + + # git-2.2.0 added "%%cI", which expands to an ISO-8601 -compliant + # datestamp. However we prefer "%%ci" (which expands to an "ISO-8601 + # -like" string, which we must then edit to make compliant), because + # it's been around since git-1.5.3, and it's too difficult to + # discover which version we're using, or to work around using an + # older one. + date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + refnames = keywords["refnames"].strip() + if refnames.startswith("$Format"): + if verbose: + print("keywords are unexpanded, not using") + raise NotThisMethod("unexpanded keywords, not a git-archive tarball") + refs = {r.strip() for r in refnames.strip("()").split(",")} + # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of + # just "foo-1.0". If we see a "tag: " prefix, prefer those. + TAG = "tag: " + tags = {r[len(TAG):] for r in refs if r.startswith(TAG)} + if not tags: + # Either we're using git < 1.8.3, or there really are no tags. We use + # a heuristic: assume all version tags have a digit. 
The old git %%d + # expansion behaves like git log --decorate=short and strips out the + # refs/heads/ and refs/tags/ prefixes that would let us distinguish + # between branches and tags. By ignoring refnames without digits, we + # filter out many common branch names like "release" and + # "stabilization", as well as "HEAD" and "master". + tags = {r for r in refs if re.search(r'\d', r)} + if verbose: + print("discarding '%%s', no digits" %% ",".join(refs - tags)) + if verbose: + print("likely tags: %%s" %% ",".join(sorted(tags))) + for ref in sorted(tags): + # sorting will prefer e.g. "2.0" over "2.0rc1" + if ref.startswith(tag_prefix): + r = ref[len(tag_prefix):] + # Filter out refs that exactly match prefix or that don't start + # with a number once the prefix is stripped (mostly a concern + # when prefix is '') + if not re.match(r'\d', r): + continue + if verbose: + print("picking %%s" %% r) + return {"version": r, + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": None, + "date": date} + # no suitable tags, so version is "0+unknown", but full hex is still there + if verbose: + print("no suitable tags, using unknown + full revision id") + return {"version": "0+unknown", + "full-revisionid": keywords["full"].strip(), + "dirty": False, "error": "no suitable tags", "date": None} + + +@register_vcs_handler("git", "pieces_from_vcs") +def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command): + """Get version from 'git describe' in the root of the source tree. + + This only gets called if the git-archive 'subst' keywords were *not* + expanded, and _version.py hasn't already been rewritten with a short + version string, meaning we're inside a checked out source tree. 
+ """ + GITS = ["git"] + TAG_PREFIX_REGEX = "*" + if sys.platform == "win32": + GITS = ["git.cmd", "git.exe"] + TAG_PREFIX_REGEX = r"\*" + + _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, + hide_stderr=True) + if rc != 0: + if verbose: + print("Directory %%s not under git control" %% root) + raise NotThisMethod("'git rev-parse --git-dir' returned error") + + # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] + # if there isn't one, this yields HEX[-dirty] (no NUM) + describe_out, rc = runner(GITS, ["describe", "--tags", "--dirty", + "--always", "--long", + "--match", + "%%s%%s" %% (tag_prefix, TAG_PREFIX_REGEX)], + cwd=root) + # --long was added in git-1.5.5 + if describe_out is None: + raise NotThisMethod("'git describe' failed") + describe_out = describe_out.strip() + full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root) + if full_out is None: + raise NotThisMethod("'git rev-parse' failed") + full_out = full_out.strip() + + pieces = {} + pieces["long"] = full_out + pieces["short"] = full_out[:7] # maybe improved later + pieces["error"] = None + + branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], + cwd=root) + # --abbrev-ref was added in git-1.6.3 + if rc != 0 or branch_name is None: + raise NotThisMethod("'git rev-parse --abbrev-ref' returned error") + branch_name = branch_name.strip() + + if branch_name == "HEAD": + # If we aren't exactly on a branch, pick a branch which represents + # the current commit. If all else fails, we are on a branchless + # commit. + branches, rc = runner(GITS, ["branch", "--contains"], cwd=root) + # --contains was added in git-1.5.4 + if rc != 0 or branches is None: + raise NotThisMethod("'git branch --contains' returned error") + branches = branches.split("\n") + + # Remove the first line if we're running detached + if "(" in branches[0]: + branches.pop(0) + + # Strip off the leading "* " from the list of branches. 
+ branches = [branch[2:] for branch in branches] + if "master" in branches: + branch_name = "master" + elif not branches: + branch_name = None + else: + # Pick the first branch that is returned. Good or bad. + branch_name = branches[0] + + pieces["branch"] = branch_name + + # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] + # TAG might have hyphens. + git_describe = describe_out + + # look for -dirty suffix + dirty = git_describe.endswith("-dirty") + pieces["dirty"] = dirty + if dirty: + git_describe = git_describe[:git_describe.rindex("-dirty")] + + # now we have TAG-NUM-gHEX or HEX + + if "-" in git_describe: + # TAG-NUM-gHEX + mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) + if not mo: + # unparsable. Maybe git-describe is misbehaving? + pieces["error"] = ("unable to parse git-describe output: '%%s'" + %% describe_out) + return pieces + + # tag + full_tag = mo.group(1) + if not full_tag.startswith(tag_prefix): + if verbose: + fmt = "tag '%%s' doesn't start with prefix '%%s'" + print(fmt %% (full_tag, tag_prefix)) + pieces["error"] = ("tag '%%s' doesn't start with prefix '%%s'" + %% (full_tag, tag_prefix)) + return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] + + # distance: number of commits since tag + pieces["distance"] = int(mo.group(2)) + + # commit: short hex revision ID + pieces["short"] = mo.group(3) + + else: + # HEX: no tags + pieces["closest-tag"] = None + count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root) + pieces["distance"] = int(count_out) # total number of commits + + # commit date: see ISO-8601 comment in git_versions_from_keywords() + date = runner(GITS, ["show", "-s", "--format=%%ci", "HEAD"], cwd=root)[0].strip() + # Use only the last line. Previous lines may contain GPG signature + # information. 
+ date = date.splitlines()[-1] + pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) + + return pieces + + +def plus_or_dot(pieces): + """Return a + if we don't already have one, else return a .""" + if "+" in pieces.get("closest-tag", ""): + return "." + return "+" + + +def render_pep440(pieces): + """Build up version string, with post-release "local version identifier". + + Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you + get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty + + Exceptions: + 1: no tags. git_describe was just HEX. 0+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_branch(pieces): + """TAG[[.dev0]+DISTANCE.gHEX[.dirty]] . + + The ".dev0" means not master branch. Note that .dev0 sorts backwards + (a feature branch will appear "older" than the master branch). + + Exceptions: + 1: no tags. 0[.dev0]+untagged.DISTANCE.gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "%%d.g%%s" %% (pieces["distance"], pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0" + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+untagged.%%d.g%%s" %% (pieces["distance"], + pieces["short"]) + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def pep440_split_post(ver): + """Split pep440 version string at the post-release segment. 
+ + Returns the release segments before the post-release and the + post-release version number (or -1 if no post-release segment is present). + """ + vc = str.split(ver, ".post") + return vc[0], int(vc[1] or 0) if len(vc) == 2 else None + + +def render_pep440_pre(pieces): + """TAG[.postN.devDISTANCE] -- No -dirty. + + Exceptions: + 1: no tags. 0.post0.devDISTANCE + """ + if pieces["closest-tag"]: + if pieces["distance"]: + # update the post release segment + tag_version, post_version = pep440_split_post(pieces["closest-tag"]) + rendered = tag_version + if post_version is not None: + rendered += ".post%%d.dev%%d" %% (post_version+1, pieces["distance"]) + else: + rendered += ".post0.dev%%d" %% (pieces["distance"]) + else: + # no commits, use the tag as the version + rendered = pieces["closest-tag"] + else: + # exception #1 + rendered = "0.post0.dev%%d" %% pieces["distance"] + return rendered + + +def render_pep440_post(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX] . + + The ".dev0" means dirty. Note that .dev0 sorts backwards + (a dirty tree will appear "older" than the corresponding clean one), + but you shouldn't be releasing software with -dirty anyways. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + return rendered + + +def render_pep440_post_branch(pieces): + """TAG[.postDISTANCE[.dev0]+gHEX[.dirty]] . + + The ".dev0" means not master branch. + + Exceptions: + 1: no tags. 
0.postDISTANCE[.dev0]+gHEX[.dirty] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += plus_or_dot(pieces) + rendered += "g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["branch"] != "master": + rendered += ".dev0" + rendered += "+g%%s" %% pieces["short"] + if pieces["dirty"]: + rendered += ".dirty" + return rendered + + +def render_pep440_old(pieces): + """TAG[.postDISTANCE[.dev0]] . + + The ".dev0" means dirty. + + Exceptions: + 1: no tags. 0.postDISTANCE[.dev0] + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"] or pieces["dirty"]: + rendered += ".post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + else: + # exception #1 + rendered = "0.post%%d" %% pieces["distance"] + if pieces["dirty"]: + rendered += ".dev0" + return rendered + + +def render_git_describe(pieces): + """TAG[-DISTANCE-gHEX][-dirty]. + + Like 'git describe --tags --dirty --always'. + + Exceptions: + 1: no tags. HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + if pieces["distance"]: + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render_git_describe_long(pieces): + """TAG-DISTANCE-gHEX[-dirty]. + + Like 'git describe --tags --dirty --always -long'. + The distance/hash is unconditional. + + Exceptions: + 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) + """ + if pieces["closest-tag"]: + rendered = pieces["closest-tag"] + rendered += "-%%d-g%%s" %% (pieces["distance"], pieces["short"]) + else: + # exception #1 + rendered = pieces["short"] + if pieces["dirty"]: + rendered += "-dirty" + return rendered + + +def render(pieces, style): + """Render the given version pieces into the requested style.""" + if pieces["error"]: + return {"version": "unknown", + "full-revisionid": pieces.get("long"), + "dirty": None, + "error": pieces["error"], + "date": None} + + if not style or style == "default": + style = "pep440" # the default + + if style == "pep440": + rendered = render_pep440(pieces) + elif style == "pep440-branch": + rendered = render_pep440_branch(pieces) + elif style == "pep440-pre": + rendered = render_pep440_pre(pieces) + elif style == "pep440-post": + rendered = render_pep440_post(pieces) + elif style == "pep440-post-branch": + rendered = render_pep440_post_branch(pieces) + elif style == "pep440-old": + rendered = render_pep440_old(pieces) + elif style == "git-describe": + rendered = render_git_describe(pieces) + elif style == "git-describe-long": + rendered = render_git_describe_long(pieces) + else: + raise ValueError("unknown style '%%s'" %% style) + + return {"version": rendered, "full-revisionid": pieces["long"], + "dirty": pieces["dirty"], "error": None, + "date": pieces.get("date")} + + +def get_versions(): + """Get version information or return default if unable to do so.""" + # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have + # __file__, we can work backwards from there to the root. Some + # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which + # case we can only use expanded keywords. 
@register_vcs_handler("git", "get_keywords")
def git_get_keywords(versionfile_abs):
    """Scrape the git keyword assignments out of a version file.

    The file is read as text and matched with a regular expression rather
    than imported, so it can be used from setup.py without importing
    _version.py. This function is not used from _version.py itself.

    Args:
        versionfile_abs: path of the version file to scan.

    Returns:
        A dict holding whichever of the keys "refnames", "full" and "date"
        were found as double-quoted assignments to ``git_refnames``,
        ``git_full`` and ``git_date``. If the file cannot be read, whatever
        was collected so far (possibly nothing) is returned.
    """
    # (line prefix to look for, key under which the quoted value is stored)
    assignments = (
        ("git_refnames =", "refnames"),
        ("git_full =", "full"),
        ("git_date =", "date"),
    )
    found = {}
    try:
        with open(versionfile_abs, "r") as handle:
            for raw in handle:
                stripped = raw.strip()
                for prefix, key in assignments:
                    if not stripped.startswith(prefix):
                        continue
                    match = re.search(r'=\s*"(.*)"', raw)
                    if match:
                        found[key] = match.group(1)
    except OSError:
        # best effort: an unreadable file simply yields fewer keywords
        pass
    return found
@register_vcs_handler("git", "pieces_from_vcs")
def git_pieces_from_vcs(tag_prefix, root, verbose, runner=run_command):
    """Get version from 'git describe' in the root of the source tree.

    This only gets called if the git-archive 'subst' keywords were *not*
    expanded, and _version.py hasn't already been rewritten with a short
    version string, meaning we're inside a checked out source tree.

    Args:
        tag_prefix: prefix a tag must start with; it is stripped from the
            tag to form "closest-tag".
        root: directory passed as ``cwd`` to every git invocation.
        verbose: when true, print diagnostics.
        runner: callable invoked as ``runner(commands, args, cwd=...,
            hide_stderr=...)`` and unpacked as an ``(output, returncode)``
            pair; an output of ``None`` is treated as failure.

    Returns:
        A "pieces" dict. It always holds "long", "short", "error",
        "branch" and "dirty". When "error" is None it also holds
        "closest-tag" (None if no tag matched), "distance" and "date"
        (None if git printed no commit date).

    Raises:
        NotThisMethod: if ``root`` is not under git control or one of the
            required git commands fails.
    """
    GITS = ["git"]
    TAG_PREFIX_REGEX = "*"
    if sys.platform == "win32":
        GITS = ["git.cmd", "git.exe"]
        TAG_PREFIX_REGEX = r"\*"

    _, rc = runner(GITS, ["rev-parse", "--git-dir"], cwd=root, hide_stderr=True)
    if rc != 0:
        if verbose:
            print("Directory %s not under git control" % root)
        raise NotThisMethod("'git rev-parse --git-dir' returned error")

    # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty]
    # if there isn't one, this yields HEX[-dirty] (no NUM)
    describe_out, rc = runner(
        GITS,
        [
            "describe",
            "--tags",
            "--dirty",
            "--always",
            "--long",
            "--match",
            "%s%s" % (tag_prefix, TAG_PREFIX_REGEX),
        ],
        cwd=root,
    )
    # --long was added in git-1.5.5
    if describe_out is None:
        raise NotThisMethod("'git describe' failed")
    describe_out = describe_out.strip()
    full_out, rc = runner(GITS, ["rev-parse", "HEAD"], cwd=root)
    if full_out is None:
        raise NotThisMethod("'git rev-parse' failed")
    full_out = full_out.strip()

    pieces = {}
    pieces["long"] = full_out
    pieces["short"] = full_out[:7]  # maybe improved later
    pieces["error"] = None

    branch_name, rc = runner(GITS, ["rev-parse", "--abbrev-ref", "HEAD"], cwd=root)
    # --abbrev-ref was added in git-1.6.3
    if rc != 0 or branch_name is None:
        raise NotThisMethod("'git rev-parse --abbrev-ref' returned error")
    branch_name = branch_name.strip()

    if branch_name == "HEAD":
        # If we aren't exactly on a branch, pick a branch which represents
        # the current commit. If all else fails, we are on a branchless
        # commit.
        branches, rc = runner(GITS, ["branch", "--contains"], cwd=root)
        # --contains was added in git-1.5.4
        if rc != 0 or branches is None:
            raise NotThisMethod("'git branch --contains' returned error")
        branches = branches.split("\n")

        # Remove the first line if we're running detached
        if "(" in branches[0]:
            branches.pop(0)

        # Strip off the leading "* " from the list of branches.
        branches = [branch[2:] for branch in branches]
        if "master" in branches:
            branch_name = "master"
        elif not branches:
            branch_name = None
        else:
            # Pick the first branch that is returned. Good or bad.
            branch_name = branches[0]

    pieces["branch"] = branch_name

    # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty]
    # TAG might have hyphens.
    git_describe = describe_out

    # look for -dirty suffix
    dirty = git_describe.endswith("-dirty")
    pieces["dirty"] = dirty
    if dirty:
        git_describe = git_describe[: git_describe.rindex("-dirty")]

    # now we have TAG-NUM-gHEX or HEX

    if "-" in git_describe:
        # TAG-NUM-gHEX
        mo = re.search(r"^(.+)-(\d+)-g([0-9a-f]+)$", git_describe)
        if not mo:
            # unparsable. Maybe git-describe is misbehaving?
            pieces["error"] = "unable to parse git-describe output: '%s'" % describe_out
            return pieces

        # tag
        full_tag = mo.group(1)
        if not full_tag.startswith(tag_prefix):
            if verbose:
                fmt = "tag '%s' doesn't start with prefix '%s'"
                print(fmt % (full_tag, tag_prefix))
            pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
                full_tag,
                tag_prefix,
            )
            return pieces
        pieces["closest-tag"] = full_tag[len(tag_prefix) :]

        # distance: number of commits since tag
        pieces["distance"] = int(mo.group(2))

        # commit: short hex revision ID
        pieces["short"] = mo.group(3)

    else:
        # HEX: no tags
        pieces["closest-tag"] = None
        count_out, rc = runner(GITS, ["rev-list", "HEAD", "--count"], cwd=root)
        # Guard like the other runner calls: int(None) would otherwise
        # escape as a TypeError instead of letting the caller fall back.
        if count_out is None:
            raise NotThisMethod("'git rev-list' failed")
        pieces["distance"] = int(count_out)  # total number of commits

    # commit date: see ISO-8601 comment in git_versions_from_keywords()
    date_out = runner(GITS, ["show", "-s", "--format=%ci", "HEAD"], cwd=root)[0]
    # Use only the last line.  Previous lines may contain GPG signature
    # information.
    date_lines = date_out.strip().splitlines() if date_out else []
    if date_lines:
        date = date_lines[-1]
        pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1)
    else:
        # A missing date is not fatal; the version can still be rendered.
        pieces["date"] = None

    return pieces
def versions_from_file(filename):
    """Try to determine the version from _version.py if present.

    Looks for the JSON payload stored between ``version_json = '''`` and
    ``''' # END VERSION_JSON`` and decodes it.

    Args:
        filename: path of the version file to read.

    Returns:
        The object decoded from the embedded JSON text.

    Raises:
        NotThisMethod: if the file cannot be read or holds no
            ``version_json`` section.
    """
    try:
        with open(filename) as f:
            contents = f.read()
    except OSError as err:
        # keep the underlying I/O error attached for debugging
        raise NotThisMethod("unable to read _version.py") from err
    # "\r?\n" accepts both LF and CRLF after the opening quotes, so one
    # search replaces the former pair of near-identical patterns.
    mo = re.search(
        r"version_json = '''\r?\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
    )
    if not mo:
        raise NotThisMethod("no version_json in _version.py")
    return json.loads(mo.group(1))
def pep440_split_post(ver):
    """Split pep440 version string at the post-release segment.

    Args:
        ver: version string, e.g. "1.2.3" or "1.2.3.post4".

    Returns:
        A ``(release, post)`` tuple. ``release`` is the text before the
        first ".post". ``post`` is the post-release number as an int (0
        when ".post" carries no number), or None when the string does not
        contain exactly one ".post" separator.
    """
    parts = ver.split(".post")
    if len(parts) != 2:
        # no post-release segment (or an ambiguous one): signal with None
        return parts[0], None
    # a bare ".post" with no digits counts as post-release 0
    return parts[0], int(parts[1] or 0)
def render_git_describe(pieces):
    """Render pieces as TAG[-DISTANCE-gHEX][-dirty].

    Like 'git describe --tags --dirty --always'.

    Args:
        pieces: dict providing "closest-tag", "distance", "short" and
            "dirty".

    Returns:
        The tag, followed by "-DISTANCE-gHEX" when the distance is
        non-zero. With no tag, just the short hex id (note: no 'g'
        prefix). "-dirty" is appended in either case when the tree is
        dirty.
    """
    tag = pieces["closest-tag"]
    if not tag:
        # untagged: bare short revision id
        text = pieces["short"]
    elif pieces["distance"]:
        text = tag + "-%d-g%s" % (pieces["distance"], pieces["short"])
    else:
        # exactly on the tag
        text = tag
    return text + "-dirty" if pieces["dirty"] else text
def get_versions(verbose=False):
    """Get the project version from whatever source is available.

    Sources are tried in order and the first one that does not raise
    NotThisMethod wins: expanded VCS keywords in the version file, the
    JSON payload of a rewritten version file, the VCS itself, and finally
    the name of the parent directory.

    Args:
        verbose: print which source produced the version; also enabled
            when the loaded configuration sets ``verbose``.

    Returns:
        A dict with the keys "version", "full-revisionid", "dirty",
        "error" and "date". If every source fails, "version" is
        "0+unknown" and "error" is "unable to compute version".

    Raises:
        AssertionError: if the configuration lacks VCS,
            versionfile_source or tag_prefix, or names an unknown VCS.
    """
    if "versioneer" in sys.modules:
        # see the discussion in cmdclass.py:get_cmdclass()
        del sys.modules["versioneer"]

    root = get_root()
    cfg = get_config_from_root(root)

    assert cfg.VCS is not None, "please set [versioneer]VCS= in setup.cfg"
    handlers = HANDLERS.get(cfg.VCS)
    assert handlers, "unrecognized VCS '%s'" % cfg.VCS
    verbose = verbose or cfg.verbose
    assert (
        cfg.versionfile_source is not None
    ), "please set versioneer.versionfile_source"
    assert cfg.tag_prefix is not None, "please set versioneer.tag_prefix"

    versionfile_abs = os.path.join(root, cfg.versionfile_source)

    # extract version from first of: _version.py, VCS command (e.g. 'git
    # describe'), parentdir. This is meant to work for developers using a
    # source checkout, for users of a tarball created by 'setup.py sdist',
    # and for users of a tarball/zipball created by 'git archive' or github's
    # download-from-tag feature or the equivalent in other VCSes.

    get_keywords_f = handlers.get("get_keywords")
    from_keywords_f = handlers.get("keywords")
    if get_keywords_f and from_keywords_f:
        try:
            keywords = get_keywords_f(versionfile_abs)
            ver = from_keywords_f(keywords, cfg.tag_prefix, verbose)
            if verbose:
                print("got version from expanded keyword %s" % ver)
            return ver
        except NotThisMethod:
            pass

    try:
        ver = versions_from_file(versionfile_abs)
        if verbose:
            print("got version from file %s %s" % (versionfile_abs, ver))
        return ver
    except NotThisMethod:
        pass

    from_vcs_f = handlers.get("pieces_from_vcs")
    if from_vcs_f:
        try:
            pieces = from_vcs_f(cfg.tag_prefix, root, verbose)
            ver = render(pieces, cfg.style)
            if verbose:
                print("got version from VCS %s" % ver)
            return ver
        except NotThisMethod:
            pass

    try:
        if cfg.parentdir_prefix:
            ver = versions_from_parentdir(cfg.parentdir_prefix, root, verbose)
            if verbose:
                print("got version from parentdir %s" % ver)
            return ver
    except NotThisMethod:
        pass

    if verbose:
        print("unable to compute version")

    # every source declined: report an explicit "unknown" result
    return {
        "version": "0+unknown",
        "full-revisionid": None,
        "dirty": None,
        "error": "unable to compute version",
        "date": None,
    }
+ # Also see https://github.com/python-versioneer/python-versioneer/issues/52 + + cmds = {} if cmdclass is None else cmdclass.copy() + + # we add "version" to both distutils and setuptools + from distutils.core import Command + + class cmd_version(Command): + description = "report generated version string" + user_options = [] + boolean_options = [] + + def initialize_options(self): + pass + + def finalize_options(self): + pass + + def run(self): + vers = get_versions(verbose=True) + print("Version: %s" % vers["version"]) + print(" full-revisionid: %s" % vers.get("full-revisionid")) + print(" dirty: %s" % vers.get("dirty")) + print(" date: %s" % vers.get("date")) + if vers["error"]: + print(" error: %s" % vers["error"]) + + cmds["version"] = cmd_version + + # we override "build_py" in both distutils and setuptools + # + # most invocation pathways end up running build_py: + # distutils/build -> build_py + # distutils/install -> distutils/build ->.. + # setuptools/bdist_wheel -> distutils/install ->.. + # setuptools/bdist_egg -> distutils/install_lib -> build_py + # setuptools/install -> bdist_egg ->.. + # setuptools/develop -> ? + # pip install: + # copies source tree to a tempdir before running egg_info/etc + # if .git isn't copied too, 'git describe' will fail + # then does setup.py bdist_wheel, or sometimes setup.py install + # setup.py egg_info -> ? 
+ + # we override different "build_py" commands for both environments + if "build_py" in cmds: + _build_py = cmds["build_py"] + elif "setuptools" in sys.modules: + from setuptools.command.build_py import build_py as _build_py + else: + from distutils.command.build_py import build_py as _build_py + + class cmd_build_py(_build_py): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_py.run(self) + # now locate _version.py in the new build/ directory and replace + # it with an updated value + if cfg.versionfile_build: + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + cmds["build_py"] = cmd_build_py + + if "build_ext" in cmds: + _build_ext = cmds["build_ext"] + elif "setuptools" in sys.modules: + from setuptools.command.build_ext import build_ext as _build_ext + else: + from distutils.command.build_ext import build_ext as _build_ext + + class cmd_build_ext(_build_ext): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + _build_ext.run(self) + if self.inplace: + # build_ext --inplace will only build extensions in + # build/lib<..> dir with no _version.py to write to. + # As in place builds will already have a _version.py + # in the module dir, we do not need to write one. + return + # now locate _version.py in the new build/ directory and replace + # it with an updated value + target_versionfile = os.path.join(self.build_lib, cfg.versionfile_build) + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + cmds["build_ext"] = cmd_build_ext + + if "cx_Freeze" in sys.modules: # cx_freeze enabled? + from cx_Freeze.dist import build_exe as _build_exe + + # nczeczulin reports that py2exe won't like the pep440-style string + # as FILEVERSION, but it can be used for PRODUCTVERSION, e.g. 
+ # setup(console=[{ + # "version": versioneer.get_version().split("+", 1)[0], # FILEVERSION + # "product_version": versioneer.get_version(), + # ... + + class cmd_build_exe(_build_exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _build_exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + cmds["build_exe"] = cmd_build_exe + del cmds["build_py"] + + if "py2exe" in sys.modules: # py2exe enabled? + from py2exe.distutils_buildexe import py2exe as _py2exe + + class cmd_py2exe(_py2exe): + def run(self): + root = get_root() + cfg = get_config_from_root(root) + versions = get_versions() + target_versionfile = cfg.versionfile_source + print("UPDATING %s" % target_versionfile) + write_to_version_file(target_versionfile, versions) + + _py2exe.run(self) + os.unlink(target_versionfile) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + cmds["py2exe"] = cmd_py2exe + + # we override different "sdist" commands for both environments + if "sdist" in cmds: + _sdist = cmds["sdist"] + elif "setuptools" in sys.modules: + from setuptools.command.sdist import sdist as _sdist + else: + from distutils.command.sdist import sdist as _sdist + + class cmd_sdist(_sdist): + def run(self): + versions = get_versions() + self._versioneer_generated_versions = versions + # unless we update this, the command 
will keep using the old + # version + self.distribution.metadata.version = versions["version"] + return _sdist.run(self) + + def make_release_tree(self, base_dir, files): + root = get_root() + cfg = get_config_from_root(root) + _sdist.make_release_tree(self, base_dir, files) + # now locate _version.py in the new base_dir directory + # (remembering that it may be a hardlink) and replace it with an + # updated value + target_versionfile = os.path.join(base_dir, cfg.versionfile_source) + print("UPDATING %s" % target_versionfile) + write_to_version_file( + target_versionfile, self._versioneer_generated_versions + ) + + cmds["sdist"] = cmd_sdist + + return cmds + + +CONFIG_ERROR = """ +setup.cfg is missing the necessary Versioneer configuration. You need +a section like: + + [versioneer] + VCS = git + style = pep440 + versionfile_source = src/myproject/_version.py + versionfile_build = myproject/_version.py + tag_prefix = + parentdir_prefix = myproject- + +You will also need to edit your setup.py to use the results: + + import versioneer + setup(version=versioneer.get_version(), + cmdclass=versioneer.get_cmdclass(), ...) + +Please read the docstring in ./versioneer.py for configuration instructions, +edit setup.cfg, and re-run the installer or 'python versioneer.py setup'. +""" + +SAMPLE_CONFIG = """ +# See the docstring in versioneer.py for instructions. Note that you must +# re-run 'versioneer.py setup' after changing this section, and commit the +# resulting files. + +[versioneer] +#VCS = git +#style = pep440 +#versionfile_source = +#versionfile_build = +#tag_prefix = +#parentdir_prefix = + +""" + +OLD_SNIPPET = """ +from ._version import get_versions +__version__ = get_versions()['version'] +del get_versions +""" + +INIT_PY_SNIPPET = """ +from . 
import {0} +__version__ = {0}.get_versions()['version'] +""" + + +def do_setup(): + """Do main VCS-independent setup function for installing Versioneer.""" + root = get_root() + try: + cfg = get_config_from_root(root) + except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: + if isinstance(e, (OSError, configparser.NoSectionError)): + print("Adding sample versioneer config to setup.cfg", file=sys.stderr) + with open(os.path.join(root, "setup.cfg"), "a") as f: + f.write(SAMPLE_CONFIG) + print(CONFIG_ERROR, file=sys.stderr) + return 1 + + print(" creating %s" % cfg.versionfile_source) + with open(cfg.versionfile_source, "w") as f: + LONG = LONG_VERSION_PY[cfg.VCS] + f.write( + LONG + % { + "DOLLAR": "$", + "STYLE": cfg.style, + "TAG_PREFIX": cfg.tag_prefix, + "PARENTDIR_PREFIX": cfg.parentdir_prefix, + "VERSIONFILE_SOURCE": cfg.versionfile_source, + } + ) + + ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") + if os.path.exists(ipy): + try: + with open(ipy, "r") as f: + old = f.read() + except OSError: + old = "" + module = os.path.splitext(os.path.basename(cfg.versionfile_source))[0] + snippet = INIT_PY_SNIPPET.format(module) + if OLD_SNIPPET in old: + print(" replacing boilerplate in %s" % ipy) + with open(ipy, "w") as f: + f.write(old.replace(OLD_SNIPPET, snippet)) + elif snippet not in old: + print(" appending to %s" % ipy) + with open(ipy, "a") as f: + f.write(snippet) + else: + print(" %s unmodified" % ipy) + else: + print(" %s doesn't exist, ok" % ipy) + ipy = None + + # Make sure both the top-level "versioneer.py" and versionfile_source + # (PKG/_version.py, used by runtime code) are in MANIFEST.in, so + # they'll be copied into source distributions. Pip won't be able to + # install the package without this. 
+ manifest_in = os.path.join(root, "MANIFEST.in") + simple_includes = set() + try: + with open(manifest_in, "r") as f: + for line in f: + if line.startswith("include "): + for include in line.split()[1:]: + simple_includes.add(include) + except OSError: + pass + # That doesn't cover everything MANIFEST.in can do + # (http://docs.python.org/2/distutils/sourcedist.html#commands), so + # it might give some false negatives. Appending redundant 'include' + # lines is safe, though. + if "versioneer.py" not in simple_includes: + print(" appending 'versioneer.py' to MANIFEST.in") + with open(manifest_in, "a") as f: + f.write("include versioneer.py\n") + else: + print(" 'versioneer.py' already in MANIFEST.in") + if cfg.versionfile_source not in simple_includes: + print( + " appending versionfile_source ('%s') to MANIFEST.in" + % cfg.versionfile_source + ) + with open(manifest_in, "a") as f: + f.write("include %s\n" % cfg.versionfile_source) + else: + print(" versionfile_source already in MANIFEST.in") + + # Make VCS-specific changes. For git, this means creating/changing + # .gitattributes to mark _version.py for export-subst keyword + # substitution. + do_vcs_install(manifest_in, cfg.versionfile_source, ipy) + return 0 + + +def scan_setup_py(): + """Validate the contents of setup.py against Versioneer's expectations.""" + found = set() + setters = False + errors = 0 + with open("setup.py", "r") as f: + for line in f.readlines(): + if "import versioneer" in line: + found.add("import") + if "versioneer.get_cmdclass()" in line: + found.add("cmdclass") + if "versioneer.get_version()" in line: + found.add("get_version") + if "versioneer.VCS" in line: + setters = True + if "versioneer.versionfile_source" in line: + setters = True + if len(found) != 3: + print("") + print("Your setup.py appears to be missing some important items") + print("(but I might be wrong). 
Please make sure it has something") + print("roughly like the following:") + print("") + print(" import versioneer") + print(" setup( version=versioneer.get_version(),") + print(" cmdclass=versioneer.get_cmdclass(), ...)") + print("") + errors += 1 + if setters: + print("You should remove lines like 'versioneer.VCS = ' and") + print("'versioneer.versionfile_source = ' . This configuration") + print("now lives in setup.cfg, and should be removed from setup.py") + print("") + errors += 1 + return errors + + +if __name__ == "__main__": + cmd = sys.argv[1] + if cmd == "setup": + errors = do_setup() + errors += scan_setup_py() + if errors: + sys.exit(1) From aa508d9ad906db085e398c90a24fedf695c5db9f Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Wed, 31 May 2023 12:30:28 +0530 Subject: [PATCH 02/39] datahub emitter code modified --- docs/datahub_emitter.md | 1 + docs/flows.md | 6 -- docs/tasks.md | 6 -- example/etl_flow.py | 44 +++++++++ example/load.py | 7 ++ prefect_datahub/__init__.py | 2 +- prefect_datahub/blocks.py | 35 ------- prefect_datahub/datahub_emitter.py | 148 ++++++++++++++++------------- prefect_datahub/flows.py | 26 ----- prefect_datahub/tasks.py | 24 ----- 10 files changed, 133 insertions(+), 166 deletions(-) create mode 100644 docs/datahub_emitter.md delete mode 100644 docs/flows.md delete mode 100644 docs/tasks.md create mode 100644 example/etl_flow.py create mode 100644 example/load.py delete mode 100644 prefect_datahub/blocks.py delete mode 100644 prefect_datahub/flows.py delete mode 100644 prefect_datahub/tasks.py diff --git a/docs/datahub_emitter.md b/docs/datahub_emitter.md new file mode 100644 index 0000000..2fcf05c --- /dev/null +++ b/docs/datahub_emitter.md @@ -0,0 +1 @@ +::: prefect_datahub.datahub_emitter diff --git a/docs/flows.md b/docs/flows.md deleted file mode 100644 index d8621f0..0000000 --- a/docs/flows.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -description: -notes: This documentation page is generated from source file docstrings. 
---- - -::: prefect_datahub.flows \ No newline at end of file diff --git a/docs/tasks.md b/docs/tasks.md deleted file mode 100644 index 90b35df..0000000 --- a/docs/tasks.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -description: -notes: This documentation page is generated from source file docstrings. ---- - -::: prefect_datahub.tasks \ No newline at end of file diff --git a/example/etl_flow.py b/example/etl_flow.py new file mode 100644 index 0000000..baf10ac --- /dev/null +++ b/example/etl_flow.py @@ -0,0 +1,44 @@ +from datahub_provider.entities import Dataset +from prefect import flow, task + +from prefect_datahub import DatahubEmitter + +datahub_emitter = DatahubEmitter.load("datahub-emitter-block") + + +@task(name="Extract", description="Extract the actual data") +def extract(): + data = "This is data" + datahub_emitter.emit_task() + return data + + +@task(description="Transform the actual data") +def transform(actual_data): + actual_data = actual_data.split(" ") + datahub_emitter.emit_task( + inputs=[Dataset("snowflake", "mydb.schema.tableA")], + outputs=[Dataset("snowflake", "mydb.schema.tableC")], + ) + return actual_data + + +@task(name="Load_task", description="Load the actual data") +def load(data): + datahub_emitter.emit_task() + print(data) + + +@flow(log_prints=True) +def etl(): + print("Flow started") + data = extract() + data = transform(data) + load(data) + datahub_emitter.emit_flow() + print("") + + +if __name__ == "__main__": + etl() + print("s") diff --git a/example/load.py b/example/load.py new file mode 100644 index 0000000..e9d235e --- /dev/null +++ b/example/load.py @@ -0,0 +1,7 @@ +from prefect_datahub import DatahubEmitter + +emitter = DatahubEmitter( + datahub_rest_url="http://localhost:8080", capture_tags_info=False +) + +emitter.save("datahub-emitter-block", overwrite=True) diff --git a/prefect_datahub/__init__.py b/prefect_datahub/__init__.py index 185a4c9..6e20479 100644 --- a/prefect_datahub/__init__.py +++ b/prefect_datahub/__init__.py @@ 
-1,4 +1,4 @@ from . import _version -from .blocks import DatahubBlock # noqa +from .datahub_emitter import DatahubEmitter # noqa __version__ = _version.get_versions()["version"] diff --git a/prefect_datahub/blocks.py b/prefect_datahub/blocks.py deleted file mode 100644 index 4064c1d..0000000 --- a/prefect_datahub/blocks.py +++ /dev/null @@ -1,35 +0,0 @@ -"""This is an example blocks module""" - -from prefect.blocks.core import Block -from pydantic import Field - - -class DatahubBlock(Block): - """ - A sample block that holds a value. - - Attributes: - value (str): The value to store. - - Example: - Load a stored value: - ```python - from prefect_datahub import DatahubBlock - block = DatahubBlock.load("BLOCK_NAME") - ``` - """ - - _block_type_name = "datahub" - # replace this with a relevant logo; defaults to Prefect logo - _logo_url = "https://images.ctfassets.net/gm98wzqotmnx/08yCE6xpJMX9Kjl5VArDS/c2ede674c20f90b9b6edeab71feffac9/prefect-200x200.png?h=250" # noqa - _documentation_url = "https://shubhamjagtap639.github.io/prefect-datahub/blocks/#prefect-datahub.blocks.DatahubBlock" # noqa - - value: str = Field("The default value", description="The value to store.") - - @classmethod - def seed_value_for_example(cls): - """ - Seeds the field, value, so the block can be loaded. - """ - block = cls(value="A sample value") - block.save("sample-block", overwrite=True) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index a2f8281..c0170b1 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -1,23 +1,23 @@ """Module for emit metadata to Datahub REST. 
""" -from prefect.blocks.core import Block -from pydantic import Field -from typing import Dict, List, Tuple, Optional import asyncio -from prefect.blocks.core import Block -from prefect.context import FlowRunContext, TaskRunContext +from typing import List, Optional + from datahub.api.entities.datajob import DataFlow, DataJob from datahub.api.entities.dataprocess.dataprocess_instance import ( DataProcessInstance, InstanceRunResult, ) -from datahub_provider.entities import _Entity -from datahub.utilities.urns.dataset_urn import DatasetUrn -from datahub_provider.entities import Dataset -from datahub.emitter.rest_emitter import DataHubRestEmitter +from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.utilities.urns.data_flow_urn import DataFlowUrn from datahub.utilities.urns.data_job_urn import DataJobUrn +from datahub.utilities.urns.dataset_urn import DatasetUrn +from datahub_provider.entities import _Entity +from prefect.blocks.core import Block +from prefect.client.cloud import get_cloud_client from prefect.client.orchestration import get_client +from prefect.context import FlowRunContext, TaskRunContext +from pydantic import Field class DatahubEmitter(Block): @@ -40,7 +40,7 @@ class DatahubEmitter(Block): _block_type_name = "datahub emitter" # replace this with a relevant logo; defaults to Prefect logo _logo_url = "https://images.ctfassets.net/gm98wzqotmnx/08yCE6xpJMX9Kjl5VArDS/c2ede674c20f90b9b6edeab71feffac9/prefect-200x200.png?h=250" # noqa - _documentation_url = "https://GS lab.github.io/prefect-datahub/blocks/#prefect-datahub.blocks.DatahubBlock" # noqa + _documentation_url = "https://shubhamjagtap639.github.io/prefect-datahub/datahub_emitter/#prefect-datahub.datahub_emitter.DatahubEmitter" # noqa datahub_rest_url: Optional[str] = Field( default="http://localhost:8080", @@ -56,92 +56,104 @@ class DatahubEmitter(Block): capture_tags_info: Optional[bool] = Field( default=True, - title="Capture tags infor", + title="Capture tags info", 
description="If true, the tags field of the task and flow will be captured as DataHub tags.", ) - @classmethod - def seed_value_for_example(cls): - """ - Seeds the field, value, so the block can be loaded. - """ - block = cls(value="A sample value") - block.save("sample-block", overwrite=True) - - def _get_config(self) -> Tuple[str, Optional[str], Optional[int]]: - host = "http://localhost:8080" - password = None - timeout_sec = None - return (host, password, timeout_sec) - - def make_emitter(self) -> "DatahubRestEmitter": - import datahub.emitter.rest_emitter - - return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config()) - - async def _get_flow_run_graph(self, flow_run_id): - response = await get_client()._client.get(f"/flow_runs/{flow_run_id}/graph") - return response.json() - - async def _get_task_run(self, task_run_id): - return await get_client().read_task_run(task_run_id) - - def ingest_task(self, inputs: List = None, outputs: List = None): - + graceful_exceptions: Optional[bool] = Field( + default=True, + title="Graceful Exceptions", + description="If set to true, most runtime errors in the emit task or flow will be suppressed and will not cause the overall flow to fail. 
Note that configuration issues will still throw exceptions..", + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.emitter = DatahubRestEmitter(gms_server=self.datahub_rest_url) + self.emitter.test_connection() + self.prefect_client = get_client() + self.prefect_cloud_client = get_cloud_client() + + def _entities_to_urn_list(self, iolets: List[_Entity]) -> List[DatasetUrn]: + return [DatasetUrn.create_from_string(let.urn) for let in iolets] + + def emit_task(self, inputs: List = None, outputs: List = None): flow_run_ctx = FlowRunContext.get() task_run_ctx = TaskRunContext.get() - - emitter = self.make_emitter() + + if flow_run_ctx is None or task_run_ctx is None: + return dataflow_urn = DataFlowUrn.create_from_ids( - orchestrator="prefect", env='prod', flow_id=flow_run_ctx.flow.name + orchestrator="prefect", env=self.cluster, flow_id=flow_run_ctx.flow.name + ) + + datajob = DataJob( + id=task_run_ctx.task.task_key, + flow_urn=dataflow_urn, + name=task_run_ctx.task.name, ) - - datajob = DataJob(id=task_run_ctx.task.task_key, flow_urn=dataflow_urn, name=task_run_ctx.task.name) datajob.description = task_run_ctx.task.description datajob.tags = task_run_ctx.task.tags if inputs is not None: - datajob.inlets.extend(_entities_to_urn_list(inputs)) + datajob.inlets.extend(self._entities_to_urn_list(inputs)) if outputs is not None: - datajob.outlets.extend(_entities_to_urn_list(outputs)) + datajob.outlets.extend(self._entities_to_urn_list(outputs)) + # Add upstrem urns if task_run_ctx.task_run.task_inputs: - task_run_key_map = {str(prefect_future.task_run.id):prefect_future.task_run.task_key for prefect_future in flow_run_ctx.task_run_futures} - for inputs in task_run_ctx.task_run.task_inputs['actual_data']: + task_run_key_map = { + str(prefect_future.task_run.id): prefect_future.task_run.task_key + for prefect_future in flow_run_ctx.task_run_futures + } + for key in task_run_ctx.task_run.task_inputs.keys(): upstream_task_urn = 
DataJobUrn.create_from_ids( - data_flow_urn=str(dataflow_urn), job_id=task_run_key_map[str(inputs.id)] - ) + data_flow_urn=str(dataflow_urn), + job_id=task_run_key_map[ + str(task_run_ctx.task_run.task_inputs[key][0].id) + ], + ) datajob.upstream_urns.extend([upstream_task_urn]) - datajob.emit(emitter) - - def ingest_flow(self): + datajob.emit(self.emitter) + + def emit_flow(self): flow_run_ctx = FlowRunContext.get() - emitter = self.make_emitter() + if flow_run_ctx is None: + return dataflow = DataFlow( - cluster='prod', id=flow_run_ctx.flow.name, orchestrator="prefect" - ) + cluster=self.cluster, id=flow_run_ctx.flow.name, orchestrator="prefect" + ) dataflow.description = flow_run_ctx.flow.description - dataflow.emit(emitter) + dataflow.emit(self.emitter) - dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=flow_run_ctx.flow_run.name) + dpi = DataProcessInstance.from_dataflow( + dataflow=dataflow, id=flow_run_ctx.flow_run.name + ) dpi.emit_process_start( - emitter=emitter, start_timestamp_millis=int(flow_run_ctx.flow_run.start_time.timestamp() * 1000) + emitter=self.emitter, + start_timestamp_millis=int( + flow_run_ctx.flow_run.start_time.timestamp() * 1000 + ), ) - + dpi.emit_process_end( - emitter=emitter, - end_timestamp_millis=int(flow_run_ctx.flow_run.start_time.timestamp() * 1000)+5000, + emitter=self.emitter, + end_timestamp_millis=int( + flow_run_ctx.flow_run.start_time.timestamp() * 1000 + ) + + 5000, result=InstanceRunResult.SUCCESS, result_type="prefect", ) for prefect_future in flow_run_ctx.task_run_futures: - task_run = asyncio.run(self._get_task_run(prefect_future.task_run.id)) + task_run = asyncio.run( + self.prefect_client.read_task_run(prefect_future.task_run.id) + ) datajob = DataJob(id=task_run.task_key, flow_urn=dataflow.urn) - + if task_run.state_name == "Completed": result = InstanceRunResult.SUCCESS elif task_run.state_name == "Failed": @@ -150,9 +162,9 @@ def ingest_flow(self): result = InstanceRunResult.SKIPPED else: raise 
Exception( - f"Result should be either success or failure and it was {ti.state}" + f"Result should be either success or failure and it was {task_run.state_name}" ) - + dpi = DataProcessInstance.from_datajob( datajob=datajob, id=f"{flow_run_ctx.flow_run.name}.{task_run.name}", @@ -160,12 +172,12 @@ def ingest_flow(self): clone_outlets=True, ) dpi.emit_process_start( - emitter=emitter, + emitter=self.emitter, start_timestamp_millis=int(task_run.start_time.timestamp() * 1000), emit_template=False, ) dpi.emit_process_end( - emitter=emitter, + emitter=self.emitter, end_timestamp_millis=int(task_run.end_time.timestamp() * 1000), result=result, result_type="prefect", diff --git a/prefect_datahub/flows.py b/prefect_datahub/flows.py deleted file mode 100644 index 207bdae..0000000 --- a/prefect_datahub/flows.py +++ /dev/null @@ -1,26 +0,0 @@ -"""This is an example flows module""" -from prefect import flow - -from prefect_datahub.blocks import DatahubBlock -from prefect_datahub.tasks import ( - goodbye_prefect_datahub, - hello_prefect_datahub, -) - - -@flow -def hello_and_goodbye(): - """ - Sample flow that says hello and goodbye! - """ - DatahubBlock.seed_value_for_example() - block = DatahubBlock.load("sample-block") - - print(hello_prefect_datahub()) - print(f"The block's value: {block.value}") - print(goodbye_prefect_datahub()) - return "Done" - - -if __name__ == "__main__": - hello_and_goodbye() diff --git a/prefect_datahub/tasks.py b/prefect_datahub/tasks.py deleted file mode 100644 index ba5ca98..0000000 --- a/prefect_datahub/tasks.py +++ /dev/null @@ -1,24 +0,0 @@ -"""This is an example tasks module""" -from prefect import task - - -@task -def hello_prefect_datahub() -> str: - """ - Sample task that says hello! - - Returns: - A greeting for your collection - """ - return "Hello, prefect-datahub!" - - -@task -def goodbye_prefect_datahub() -> str: - """ - Sample task that says goodbye! 
- - Returns: - A farewell for your collection - """ - return "Goodbye, prefect-datahub!" From 3ae50463ef57ea155a5c8d2ab05cc5bbb643317c Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Fri, 2 Jun 2023 11:42:48 +0530 Subject: [PATCH 03/39] Emit workspace added --- example/etl_flow.py | 4 +- example/etl_sub_flow.py | 53 ++++++ prefect_datahub/datahub_emitter.py | 261 +++++++++++++++++++---------- 3 files changed, 225 insertions(+), 93 deletions(-) create mode 100644 example/etl_sub_flow.py diff --git a/example/etl_flow.py b/example/etl_flow.py index baf10ac..0e424d5 100644 --- a/example/etl_flow.py +++ b/example/etl_flow.py @@ -32,8 +32,8 @@ def load(data): @flow(log_prints=True) def etl(): print("Flow started") - data = extract() - data = transform(data) + extract() + data = transform("This is data") load(data) datahub_emitter.emit_flow() print("") diff --git a/example/etl_sub_flow.py b/example/etl_sub_flow.py new file mode 100644 index 0000000..8a57ca8 --- /dev/null +++ b/example/etl_sub_flow.py @@ -0,0 +1,53 @@ +from datahub_provider.entities import Dataset +from prefect import flow, task + +from prefect_datahub import DatahubEmitter + +# datahub_emitter = DatahubEmitter.load("datahub-emitter-block") +datahub_emitter = DatahubEmitter() + + +@task(name="Extract", description="Extract the actual data") +def extract(): + data = "This is data" + datahub_emitter.emit_task() + return data + + +@task(description="Transform the actual data") +def transform(actual_data): + actual_data = actual_data.split(" ") + datahub_emitter.emit_task( + inputs=[Dataset("snowflake", "mydb.schema.tableA")], + outputs=[Dataset("snowflake", "mydb.schema.tableC")], + ) + return actual_data + + +@task(name="Load_task", description="Load the actual data") +def load(data): + # datahub_emitter.emit_task() + print(data) + + +@flow(log_prints=True) +def tl(data): + print("Flow started") + data = transform(data) + load(data) + datahub_emitter.emit_flow() + print("") + + +@flow(log_prints=True) 
+def etl(): + print("Flow started") + data = extract() + tl(data) + datahub_emitter.emit_flow() + print("") + + +if __name__ == "__main__": + etl() + print("s") diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index c0170b1..60f03e1 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -1,25 +1,33 @@ """Module for emit metadata to Datahub REST. """ import asyncio -from typing import List, Optional +from typing import Dict, List, Optional from datahub.api.entities.datajob import DataFlow, DataJob from datahub.api.entities.dataprocess.dataprocess_instance import ( DataProcessInstance, InstanceRunResult, ) +from datahub.emitter.mce_builder import make_user_urn +from datahub.emitter.mcp_builder import PlatformKey, gen_containers from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.utilities.urns.data_flow_urn import DataFlowUrn from datahub.utilities.urns.data_job_urn import DataJobUrn from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub_provider.entities import _Entity +from prefect import get_run_logger from prefect.blocks.core import Block from prefect.client.cloud import get_cloud_client from prefect.client.orchestration import get_client +from prefect.client.schemas import TaskRun from prefect.context import FlowRunContext, TaskRunContext from pydantic import Field +class WorkspaceKey(PlatformKey): + workspace_name: str + + class DatahubEmitter(Block): """ Block used to emit prefect task and flow related metadata to Datahub REST @@ -48,89 +56,89 @@ class DatahubEmitter(Block): description="Datahub gms rest url.", ) - cluster: Optional[str] = Field( + env: Optional[str] = Field( default="prod", - title="Cluster", - description="Name of the prefect cluster.", - ) - - capture_tags_info: Optional[bool] = Field( - default=True, - title="Capture tags info", - description="If true, the tags field of the task and flow will be captured as DataHub tags.", + 
title="Environment", + description="Name of the prefect environment.", ) - graceful_exceptions: Optional[bool] = Field( - default=True, - title="Graceful Exceptions", - description="If set to true, most runtime errors in the emit task or flow will be suppressed and will not cause the overall flow to fail. Note that configuration issues will still throw exceptions..", + platform_instance: Optional[str] = Field( + default=None, + title="Platform instance", + description="Name of the prefect platform instance.", ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.datajob_to_emit = {} self.emitter = DatahubRestEmitter(gms_server=self.datahub_rest_url) self.emitter.test_connection() - self.prefect_client = get_client() - self.prefect_cloud_client = get_cloud_client() + asyncio.run(get_client().api_healthcheck()) def _entities_to_urn_list(self, iolets: List[_Entity]) -> List[DatasetUrn]: return [DatasetUrn.create_from_string(let.urn) for let in iolets] - def emit_task(self, inputs: List = None, outputs: List = None): - flow_run_ctx = FlowRunContext.get() - task_run_ctx = TaskRunContext.get() - - if flow_run_ctx is None or task_run_ctx is None: - return + async def _get_flow_run_graph(self, flow_run_id): + response = await get_client()._client.get(f"/flow_runs/{flow_run_id}/graph") + return response.json() + def generate_datajob( + self, + flow_run_ctx: FlowRunContext, + task_run_ctx: TaskRunContext = None, + task_key: str = None, + ) -> Optional[DataJob]: dataflow_urn = DataFlowUrn.create_from_ids( - orchestrator="prefect", env=self.cluster, flow_id=flow_run_ctx.flow.name - ) - - datajob = DataJob( - id=task_run_ctx.task.task_key, - flow_urn=dataflow_urn, - name=task_run_ctx.task.name, + orchestrator="prefect", + flow_id=flow_run_ctx.flow.name, + env=self.env, + platform_instance=self.platform_instance, ) - datajob.description = task_run_ctx.task.description - datajob.tags = task_run_ctx.task.tags - if inputs is not None: - 
datajob.inlets.extend(self._entities_to_urn_list(inputs)) - if outputs is not None: - datajob.outlets.extend(self._entities_to_urn_list(outputs)) - - # Add upstrem urns - if task_run_ctx.task_run.task_inputs: - task_run_key_map = { - str(prefect_future.task_run.id): prefect_future.task_run.task_key - for prefect_future in flow_run_ctx.task_run_futures - } - for key in task_run_ctx.task_run.task_inputs.keys(): - upstream_task_urn = DataJobUrn.create_from_ids( - data_flow_urn=str(dataflow_urn), - job_id=task_run_key_map[ - str(task_run_ctx.task_run.task_inputs[key][0].id) - ], - ) - datajob.upstream_urns.extend([upstream_task_urn]) - datajob.emit(self.emitter) + if task_run_ctx is not None: + datajob = DataJob( + id=task_run_ctx.task.task_key, + flow_urn=dataflow_urn, + name=task_run_ctx.task.name, + ) - def emit_flow(self): - flow_run_ctx = FlowRunContext.get() + datajob.description = task_run_ctx.task.description + datajob.tags = task_run_ctx.task.tags + job_property_bag: Dict[str, str] = {} - if flow_run_ctx is None: - return + allowed_task_keys = [ + "cache_result_in_memory", + "isasync", + "retries", + "_is_protocol", + "task_key", + ] + for key in allowed_task_keys: + if hasattr(task_run_ctx.task, key): + job_property_bag[key] = repr(getattr(task_run_ctx.task, key)) + datajob.properties = job_property_bag + return datajob + elif task_key is not None: + datajob = DataJob( + id=task_key, + flow_urn=dataflow_urn, + ) + return datajob + return None + def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: dataflow = DataFlow( - cluster=self.cluster, id=flow_run_ctx.flow.name, orchestrator="prefect" + orchestrator="prefect", + id=flow_run_ctx.flow.name, + env=self.env, + platform_instance=self.platform_instance, ) dataflow.description = flow_run_ctx.flow.description - dataflow.emit(self.emitter) + return dataflow + def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: dpi = DataProcessInstance.from_dataflow( 
dataflow=dataflow, id=flow_run_ctx.flow_run.name ) - dpi.emit_process_start( emitter=self.emitter, start_timestamp_millis=int( @@ -138,47 +146,118 @@ def emit_flow(self): ), ) + def run_datajob( + self, datajob: DataJob, flow_run_name: str, task_run: TaskRun + ) -> None: + if task_run.state_name == "Completed": + result = InstanceRunResult.SUCCESS + elif task_run.state_name == "Failed": + result = InstanceRunResult.FAILURE + elif task_run.state_name == "Cancelled": + result = InstanceRunResult.SKIPPED + else: + raise Exception( + f"Result should be either success or failure and it was {task_run.state_name}" + ) + dpi = DataProcessInstance.from_datajob( + datajob=datajob, + id=f"{flow_run_name}.{task_run.name}", + clone_inlets=True, + clone_outlets=True, + ) + dpi.emit_process_start( + emitter=self.emitter, + start_timestamp_millis=int(task_run.start_time.timestamp() * 1000), + emit_template=False, + ) dpi.emit_process_end( emitter=self.emitter, - end_timestamp_millis=int( - flow_run_ctx.flow_run.start_time.timestamp() * 1000 - ) - + 5000, - result=InstanceRunResult.SUCCESS, + end_timestamp_millis=int(task_run.end_time.timestamp() * 1000), + result=result, result_type="prefect", ) - for prefect_future in flow_run_ctx.task_run_futures: - task_run = asyncio.run( - self.prefect_client.read_task_run(prefect_future.task_run.id) + def emit_workspaces(self) -> None: + try: + asyncio.run(get_cloud_client().api_healthcheck()) + except Exception as e: + get_run_logger().debug(str(e)) + return + + workspaces = asyncio.run(get_cloud_client().read_workspaces()) + for workspace in workspaces: + container_key = WorkspaceKey( + workspace_name=workspace.workspace_name, + platform="prefect", + instance=self.platform_instance, + env=self.env, + ) + container_work_units = gen_containers( + container_key=container_key, + name=workspace.workspace_name, + sub_types=["Workspace"], + description=workspace.workspace_description, + owner_urn=make_user_urn(workspace.account_name), ) - datajob 
= DataJob(id=task_run.task_key, flow_urn=dataflow.urn) - - if task_run.state_name == "Completed": - result = InstanceRunResult.SUCCESS - elif task_run.state_name == "Failed": - result = InstanceRunResult.FAILURE - elif task_run.state_name == "Cancelled": - result = InstanceRunResult.SKIPPED + for workunit in container_work_units: + self.emitter.emit(workunit.metadata) + + def emit_task(self, inputs: List = None, outputs: List = None): + flow_run_ctx = FlowRunContext.get() + task_run_ctx = TaskRunContext.get() + assert flow_run_ctx + assert task_run_ctx + + datajob = self.generate_datajob( + flow_run_ctx=flow_run_ctx, task_run_ctx=task_run_ctx + ) + if inputs is not None: + datajob.inlets.extend(self._entities_to_urn_list(inputs)) + if outputs is not None: + datajob.outlets.extend(self._entities_to_urn_list(outputs)) + self.datajob_to_emit[str(datajob.urn)] = datajob + + def emit_flow(self): + flow_run_ctx = FlowRunContext.get() + assert flow_run_ctx + # Emit flow + dataflow = self.generate_dataflow(flow_run_ctx=flow_run_ctx) + dataflow.emit(self.emitter) + + # Emit task, task run and add upstream task if present for each task + graph_json = asyncio.run( + self._get_flow_run_graph(str(flow_run_ctx.flow_run.id)) + ) + task_run_key_map = { + str(prefect_future.task_run.id): prefect_future.task_run.task_key + for prefect_future in flow_run_ctx.task_run_futures + } + for node in graph_json: + task_run = asyncio.run(get_client().read_task_run(node["id"])) + # Emit task + datajob_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(dataflow.urn), + job_id=task_run.task_key, + ) + if str(datajob_urn) in self.datajob_to_emit: + datajob = self.datajob_to_emit[str(datajob_urn)] else: - raise Exception( - f"Result should be either success or failure and it was {task_run.state_name}" + datajob = self.generate_datajob( + flow_run_ctx=flow_run_ctx, task_key=task_run.task_key + ) + # Add upstrem urns + for each in node["upstream_dependencies"]: + upstream_task_urn = 
DataJobUrn.create_from_ids( + data_flow_urn=str(dataflow.urn), + job_id=task_run_key_map[each["id"]], ) + datajob.upstream_urns.extend([upstream_task_urn]) + datajob.emit(self.emitter) - dpi = DataProcessInstance.from_datajob( + self.run_datajob( datajob=datajob, - id=f"{flow_run_ctx.flow_run.name}.{task_run.name}", - clone_inlets=True, - clone_outlets=True, - ) - dpi.emit_process_start( - emitter=self.emitter, - start_timestamp_millis=int(task_run.start_time.timestamp() * 1000), - emit_template=False, - ) - dpi.emit_process_end( - emitter=self.emitter, - end_timestamp_millis=int(task_run.end_time.timestamp() * 1000), - result=result, - result_type="prefect", + flow_run_name=flow_run_ctx.flow_run.name, + task_run=task_run, ) + + self.emit_workspaces() From 8d059600cfa54e5ca78b4e14dde3ac073a2ca1b4 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Fri, 2 Jun 2023 18:07:07 +0530 Subject: [PATCH 04/39] dataflow, datajob and data process instance properties added --- prefect_datahub/datahub_emitter.py | 91 +++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 13 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 60f03e1..010b5b4 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -73,7 +73,6 @@ def __init__(self, *args, **kwargs): self.datajob_to_emit = {} self.emitter = DatahubRestEmitter(gms_server=self.datahub_rest_url) self.emitter.test_connection() - asyncio.run(get_client().api_healthcheck()) def _entities_to_urn_list(self, iolets: List[_Entity]) -> List[DatasetUrn]: return [DatasetUrn.create_from_string(let.urn) for let in iolets] @@ -103,17 +102,22 @@ def generate_datajob( datajob.description = task_run_ctx.task.description datajob.tags = task_run_ctx.task.tags - job_property_bag: Dict[str, str] = {} + job_property_bag: Dict[str, str] = {} allowed_task_keys = [ - "cache_result_in_memory", - "isasync", + "version", + "cache_expiration", + "task_run_name", 
"retries", - "_is_protocol", + "timeout_seconds", + "log_prints", + "refresh_cache", "task_key", + "on_completion", + "on_failure", ] for key in allowed_task_keys: - if hasattr(task_run_ctx.task, key): + if hasattr(task_run_ctx.task, key) and getattr(task_run_ctx.task, key) is not None: job_property_bag[key] = repr(getattr(task_run_ctx.task, key)) datajob.properties = job_property_bag return datajob @@ -126,19 +130,61 @@ def generate_datajob( return None def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: + flow = asyncio.run(get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id)) dataflow = DataFlow( orchestrator="prefect", id=flow_run_ctx.flow.name, env=self.env, + name=flow_run_ctx.flow.name, platform_instance=self.platform_instance, ) dataflow.description = flow_run_ctx.flow.description + dataflow.tags = flow.tags + flow_property_bag: Dict[str, str] = {} + flow_property_bag['id'] = str(flow.id) + flow_property_bag['created'] = str(flow.created) + flow_property_bag['updated'] = str(flow.updated) + + allowed_flow_keys = [ + "version", + "flow_run_name", + "retries", + "task_runner", + "timeout_seconds", + "persist_result", + "log_prints", + "on_completion", + "on_failure", + "on_cancellation", + "on_crashed", + ] + for key in allowed_flow_keys: + if hasattr(flow_run_ctx.flow, key) and getattr(flow_run_ctx.flow, key) is not None: + flow_property_bag[key] = repr(getattr(flow_run_ctx.flow, key)) + dataflow.properties = flow_property_bag + return dataflow def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: + flow_run = asyncio.run(get_client().read_flow_run(flow_run_id=flow_run_ctx.flow_run.id)) dpi = DataProcessInstance.from_dataflow( dataflow=dataflow, id=flow_run_ctx.flow_run.name ) + + dpi_property_bag: Dict[str, str] = {} + dpi_property_bag["id"] = str(flow_run.id) + dpi_property_bag["created"] = str(flow_run.created) + dpi_property_bag["created_by"] = str(flow_run.created_by) + 
dpi_property_bag["auto_scheduled"] = str(flow_run.auto_scheduled) + dpi_property_bag["estimated_run_time"] = str(flow_run.estimated_run_time) + dpi_property_bag["start_time"] = str(flow_run.start_time) + dpi_property_bag["total_run_time"] = str(flow_run.total_run_time) + dpi_property_bag["next_scheduled_start_time"] = str(flow_run.next_scheduled_start_time) + dpi_property_bag["tags"] = str(flow_run.tags) + dpi_property_bag["updated"] = str(flow_run.updated) + dpi_property_bag["run_count"] = str(flow_run.run_count) + dpi.properties.update(dpi_property_bag) + dpi.emit_process_start( emitter=self.emitter, start_timestamp_millis=int( @@ -149,6 +195,28 @@ def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None def run_datajob( self, datajob: DataJob, flow_run_name: str, task_run: TaskRun ) -> None: + dpi = DataProcessInstance.from_datajob( + datajob=datajob, + id=f"{flow_run_name}.{task_run.name}", + clone_inlets=True, + clone_outlets=True, + ) + + dpi_property_bag: Dict[str, str] = {} + dpi_property_bag["id"] = str(task_run.id) + dpi_property_bag["flow_run_id"] = str(task_run.flow_run_id) + dpi_property_bag["created"] = str(task_run.created) + dpi_property_bag["estimated_run_time"] = str(task_run.estimated_run_time) + dpi_property_bag["expected_start_time"] = str(task_run.expected_start_time) + dpi_property_bag["start_time"] = str(task_run.start_time) + dpi_property_bag["end_time"] = str(task_run.end_time) + dpi_property_bag["total_run_time"] = str(task_run.total_run_time) + dpi_property_bag["next_scheduled_start_time"] = str(task_run.next_scheduled_start_time) + dpi_property_bag["tags"] = str(task_run.tags) + dpi_property_bag["updated"] = str(task_run.updated) + dpi_property_bag["run_count"] = str(task_run.run_count) + dpi.properties.update(dpi_property_bag) + if task_run.state_name == "Completed": result = InstanceRunResult.SUCCESS elif task_run.state_name == "Failed": @@ -159,17 +227,13 @@ def run_datajob( raise Exception( f"Result should be 
either success or failure and it was {task_run.state_name}" ) - dpi = DataProcessInstance.from_datajob( - datajob=datajob, - id=f"{flow_run_name}.{task_run.name}", - clone_inlets=True, - clone_outlets=True, - ) + dpi.emit_process_start( emitter=self.emitter, start_timestamp_millis=int(task_run.start_time.timestamp() * 1000), emit_template=False, ) + dpi.emit_process_end( emitter=self.emitter, end_timestamp_millis=int(task_run.end_time.timestamp() * 1000), @@ -181,7 +245,7 @@ def emit_workspaces(self) -> None: try: asyncio.run(get_cloud_client().api_healthcheck()) except Exception as e: - get_run_logger().debug(str(e)) + get_run_logger().info("Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'.") return workspaces = asyncio.run(get_cloud_client().read_workspaces()) @@ -254,6 +318,7 @@ def emit_flow(self): datajob.upstream_urns.extend([upstream_task_urn]) datajob.emit(self.emitter) + self.run_dataflow(dataflow, flow_run_ctx) self.run_datajob( datajob=datajob, flow_run_name=flow_run_ctx.flow_run.name, From e5da5a75e38bd286b010c9531d73e2ed82a7a8ba Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Fri, 2 Jun 2023 18:10:40 +0530 Subject: [PATCH 05/39] datahub emitter file reformatted --- prefect_datahub/datahub_emitter.py | 50 ++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 010b5b4..d842cb6 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -102,7 +102,7 @@ def generate_datajob( datajob.description = task_run_ctx.task.description datajob.tags = task_run_ctx.task.tags - job_property_bag: Dict[str, str] = {} + job_property_bag: Dict[str, str] = {} allowed_task_keys = [ "version", @@ -117,7 +117,10 @@ def generate_datajob( "on_failure", ] for key in allowed_task_keys: - if hasattr(task_run_ctx.task, key) and getattr(task_run_ctx.task, key) is not None: + if ( + hasattr(task_run_ctx.task, key) + and 
getattr(task_run_ctx.task, key) is not None + ): job_property_bag[key] = repr(getattr(task_run_ctx.task, key)) datajob.properties = job_property_bag return datajob @@ -130,7 +133,9 @@ def generate_datajob( return None def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: - flow = asyncio.run(get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id)) + flow = asyncio.run( + get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id) + ) dataflow = DataFlow( orchestrator="prefect", id=flow_run_ctx.flow.name, @@ -141,9 +146,9 @@ def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: dataflow.description = flow_run_ctx.flow.description dataflow.tags = flow.tags flow_property_bag: Dict[str, str] = {} - flow_property_bag['id'] = str(flow.id) - flow_property_bag['created'] = str(flow.created) - flow_property_bag['updated'] = str(flow.updated) + flow_property_bag["id"] = str(flow.id) + flow_property_bag["created"] = str(flow.created) + flow_property_bag["updated"] = str(flow.updated) allowed_flow_keys = [ "version", @@ -159,14 +164,19 @@ def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: "on_crashed", ] for key in allowed_flow_keys: - if hasattr(flow_run_ctx.flow, key) and getattr(flow_run_ctx.flow, key) is not None: + if ( + hasattr(flow_run_ctx.flow, key) + and getattr(flow_run_ctx.flow, key) is not None + ): flow_property_bag[key] = repr(getattr(flow_run_ctx.flow, key)) dataflow.properties = flow_property_bag - + return dataflow def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: - flow_run = asyncio.run(get_client().read_flow_run(flow_run_id=flow_run_ctx.flow_run.id)) + flow_run = asyncio.run( + get_client().read_flow_run(flow_run_id=flow_run_ctx.flow_run.id) + ) dpi = DataProcessInstance.from_dataflow( dataflow=dataflow, id=flow_run_ctx.flow_run.name ) @@ -179,12 +189,14 @@ def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None 
dpi_property_bag["estimated_run_time"] = str(flow_run.estimated_run_time) dpi_property_bag["start_time"] = str(flow_run.start_time) dpi_property_bag["total_run_time"] = str(flow_run.total_run_time) - dpi_property_bag["next_scheduled_start_time"] = str(flow_run.next_scheduled_start_time) + dpi_property_bag["next_scheduled_start_time"] = str( + flow_run.next_scheduled_start_time + ) dpi_property_bag["tags"] = str(flow_run.tags) dpi_property_bag["updated"] = str(flow_run.updated) dpi_property_bag["run_count"] = str(flow_run.run_count) dpi.properties.update(dpi_property_bag) - + dpi.emit_process_start( emitter=self.emitter, start_timestamp_millis=int( @@ -201,7 +213,7 @@ def run_datajob( clone_inlets=True, clone_outlets=True, ) - + dpi_property_bag: Dict[str, str] = {} dpi_property_bag["id"] = str(task_run.id) dpi_property_bag["flow_run_id"] = str(task_run.flow_run_id) @@ -211,12 +223,14 @@ def run_datajob( dpi_property_bag["start_time"] = str(task_run.start_time) dpi_property_bag["end_time"] = str(task_run.end_time) dpi_property_bag["total_run_time"] = str(task_run.total_run_time) - dpi_property_bag["next_scheduled_start_time"] = str(task_run.next_scheduled_start_time) + dpi_property_bag["next_scheduled_start_time"] = str( + task_run.next_scheduled_start_time + ) dpi_property_bag["tags"] = str(task_run.tags) dpi_property_bag["updated"] = str(task_run.updated) dpi_property_bag["run_count"] = str(task_run.run_count) dpi.properties.update(dpi_property_bag) - + if task_run.state_name == "Completed": result = InstanceRunResult.SUCCESS elif task_run.state_name == "Failed": @@ -227,13 +241,13 @@ def run_datajob( raise Exception( f"Result should be either success or failure and it was {task_run.state_name}" ) - + dpi.emit_process_start( emitter=self.emitter, start_timestamp_millis=int(task_run.start_time.timestamp() * 1000), emit_template=False, ) - + dpi.emit_process_end( emitter=self.emitter, end_timestamp_millis=int(task_run.end_time.timestamp() * 1000), @@ -245,7 +259,9 
@@ def emit_workspaces(self) -> None: try: asyncio.run(get_cloud_client().api_healthcheck()) except Exception as e: - get_run_logger().info("Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'.") + get_run_logger().info( + "Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'." + ) return workspaces = asyncio.run(get_cloud_client().read_workspaces()) From 0cae683cd838fee3355e19a7d8fb5d0447379ec4 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Wed, 7 Jun 2023 20:13:22 +0530 Subject: [PATCH 06/39] test cases for emit task, flow, and workspaces added --- example/etl_flow.py | 1 + example/etl_sub_flow.py | 5 +- prefect_datahub/datahub_emitter.py | 69 ++-- tests/conftest.py | 484 ++++++++++++++++++++++++++++- tests/test_datahub_emitter.py | 258 +++++++++++++++ tests/test_flows.py | 6 - tests/test_tasks.py | 24 -- 7 files changed, 763 insertions(+), 84 deletions(-) create mode 100644 tests/test_datahub_emitter.py delete mode 100644 tests/test_flows.py delete mode 100644 tests/test_tasks.py diff --git a/example/etl_flow.py b/example/etl_flow.py index 0e424d5..19f1dda 100644 --- a/example/etl_flow.py +++ b/example/etl_flow.py @@ -36,6 +36,7 @@ def etl(): data = transform("This is data") load(data) datahub_emitter.emit_flow() + datahub_emitter.emit_workspaces() print("") diff --git a/example/etl_sub_flow.py b/example/etl_sub_flow.py index 8a57ca8..e779f76 100644 --- a/example/etl_sub_flow.py +++ b/example/etl_sub_flow.py @@ -3,8 +3,7 @@ from prefect_datahub import DatahubEmitter -# datahub_emitter = DatahubEmitter.load("datahub-emitter-block") -datahub_emitter = DatahubEmitter() +datahub_emitter = DatahubEmitter.load("datahub-emitter-block") @task(name="Extract", description="Extract the actual data") @@ -26,7 +25,7 @@ def transform(actual_data): @task(name="Load_task", description="Load the actual data") def load(data): - # datahub_emitter.emit_task() + datahub_emitter.emit_task() print(data) diff --git a/prefect_datahub/datahub_emitter.py 
b/prefect_datahub/datahub_emitter.py index d842cb6..4428326 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -17,8 +17,7 @@ from datahub_provider.entities import _Entity from prefect import get_run_logger from prefect.blocks.core import Block -from prefect.client.cloud import get_cloud_client -from prefect.client.orchestration import get_client +from prefect.client import cloud, orchestration from prefect.client.schemas import TaskRun from prefect.context import FlowRunContext, TaskRunContext from pydantic import Field @@ -78,7 +77,9 @@ def _entities_to_urn_list(self, iolets: List[_Entity]) -> List[DatasetUrn]: return [DatasetUrn.create_from_string(let.urn) for let in iolets] async def _get_flow_run_graph(self, flow_run_id): - response = await get_client()._client.get(f"/flow_runs/{flow_run_id}/graph") + response = await orchestration.get_client()._client.get( + f"/flow_runs/{flow_run_id}/graph" + ) return response.json() def generate_datajob( @@ -134,7 +135,7 @@ def generate_datajob( def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: flow = asyncio.run( - get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id) + orchestration.get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id) ) dataflow = DataFlow( orchestrator="prefect", @@ -175,7 +176,9 @@ def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: flow_run = asyncio.run( - get_client().read_flow_run(flow_run_id=flow_run_ctx.flow_run.id) + orchestration.get_client().read_flow_run( + flow_run_id=flow_run_ctx.flow_run.id + ) ) dpi = DataProcessInstance.from_dataflow( dataflow=dataflow, id=flow_run_ctx.flow_run.name @@ -255,33 +258,6 @@ def run_datajob( result_type="prefect", ) - def emit_workspaces(self) -> None: - try: - asyncio.run(get_cloud_client().api_healthcheck()) - except Exception as e: - get_run_logger().info( - "Cannot emit 
workspaces. Please set correct 'PREFECT_API_KEY'." - ) - return - - workspaces = asyncio.run(get_cloud_client().read_workspaces()) - for workspace in workspaces: - container_key = WorkspaceKey( - workspace_name=workspace.workspace_name, - platform="prefect", - instance=self.platform_instance, - env=self.env, - ) - container_work_units = gen_containers( - container_key=container_key, - name=workspace.workspace_name, - sub_types=["Workspace"], - description=workspace.workspace_description, - owner_urn=make_user_urn(workspace.account_name), - ) - for workunit in container_work_units: - self.emitter.emit(workunit.metadata) - def emit_task(self, inputs: List = None, outputs: List = None): flow_run_ctx = FlowRunContext.get() task_run_ctx = TaskRunContext.get() @@ -313,7 +289,7 @@ def emit_flow(self): for prefect_future in flow_run_ctx.task_run_futures } for node in graph_json: - task_run = asyncio.run(get_client().read_task_run(node["id"])) + task_run = asyncio.run(orchestration.get_client().read_task_run(node["id"])) # Emit task datajob_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), @@ -334,11 +310,34 @@ def emit_flow(self): datajob.upstream_urns.extend([upstream_task_urn]) datajob.emit(self.emitter) - self.run_dataflow(dataflow, flow_run_ctx) self.run_datajob( datajob=datajob, flow_run_name=flow_run_ctx.flow_run.name, task_run=task_run, ) - self.emit_workspaces() + def emit_workspaces(self) -> None: + try: + asyncio.run(cloud.get_cloud_client().api_healthcheck()) + except Exception as e: + get_run_logger().info( + "Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'." 
+ ) + return + workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) + for workspace in workspaces: + container_key = WorkspaceKey( + workspace_name=workspace.workspace_name, + platform="prefect", + instance=self.platform_instance, + env=self.env, + ) + container_work_units = gen_containers( + container_key=container_key, + name=workspace.workspace_name, + sub_types=["Workspace"], + description=workspace.workspace_description, + owner_urn=make_user_urn(workspace.account_name), + ) + for workunit in container_work_units: + self.emitter.emit(workunit.metadata) diff --git a/tests/conftest.py b/tests/conftest.py index ca7cae7..1119afd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,22 +1,474 @@ +import asyncio +import json +from unittest.mock import MagicMock, patch +from uuid import UUID + import pytest -from prefect.testing.utilities import prefect_test_harness +from prefect.client.schemas import FlowRun, TaskRun, Workspace +from prefect.futures import PrefectFuture +from prefect.server.schemas.core import Flow +from requests.models import Response + +mock_transform_task_json = { + "name": "transform", + "description": "Transform the actual data", + "task_key": "__main__.transform", + "tags": [], +} +mock_extract_task_run_json = { + "id": "fa14a52b-d271-4c41-99cb-6b42ca7c070b", + "created": "2023-06-06T05:51:54.822707+00:00", + "updated": "2023-06-06T05:51:55.126000+00:00", + "name": "Extract-0", + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_key": "__main__.extract", + "dynamic_key": "0", + "cache_key": None, + "cache_expiration": None, + "task_version": None, + "empirical_policy": { + "max_retries": 0, + "retry_delay_seconds": 0.0, + "retries": 0, + "retry_delay": 0, + "retry_jitter_factor": None, + }, + "tags": [], + "state_id": "e280decd-2cc8-4428-a70f-149bcaf95b3c", + "task_inputs": {}, + "state_type": "COMPLETED", + "state_name": "Completed", + "run_count": 1, + "flow_run_run_count": 1, + "expected_start_time": 
"2023-06-06T05:51:54.822183+00:00", + "next_scheduled_start_time": None, + "start_time": "2023-06-06T05:51:55.016264+00:00", + "end_time": "2023-06-06T05:51:55.096534+00:00", + "total_run_time": 0.08027, + "estimated_run_time": 0.08027, + "estimated_start_time_delta": 0.194081, + "state": { + "id": "e280decd-2cc8-4428-a70f-149bcaf95b3c", + "type": "COMPLETED", + "name": "Completed", + "timestamp": "2023-06-06T05:51:55.096534+00:00", + "message": None, + "data": {"type": "unpersisted"}, + "state_details": { + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_run_id": "fa14a52b-d271-4c41-99cb-6b42ca7c070b", + "child_flow_run_id": None, + "scheduled_time": None, + "cache_key": None, + "cache_expiration": None, + "untrackable_result": False, + "pause_timeout": None, + "pause_reschedule": False, + "pause_key": None, + "refresh_cache": None, + }, + }, +} +mock_transform_task_run_json = { + "id": "dd15ee83-5d28-4bf1-804f-f84eab9f9fb7", + "created": "2023-06-06T05:51:55.160372+00:00", + "updated": "2023-06-06T05:51:55.358000+00:00", + "name": "transform-0", + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_key": "__main__.transform", + "dynamic_key": "0", + "cache_key": None, + "cache_expiration": None, + "task_version": None, + "empirical_policy": { + "max_retries": 0, + "retry_delay_seconds": 0.0, + "retries": 0, + "retry_delay": 0, + "retry_jitter_factor": None, + }, + "tags": [], + "state_id": "971ad82e-6e5f-4691-abab-c900358e96c2", + "task_inputs": { + "actual_data": [ + {"input_type": "task_run", "id": "fa14a52b-d271-4c41-99cb-6b42ca7c070b"} + ] + }, + "state_type": "COMPLETED", + "state_name": "Completed", + "run_count": 1, + "flow_run_run_count": 1, + "expected_start_time": "2023-06-06T05:51:55.159416+00:00", + "next_scheduled_start_time": None, + "start_time": "2023-06-06T05:51:55.243159+00:00", + "end_time": "2023-06-06T05:51:55.332950+00:00", + "total_run_time": 0.089791, + "estimated_run_time": 0.089791, + 
"estimated_start_time_delta": 0.083743, + "state": { + "id": "971ad82e-6e5f-4691-abab-c900358e96c2", + "type": "COMPLETED", + "name": "Completed", + "timestamp": "2023-06-06T05:51:55.332950+00:00", + "message": None, + "data": {"type": "unpersisted"}, + "state_details": { + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_run_id": "dd15ee83-5d28-4bf1-804f-f84eab9f9fb7", + "child_flow_run_id": None, + "scheduled_time": None, + "cache_key": None, + "cache_expiration": None, + "untrackable_result": False, + "pause_timeout": None, + "pause_reschedule": False, + "pause_key": None, + "refresh_cache": None, + }, + }, +} +mock_load_task_run_json = { + "id": "f19f83ea-316f-4781-8cbe-1d5d8719afc3", + "created": "2023-06-06T05:51:55.389823+00:00", + "updated": "2023-06-06T05:51:55.566000+00:00", + "name": "Load_task-0", + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_key": "__main__.load", + "dynamic_key": "0", + "cache_key": None, + "cache_expiration": None, + "task_version": None, + "empirical_policy": { + "max_retries": 0, + "retry_delay_seconds": 0.0, + "retries": 0, + "retry_delay": 0, + "retry_jitter_factor": None, + }, + "tags": [], + "state_id": "0cad13c8-84e4-4bcf-8616-c5904e10dcb4", + "task_inputs": { + "data": [ + {"input_type": "task_run", "id": "dd15ee83-5d28-4bf1-804f-f84eab9f9fb7"} + ] + }, + "state_type": "COMPLETED", + "state_name": "Completed", + "run_count": 1, + "flow_run_run_count": 1, + "expected_start_time": "2023-06-06T05:51:55.389075+00:00", + "next_scheduled_start_time": None, + "start_time": "2023-06-06T05:51:55.461812+00:00", + "end_time": "2023-06-06T05:51:55.535954+00:00", + "total_run_time": 0.074142, + "estimated_run_time": 0.074142, + "estimated_start_time_delta": 0.072737, + "state": { + "id": "0cad13c8-84e4-4bcf-8616-c5904e10dcb4", + "type": "COMPLETED", + "name": "Completed", + "timestamp": "2023-06-06T05:51:55.535954+00:00", + "message": None, + "data": {"type": "unpersisted"}, + "state_details": { + 
"flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_run_id": "f19f83ea-316f-4781-8cbe-1d5d8719afc3", + "child_flow_run_id": None, + "scheduled_time": None, + "cache_key": None, + "cache_expiration": None, + "untrackable_result": True, + "pause_timeout": None, + "pause_reschedule": False, + "pause_key": None, + "refresh_cache": None, + }, + }, +} +mock_flow_json = { + "id": "cc65498f-d950-4114-8cc1-7af9e8fdf91b", + "created": "2023-06-02T12:31:10.988697+00:00", + "updated": "2023-06-02T12:31:10.988710+00:00", + "name": "etl", + "description": "Extract transform load flow", + "tags": [], +} +mock_flow_run_json = { + "id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "created": "2023-06-06T05:51:54.544266+00:00", + "updated": "2023-06-06T05:51:55.622000+00:00", + "name": "olivine-beagle", + "flow_id": "cc65498f-d950-4114-8cc1-7af9e8fdf91b", + "state_id": "ca2db325-d98f-40e7-862e-449cd0cc9a6e", + "deployment_id": None, + "work_queue_name": None, + "flow_version": "3ba54dfa31a7c9af4161aa4cd020a527", + "parameters": {}, + "idempotency_key": None, + "context": {}, + "empirical_policy": { + "max_retries": 0, + "retry_delay_seconds": 0.0, + "retries": 0, + "retry_delay": 0, + "pause_keys": [], + "resuming": False, + }, + "tags": [], + "parent_task_run_id": None, + "state_type": "COMPLETED", + "state_name": "Completed", + "run_count": 1, + "expected_start_time": "2023-06-06T05:51:54.543357+00:00", + "next_scheduled_start_time": None, + "start_time": "2023-06-06T05:51:54.750523+00:00", + "end_time": "2023-06-06T05:51:55.596446+00:00", + "total_run_time": 0.845923, + "estimated_run_time": 0.845923, + "estimated_start_time_delta": 0.207166, + "auto_scheduled": False, + "infrastructure_document_id": None, + "infrastructure_pid": None, + "created_by": None, + "work_pool_name": None, + "state": { + "id": "ca2db325-d98f-40e7-862e-449cd0cc9a6e", + "type": "COMPLETED", + "name": "Completed", + "timestamp": "2023-06-06T05:51:55.596446+00:00", + "message": "All states 
completed.", + "data": {"type": "unpersisted"}, + "state_details": { + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_run_id": None, + "child_flow_run_id": None, + "scheduled_time": None, + "cache_key": None, + "cache_expiration": None, + "untrackable_result": False, + "pause_timeout": None, + "pause_reschedule": False, + "pause_key": None, + "refresh_cache": None, + }, + }, +} +mock_graph_json = [ + { + "id": "fa14a52b-d271-4c41-99cb-6b42ca7c070b", + "name": "Extract-0", + "upstream_dependencies": [], + "state": { + "id": "e280decd-2cc8-4428-a70f-149bcaf95b3c", + "type": "COMPLETED", + "name": "Completed", + "timestamp": "2023-06-06T05:51:55.096534+00:00", + "message": None, + "data": {"type": "unpersisted"}, + "state_details": { + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_run_id": "fa14a52b-d271-4c41-99cb-6b42ca7c070b", + "child_flow_run_id": None, + "scheduled_time": None, + "cache_key": None, + "cache_expiration": None, + "untrackable_result": False, + "pause_timeout": None, + "pause_reschedule": False, + "pause_key": None, + "refresh_cache": None, + }, + }, + "expected_start_time": "2023-06-06T05:51:54.822183+00:00", + "start_time": "2023-06-06T05:51:55.016264+00:00", + "end_time": "2023-06-06T05:51:55.096534+00:00", + "total_run_time": 0.08027, + "estimated_run_time": 0.08027, + "untrackable_result": False, + }, + { + "id": "f19f83ea-316f-4781-8cbe-1d5d8719afc3", + "name": "Load_task-0", + "upstream_dependencies": [ + {"input_type": "task_run", "id": "dd15ee83-5d28-4bf1-804f-f84eab9f9fb7"} + ], + "state": { + "id": "0cad13c8-84e4-4bcf-8616-c5904e10dcb4", + "type": "COMPLETED", + "name": "Completed", + "timestamp": "2023-06-06T05:51:55.535954+00:00", + "message": None, + "data": {"type": "unpersisted"}, + "state_details": { + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_run_id": "f19f83ea-316f-4781-8cbe-1d5d8719afc3", + "child_flow_run_id": None, + "scheduled_time": None, + "cache_key": None, + 
"cache_expiration": None, + "untrackable_result": True, + "pause_timeout": None, + "pause_reschedule": False, + "pause_key": None, + "refresh_cache": None, + }, + }, + "expected_start_time": "2023-06-06T05:51:55.389075+00:00", + "start_time": "2023-06-06T05:51:55.461812+00:00", + "end_time": "2023-06-06T05:51:55.535954+00:00", + "total_run_time": 0.074142, + "estimated_run_time": 0.074142, + "untrackable_result": True, + }, + { + "id": "dd15ee83-5d28-4bf1-804f-f84eab9f9fb7", + "name": "transform-0", + "upstream_dependencies": [ + {"input_type": "task_run", "id": "fa14a52b-d271-4c41-99cb-6b42ca7c070b"} + ], + "state": { + "id": "971ad82e-6e5f-4691-abab-c900358e96c2", + "type": "COMPLETED", + "name": "Completed", + "timestamp": "2023-06-06T05:51:55.332950+00:00", + "message": None, + "data": {"type": "unpersisted"}, + "state_details": { + "flow_run_id": "c3b947e5-3fa1-4b46-a2e2-58d50c938f2e", + "task_run_id": "dd15ee83-5d28-4bf1-804f-f84eab9f9fb7", + "child_flow_run_id": None, + "scheduled_time": None, + "cache_key": None, + "cache_expiration": None, + "untrackable_result": False, + "pause_timeout": None, + "pause_reschedule": False, + "pause_key": None, + "refresh_cache": None, + }, + }, + "expected_start_time": "2023-06-06T05:51:55.159416+00:00", + "start_time": "2023-06-06T05:51:55.243159+00:00", + "end_time": "2023-06-06T05:51:55.332950+00:00", + "total_run_time": 0.089791, + "estimated_run_time": 0.089791, + "untrackable_result": False, + }, +] +mock_workspace_json = { + "account_id": "33e98cfe-ad06-4ceb-a500-c11148499f75", + "account_name": "shubhamjagtapgslabcom", + "account_handle": "shubhamjagtapgslabcom", + "workspace_id": "157eb822-1b3b-4338-ae80-98edd5d00cb9", + "workspace_name": "datahub", + "workspace_description": "", + "workspace_handle": "datahub", +} + + +async def mock_task_run_future(): + extract_prefect_future = PrefectFuture( + name=mock_extract_task_run_json["name"], + key=UUID("4552629a-ac04-4590-b286-27642292739f"), + task_runner=None, + ) + 
extract_prefect_future.task_run = TaskRun.parse_obj(mock_extract_task_run_json) + transform_prefect_future = PrefectFuture( + name=mock_transform_task_run_json["name"], + key=UUID("40fff3e5-5ef4-4b8b-9cc8-786f91bcc656"), + task_runner=None, + ) + transform_prefect_future.task_run = TaskRun.parse_obj(mock_transform_task_run_json) + load_prefect_future = PrefectFuture( + name=mock_load_task_run_json["name"], + key=UUID("7565f596-9eb0-4330-ba34-963e7839883e"), + task_runner=None, + ) + load_prefect_future.task_run = TaskRun.parse_obj(mock_load_task_run_json) + return [extract_prefect_future, transform_prefect_future, load_prefect_future] + + +@pytest.fixture(scope="module") +def mock_run_context(): + + task_run_ctx = MagicMock() + task_run_ctx.task.task_key = mock_transform_task_json["task_key"] + task_run_ctx.task.name = mock_transform_task_json["name"] + task_run_ctx.task.description = mock_transform_task_json["description"] + task_run_ctx.task.tags = mock_transform_task_json["tags"] + + flow_run_ctx = MagicMock() + flow_run_ctx.flow.name = mock_flow_json["name"] + flow_run_ctx.flow.description = mock_flow_json["description"] + flow_run_obj = FlowRun.parse_obj(mock_flow_run_json) + flow_run_ctx.flow_run.id = flow_run_obj.id + flow_run_ctx.flow_run.name = flow_run_obj.name + flow_run_ctx.flow_run.flow_id = flow_run_obj.flow_id + flow_run_ctx.flow_run.start_time = flow_run_obj.start_time + flow_run_ctx.task_run_futures = asyncio.run(mock_task_run_future()) + + with patch( + "prefect_datahub.datahub_emitter.TaskRunContext" + ) as mock_task_run_ctx, patch( + "prefect_datahub.datahub_emitter.FlowRunContext" + ) as mock_flow_run_ctx: + mock_task_run_ctx.get.return_value = task_run_ctx + mock_flow_run_ctx.get.return_value = flow_run_ctx + yield (task_run_ctx, flow_run_ctx) + + +async def mock_task_run(*args, **kwargs): + if args[0] == "fa14a52b-d271-4c41-99cb-6b42ca7c070b": + return TaskRun.parse_obj(mock_extract_task_run_json) + elif args[0] == 
"dd15ee83-5d28-4bf1-804f-f84eab9f9fb7": + return TaskRun.parse_obj(mock_transform_task_run_json) + elif args[0] == "f19f83ea-316f-4781-8cbe-1d5d8719afc3": + return TaskRun.parse_obj(mock_load_task_run_json) + return None + + +async def mock_flow(*args, **kwargs): + return Flow.parse_obj(mock_flow_json) + + +async def mock_flow_run(*args, **kwargs): + return FlowRun.parse_obj(mock_flow_run_json) + + +async def mock_flow_run_graph(*args, **kwargs): + response = Response() + response.status_code = 200 + response._content = json.dumps(mock_graph_json, separators=(",", ":")).encode( + "utf-8" + ) + return response + + +async def mock_api_healthcheck(*args, **kwargs): + return None + +async def mock_read_workspaces(*args, **kwargs): + return [Workspace.parse_obj(mock_workspace_json)] -@pytest.fixture(scope="session", autouse=True) -def prefect_db(): - """ - Sets up test harness for temporary DB during test runs. - """ - with prefect_test_harness(): - yield +@pytest.fixture(scope="module") +def mock_prefect_client(): + prefect_client_mock = MagicMock() + prefect_client_mock.read_flow.side_effect = mock_flow + prefect_client_mock.read_flow_run.side_effect = mock_flow_run + prefect_client_mock.read_task_run.side_effect = mock_task_run + prefect_client_mock._client.get.side_effect = mock_flow_run_graph + with patch("prefect_datahub.datahub_emitter.orchestration") as mock_client: + mock_client.get_client.return_value = prefect_client_mock + yield prefect_client_mock -@pytest.fixture(autouse=True) -def reset_object_registry(): - """ - Ensures each test has a clean object registry. 
- """ - from prefect.context import PrefectObjectRegistry - with PrefectObjectRegistry(): - yield +@pytest.fixture(scope="module") +def mock_prefect_cloud_client(): + prefect_cloud_client_mock = MagicMock() + prefect_cloud_client_mock.api_healthcheck.side_effect = mock_api_healthcheck + prefect_cloud_client_mock.read_workspaces.side_effect = mock_read_workspaces + with patch("prefect_datahub.datahub_emitter.cloud") as mock_client: + mock_client.get_cloud_client.return_value = prefect_cloud_client_mock + yield prefect_cloud_client_mock diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py new file mode 100644 index 0000000..d1acfd5 --- /dev/null +++ b/tests/test_datahub_emitter.py @@ -0,0 +1,258 @@ +from unittest.mock import Mock, patch + +from datahub.api.entities.datajob import DataJob +from datahub_provider.entities import Dataset +from prefect.context import FlowRunContext, TaskRunContext + +from prefect_datahub.datahub_emitter import DatahubEmitter + + +@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) +def test_emit_task(mock_emit, mock_run_context): + mock_emitter = Mock() + mock_emit.return_value = mock_emitter + + datahub_emitter = DatahubEmitter() + inputs = [Dataset("snowflake", "mydb.schema.tableA")] + outputs = [Dataset("snowflake", "mydb.schema.tableC")] + datahub_emitter.emit_task( + inputs=inputs, + outputs=outputs, + ) + + task_run_ctx: TaskRunContext = mock_run_context[0] + flow_run_ctx: FlowRunContext = mock_run_context[1] + + expected_datajob_urn = f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),{task_run_ctx.task.task_key})" + + assert expected_datajob_urn in datahub_emitter.datajob_to_emit.keys() + actual_datajob = datahub_emitter.datajob_to_emit[expected_datajob_urn] + assert isinstance(actual_datajob, DataJob) + assert str(actual_datajob.flow_urn) == "urn:li:dataFlow:(prefect,etl,prod)" + assert actual_datajob.name == task_run_ctx.task.name + assert 
actual_datajob.description == task_run_ctx.task.description + assert actual_datajob.tags == task_run_ctx.task.tags + assert ( + str(actual_datajob.inlets[0]) + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)" + ) + assert ( + str(actual_datajob.outlets[0]) + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ) + assert mock_emit.emit.call_count == 0 + + +@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) +def test_emit_flow(mock_emit, mock_run_context, mock_prefect_client): + mock_emitter = Mock() + mock_emit.return_value = mock_emitter + + datahub_emitter = DatahubEmitter() + datahub_emitter.emit_flow() + + flow_run_ctx: FlowRunContext = mock_run_context[1] + + expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" + + assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" + assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[2].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[2].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[3].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[3].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInfo" + assert ( + mock_emitter.method_calls[4].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + ) + assert mock_emitter.method_calls[5].args[0].aspectName == "dataJobInputOutput" + assert ( + mock_emitter.method_calls[5].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + ) + assert mock_emitter.method_calls[6].args[0].aspectName == "ownership" + assert ( + mock_emitter.method_calls[6].args[0].entityUrn + == 
f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + ) + assert mock_emitter.method_calls[7].args[0].aspectName == "globalTags" + assert ( + mock_emitter.method_calls[7].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + ) + assert ( + mock_emitter.method_calls[8].args[0].aspectName + == "dataProcessInstanceProperties" + ) + assert ( + mock_emitter.method_calls[8].args[0].entityUrn + == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + ) + assert ( + mock_emitter.method_calls[9].args[0].aspectName + == "dataProcessInstanceRelationships" + ) + assert ( + mock_emitter.method_calls[9].args[0].entityUrn + == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + ) + assert ( + mock_emitter.method_calls[10].args[0].aspectName + == "dataProcessInstanceRunEvent" + ) + assert ( + mock_emitter.method_calls[10].args[0].entityUrn + == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + ) + assert ( + mock_emitter.method_calls[11].args[0].aspectName + == "dataProcessInstanceRunEvent" + ) + assert ( + mock_emitter.method_calls[11].args[0].entityUrn + == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + ) + assert mock_emitter.method_calls[12].args[0].aspectName == "dataJobInfo" + assert ( + mock_emitter.method_calls[12].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + ) + assert mock_emitter.method_calls[13].args[0].aspectName == "dataJobInputOutput" + assert ( + mock_emitter.method_calls[13].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + ) + assert mock_emitter.method_calls[14].args[0].aspectName == "ownership" + assert ( + mock_emitter.method_calls[14].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + ) + assert 
mock_emitter.method_calls[15].args[0].aspectName == "globalTags" + assert ( + mock_emitter.method_calls[15].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + ) + assert ( + mock_emitter.method_calls[16].args[0].aspectName + == "dataProcessInstanceProperties" + ) + assert ( + mock_emitter.method_calls[16].args[0].entityUrn + == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + ) + assert ( + mock_emitter.method_calls[17].args[0].aspectName + == "dataProcessInstanceRelationships" + ) + assert ( + mock_emitter.method_calls[17].args[0].entityUrn + == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + ) + assert ( + mock_emitter.method_calls[18].args[0].aspectName + == "dataProcessInstanceRunEvent" + ) + assert ( + mock_emitter.method_calls[18].args[0].entityUrn + == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + ) + assert ( + mock_emitter.method_calls[19].args[0].aspectName + == "dataProcessInstanceRunEvent" + ) + assert ( + mock_emitter.method_calls[19].args[0].entityUrn + == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + ) + assert mock_emitter.method_calls[20].args[0].aspectName == "dataJobInfo" + assert ( + mock_emitter.method_calls[20].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + ) + assert mock_emitter.method_calls[21].args[0].aspectName == "dataJobInputOutput" + assert ( + mock_emitter.method_calls[21].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + ) + assert mock_emitter.method_calls[22].args[0].aspectName == "ownership" + assert ( + mock_emitter.method_calls[22].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + ) + assert mock_emitter.method_calls[23].args[0].aspectName == "globalTags" + assert ( + 
mock_emitter.method_calls[23].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + ) + assert ( + mock_emitter.method_calls[24].args[0].aspectName + == "dataProcessInstanceProperties" + ) + assert ( + mock_emitter.method_calls[24].args[0].entityUrn + == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + ) + assert ( + mock_emitter.method_calls[25].args[0].aspectName + == "dataProcessInstanceRelationships" + ) + assert ( + mock_emitter.method_calls[25].args[0].entityUrn + == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + ) + assert ( + mock_emitter.method_calls[26].args[0].aspectName + == "dataProcessInstanceRunEvent" + ) + assert ( + mock_emitter.method_calls[26].args[0].entityUrn + == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + ) + assert ( + mock_emitter.method_calls[27].args[0].aspectName + == "dataProcessInstanceRunEvent" + ) + assert ( + mock_emitter.method_calls[27].args[0].entityUrn + == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + ) + + +@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) +def test_emit_workspace(mock_emit, mock_prefect_cloud_client): + mock_emitter = Mock() + mock_emit.return_value = mock_emitter + + datahub_emitter = DatahubEmitter() + datahub_emitter.emit_workspaces() + + assert mock_emitter.method_calls[1].args[0].aspectName == "containerProperties" + assert ( + mock_emitter.method_calls[1].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert mock_emitter.method_calls[2].args[0].aspectName == "status" + assert ( + mock_emitter.method_calls[2].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert mock_emitter.method_calls[3].args[0].aspectName == "dataPlatformInstance" + assert ( + mock_emitter.method_calls[3].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert 
mock_emitter.method_calls[4].args[0].aspectName == "subTypes" + assert ( + mock_emitter.method_calls[4].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert mock_emitter.method_calls[5].args[0].aspectName == "ownership" + assert ( + mock_emitter.method_calls[5].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert ( + mock_emitter.method_calls[5].args[0].aspect.owners[0].owner + == "urn:li:corpuser:shubhamjagtapgslabcom" + ) diff --git a/tests/test_flows.py b/tests/test_flows.py deleted file mode 100644 index e5d8021..0000000 --- a/tests/test_flows.py +++ /dev/null @@ -1,6 +0,0 @@ -from prefect_datahub.flows import hello_and_goodbye - - -def test_hello_and_goodbye_flow(): - result = hello_and_goodbye() - assert result == "Done" diff --git a/tests/test_tasks.py b/tests/test_tasks.py deleted file mode 100644 index 71e5b46..0000000 --- a/tests/test_tasks.py +++ /dev/null @@ -1,24 +0,0 @@ -from prefect import flow - -from prefect_datahub.tasks import ( - goodbye_prefect_datahub, - hello_prefect_datahub, -) - - -def test_hello_prefect_datahub(): - @flow - def test_flow(): - return hello_prefect_datahub() - - result = test_flow() - assert result == "Hello, prefect-datahub!" - - -def goodbye_hello_prefect_datahub(): - @flow - def test_flow(): - return goodbye_prefect_datahub() - - result = test_flow() - assert result == "Goodbye, prefect-datahub!" 
From 76333915be49558d4ac31d3e3aa8e5393f8e14b1 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Fri, 9 Jun 2023 15:12:10 +0530 Subject: [PATCH 07/39] Constants.py added and file reformatted --- example/etl_flow.py | 3 +- mkdocs.yml | 6 +- prefect_datahub/__init__.py | 1 - prefect_datahub/constants.py | 47 ++++ prefect_datahub/datahub_emitter.py | 350 ++++++++++++++++++++--------- requirements-dev.txt | 3 +- requirements.txt | 1 + tests/conftest.py | 1 - tests/test_datahub_emitter.py | 41 ++-- 9 files changed, 321 insertions(+), 132 deletions(-) create mode 100644 prefect_datahub/constants.py diff --git a/example/etl_flow.py b/example/etl_flow.py index 19f1dda..de421e9 100644 --- a/example/etl_flow.py +++ b/example/etl_flow.py @@ -29,14 +29,13 @@ def load(data): print(data) -@flow(log_prints=True) +@flow(name="ETL flow", description="Extract transform load flow") def etl(): print("Flow started") extract() data = transform("This is data") load(data) datahub_emitter.emit_flow() - datahub_emitter.emit_workspaces() print("") diff --git a/mkdocs.yml b/mkdocs.yml index 327cd06..1187ed1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -73,10 +73,8 @@ watch: nav: - Home: index.md + - Datahub Emitter: datahub_emitter.md - Blocks Catalog: blocks_catalog.md - - Examples Catalog: examples_catalog.md - - API Reference: - - Tasks: tasks.md - - Flows: flows.md + - Examples Catalog: examples_catalog.md diff --git a/prefect_datahub/__init__.py b/prefect_datahub/__init__.py index 6e20479..4d52a61 100644 --- a/prefect_datahub/__init__.py +++ b/prefect_datahub/__init__.py @@ -1,4 +1,3 @@ from . 
import _version -from .datahub_emitter import DatahubEmitter # noqa __version__ = _version.get_versions()["version"] diff --git a/prefect_datahub/constants.py b/prefect_datahub/constants.py new file mode 100644 index 0000000..d73ecca --- /dev/null +++ b/prefect_datahub/constants.py @@ -0,0 +1,47 @@ +ORCHESTRATOR = "prefect" + +# Flow and task common constants +VERSION = "version" +RETRIES = "retries" +TIMEOUT_SECONDS = "timeout_seconds" +LOG_PRINTS = "log_prints" +ON_COMPLETION = "on_completion" +ON_FAILURE = "on_failure" + +# Flow constants +FLOW_RUN_NAME = "flow_run_name" +TASK_RUNNER = "task_runner" +PERSIST_RESULT = "persist_result" +ON_CANCELLATION = "on_cancellation" +ON_CRASHED = "on_crashed" + +# Task constants +CACHE_EXPIRATION = "cache_expiration" +TASK_RUN_NAME = "task_run_name" +REFRESH_CACHE = "refresh_cache" +TASK_KEY = "task_key" + +# Flow run and task run common constants +ID = "id" +CREATED = "created" +UPDATED = "updated" +TAGS = "tags" +ESTIMATED_RUN_TIME = "estimated_run_time" +START_TIME = "start_time" +END_TIME = "end_time" +TOTAL_RUN_TIME = "total_run_time" +NEXT_SCHEDULED_START_TIME = "next_scheduled_start_time" + +# Fask run constants +CREATED_BY = "created_by" +AUTO_SCHEDULED = "auto_scheduled" + +# Task run constants +FLOW_RUN_ID = "flow_run_id" +RUN_COUNT = "run_count" +UPSTREAM_DEPENDENCIES = "upstream_dependencies" + +# States constants +COMPLETE = "Completed" +FAILED = "Failed" +CANCELLED = "Cancelled" diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 4428326..df62ea6 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -1,4 +1,4 @@ -"""Module for emit metadata to Datahub REST. """ +"""Datahub Emitter classes used to emit prefect metadata to Datahub REST. 
""" import asyncio from typing import Dict, List, Optional @@ -22,6 +22,8 @@ from prefect.context import FlowRunContext, TaskRunContext from pydantic import Field +from prefect_datahub import constants + class WorkspaceKey(PlatformKey): workspace_name: str @@ -32,9 +34,13 @@ class DatahubEmitter(Block): Block used to emit prefect task and flow related metadata to Datahub REST Attributes: - datahub_rest_url (str): The value to store. - cluster (str): The value to store. - capture_tags_info (boolean): The value to store. + datahub_rest_url (str): The Datahub GMS Rest URL. + env (str): The environment that all assets produced by this orchestrator \ + belong to. For more detail and possible values refer \ + https://datahubproject.io/docs/graphql/enums/#fabrictype. + platform_instance (str): The instance of the platform that all assets \ + produced by this recipe belong to. For more detail please refer to \ + https://datahubproject.io/docs/platform-instances/. Example: Load a stored value: @@ -58,18 +64,22 @@ class DatahubEmitter(Block): env: Optional[str] = Field( default="prod", title="Environment", - description="Name of the prefect environment.", + description="The environment that all assets produced by this orchestrator " + "belong to. For more detail and possible values refer " + "https://datahubproject.io/docs/graphql/enums/#fabrictype.", ) platform_instance: Optional[str] = Field( default=None, title="Platform instance", - description="Name of the prefect platform instance.", + description="The instance of the platform that all assets produced by this " + "recipe belong to. 
For more detail please refer to " + "https://datahubproject.io/docs/platform-instances/.", ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.datajob_to_emit = {} + self.datajobs_to_emit = {} self.emitter = DatahubRestEmitter(gms_server=self.datahub_rest_url) self.emitter.test_connection() @@ -82,14 +92,26 @@ async def _get_flow_run_graph(self, flow_run_id): ) return response.json() - def generate_datajob( + def _generate_datajob( self, flow_run_ctx: FlowRunContext, - task_run_ctx: TaskRunContext = None, - task_key: str = None, + task_run_ctx: Optional[TaskRunContext] = None, + task_key: Optional[str] = None, ) -> Optional[DataJob]: + """ + Create datajob entity using task run ctx and flow run ctx. + Assign description, tags, and properties to created datajob. + + Args: + flow_run_ctx: The prefect current running flow run context. + task_run_ctx: The prefect current running task run context. + task_key: The task key. + + Returns: + The datajob entity. + """ dataflow_urn = DataFlowUrn.create_from_ids( - orchestrator="prefect", + orchestrator=constants.ORCHESTRATOR, flow_id=flow_run_ctx.flow.name, env=self.env, platform_instance=self.platform_instance, @@ -106,16 +128,16 @@ def generate_datajob( job_property_bag: Dict[str, str] = {} allowed_task_keys = [ - "version", - "cache_expiration", - "task_run_name", - "retries", - "timeout_seconds", - "log_prints", - "refresh_cache", - "task_key", - "on_completion", - "on_failure", + constants.VERSION, + constants.CACHE_EXPIRATION, + constants.TASK_RUN_NAME, + constants.RETRIES, + constants.TIMEOUT_SECONDS, + constants.LOG_PRINTS, + constants.REFRESH_CACHE, + constants.TASK_KEY, + constants.ON_COMPLETION, + constants.ON_FAILURE, ] for key in allowed_task_keys: if ( @@ -133,13 +155,24 @@ def generate_datajob( return datajob return None - def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: + def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: + """ + Create 
dataflow entity using flow run ctx. + Assign description, tags, and properties to created dataflow. + + Args: + flow_run_ctx: The prefect current running flow run context. + + Returns: + The dataflow entity. + """ flow = asyncio.run( orchestration.get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id) ) dataflow = DataFlow( - orchestrator="prefect", + orchestrator=constants.ORCHESTRATOR, id=flow_run_ctx.flow.name, + cluster=self.env, env=self.env, name=flow_run_ctx.flow.name, platform_instance=self.platform_instance, @@ -147,22 +180,22 @@ def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: dataflow.description = flow_run_ctx.flow.description dataflow.tags = flow.tags flow_property_bag: Dict[str, str] = {} - flow_property_bag["id"] = str(flow.id) - flow_property_bag["created"] = str(flow.created) - flow_property_bag["updated"] = str(flow.updated) + flow_property_bag[constants.ID] = str(flow.id) + flow_property_bag[constants.CREATED] = str(flow.created) + flow_property_bag[constants.UPDATED] = str(flow.updated) allowed_flow_keys = [ - "version", - "flow_run_name", - "retries", - "task_runner", - "timeout_seconds", - "persist_result", - "log_prints", - "on_completion", - "on_failure", - "on_cancellation", - "on_crashed", + constants.VERSION, + constants.FLOW_RUN_NAME, + constants.RETRIES, + constants.TASK_RUNNER, + constants.TIMEOUT_SECONDS, + constants.PERSIST_RESULT, + constants.LOG_PRINTS, + constants.ON_COMPLETION, + constants.ON_FAILURE, + constants.ON_CANCELLATION, + constants.ON_CRASHED, ] for key in allowed_flow_keys: if ( @@ -174,7 +207,16 @@ def generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: return dataflow - def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: + def _run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: + """ + Emit prefect flow run to datahub rest. 
Prefect flow run get mapped with datahub + data process instance entity which get's generate from provided dataflow entity. + Assign flow run properties to data process instance properties. + + Args: + dataflow: The datahub dataflow entity used to create data process instance. + flow_run_ctx: The prefect current running flow run context. + """ flow_run = asyncio.run( orchestration.get_client().read_flow_run( flow_run_id=flow_run_ctx.flow_run.id @@ -185,19 +227,22 @@ def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None ) dpi_property_bag: Dict[str, str] = {} - dpi_property_bag["id"] = str(flow_run.id) - dpi_property_bag["created"] = str(flow_run.created) - dpi_property_bag["created_by"] = str(flow_run.created_by) - dpi_property_bag["auto_scheduled"] = str(flow_run.auto_scheduled) - dpi_property_bag["estimated_run_time"] = str(flow_run.estimated_run_time) - dpi_property_bag["start_time"] = str(flow_run.start_time) - dpi_property_bag["total_run_time"] = str(flow_run.total_run_time) - dpi_property_bag["next_scheduled_start_time"] = str( - flow_run.next_scheduled_start_time - ) - dpi_property_bag["tags"] = str(flow_run.tags) - dpi_property_bag["updated"] = str(flow_run.updated) - dpi_property_bag["run_count"] = str(flow_run.run_count) + allowed_flow_run_keys = [ + constants.ID, + constants.CREATED, + constants.UPDATED, + constants.CREATED_BY, + constants.AUTO_SCHEDULED, + constants.ESTIMATED_RUN_TIME, + constants.START_TIME, + constants.TOTAL_RUN_TIME, + constants.NEXT_SCHEDULED_START_TIME, + constants.TAGS, + constants.RUN_COUNT, + ] + for key in allowed_flow_run_keys: + if hasattr(flow_run, key) and getattr(flow_run, key) is not None: + dpi_property_bag[key] = str(getattr(flow_run, key)) dpi.properties.update(dpi_property_bag) dpi.emit_process_start( @@ -207,9 +252,19 @@ def run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None ), ) - def run_datajob( + def _run_datajob( self, datajob: DataJob, flow_run_name: str, 
task_run: TaskRun ) -> None: + """ + Emit prefect task run to datahub rest. Prefect task run get mapped with datahub + data process instance entity which get's generate from provided datajob entity. + Assign task run properties to data process instance properties. + + Args: + datajob: The datahub datajob entity used to create data process instance. + flow_run_name: The prefect current running flow run name. + task_run: The prefect task run entity. + """ dpi = DataProcessInstance.from_datajob( datajob=datajob, id=f"{flow_run_name}.{task_run.name}", @@ -218,31 +273,34 @@ def run_datajob( ) dpi_property_bag: Dict[str, str] = {} - dpi_property_bag["id"] = str(task_run.id) - dpi_property_bag["flow_run_id"] = str(task_run.flow_run_id) - dpi_property_bag["created"] = str(task_run.created) - dpi_property_bag["estimated_run_time"] = str(task_run.estimated_run_time) - dpi_property_bag["expected_start_time"] = str(task_run.expected_start_time) - dpi_property_bag["start_time"] = str(task_run.start_time) - dpi_property_bag["end_time"] = str(task_run.end_time) - dpi_property_bag["total_run_time"] = str(task_run.total_run_time) - dpi_property_bag["next_scheduled_start_time"] = str( - task_run.next_scheduled_start_time - ) - dpi_property_bag["tags"] = str(task_run.tags) - dpi_property_bag["updated"] = str(task_run.updated) - dpi_property_bag["run_count"] = str(task_run.run_count) + allowed_task_run_keys = [ + constants.ID, + constants.FLOW_RUN_ID, + constants.CREATED, + constants.UPDATED, + constants.ESTIMATED_RUN_TIME, + constants.START_TIME, + constants.END_TIME, + constants.TOTAL_RUN_TIME, + constants.NEXT_SCHEDULED_START_TIME, + constants.TAGS, + constants.RUN_COUNT, + ] + for key in allowed_task_run_keys: + if hasattr(task_run, key) and getattr(task_run, key) is not None: + dpi_property_bag[key] = str(getattr(task_run, key)) dpi.properties.update(dpi_property_bag) - if task_run.state_name == "Completed": + if task_run.state_name == constants.COMPLETE: result = 
InstanceRunResult.SUCCESS - elif task_run.state_name == "Failed": + elif task_run.state_name == constants.FAILED: result = InstanceRunResult.FAILURE - elif task_run.state_name == "Cancelled": + elif task_run.state_name == constants.CANCELLED: result = InstanceRunResult.SKIPPED else: raise Exception( - f"Result should be either success or failure and it was {task_run.state_name}" + f"Result should be either success or failure and it was " + f"{task_run.state_name}" ) dpi.emit_process_start( @@ -255,29 +313,126 @@ def run_datajob( emitter=self.emitter, end_timestamp_millis=int(task_run.end_time.timestamp() * 1000), result=result, - result_type="prefect", + result_type=constants.ORCHESTRATOR, ) - def emit_task(self, inputs: List = None, outputs: List = None): + def _emit_workspaces(self) -> None: + """ + Emit prefect workspace metadata to datahub rest. + Prefect workspce get mapped with datahub container entity. + Workspace account name also get emit as owner of continer. + """ + try: + asyncio.run(cloud.get_cloud_client().api_healthcheck()) + except Exception: + get_run_logger().info( + "Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'." + ) + return + SUB_TYPE = "Workspace" + workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) + for workspace in workspaces: + container_key = WorkspaceKey( + workspace_name=workspace.workspace_name, + platform=constants.ORCHESTRATOR, + instance=self.platform_instance, + env=self.env, + ) + container_work_units = gen_containers( + container_key=container_key, + name=workspace.workspace_name, + sub_types=[SUB_TYPE], + description=workspace.workspace_description, + owner_urn=make_user_urn(workspace.account_name), + ) + for workunit in container_work_units: + self.emitter.emit(workunit.metadata) + + def emit_task( + self, + inputs: Optional[List[_Entity]] = None, + outputs: Optional[List[_Entity]] = None, + ) -> None: + """ + Emit prefect task metadata to datahub rest. 
Prefect task get mapped with datahub + datajob entity. Assign provided inputs and outputs as datajob inlets and outlets + respectively. To emit task metadata it is compulsory to emit flow as well + otherwise task will not get emit. + + Args: + inputs (list): The list of task inputs. + outputs (list): The list of task outputs. + + Example: + Emit the task metadata as show below: + ```python + from datahub_provider.entities import Dataset + from prefect import flow, task + + from prefect_datahub import DatahubEmitter + + datahub_emitter = DatahubEmitter.load("MY_BLOCK_NAME") + + @task(name="Transform", description="Transform the data") + def transform(data): + data = data.split(" ") + datahub_emitter.emit_task( + inputs=[Dataset("snowflake", "mydb.schema.tableA")], + outputs=[Dataset("snowflake", "mydb.schema.tableC")], + ) + return data + + @flow(name="ETL flow", description="Extract transform load flow") + def etl(): + data = transform("This is data") + datahub_emitter.emit_flow() + ``` + """ flow_run_ctx = FlowRunContext.get() task_run_ctx = TaskRunContext.get() assert flow_run_ctx assert task_run_ctx - datajob = self.generate_datajob( + datajob = self._generate_datajob( flow_run_ctx=flow_run_ctx, task_run_ctx=task_run_ctx ) if inputs is not None: datajob.inlets.extend(self._entities_to_urn_list(inputs)) if outputs is not None: datajob.outlets.extend(self._entities_to_urn_list(outputs)) - self.datajob_to_emit[str(datajob.urn)] = datajob - - def emit_flow(self): + self.datajobs_to_emit[str(datajob.urn)] = datajob + + def emit_flow(self) -> None: + """ + Emit prefect flow metadata to datahub rest. Prefect flow get mapped with datahub + dataflow entity. Add upstream dependencies if present for each task. + Emit the prefect task run metadata as well. If user haven't called emit_task in + task function still emit_flow will emit task but without task name, description, + tags and properties. + Emit the prefect workspace metadata as well. 
+ + + Example: + Emit the flow metadata as show below: + ```python + from prefect import flow, task + + from prefect_datahub import DatahubEmitter + + datahub_emitter = DatahubEmitter.load("MY_BLOCK_NAME") + + @flow(name="ETL flow", description="Extract transform load flow") + def etl(): + data = extract() + data = transform(data) + load(data) + datahub_emitter.emit_flow() + ``` + """ flow_run_ctx = FlowRunContext.get() assert flow_run_ctx # Emit flow - dataflow = self.generate_dataflow(flow_run_ctx=flow_run_ctx) + dataflow = self._generate_dataflow(flow_run_ctx=flow_run_ctx) dataflow.emit(self.emitter) # Emit task, task run and add upstream task if present for each task @@ -289,20 +444,22 @@ def emit_flow(self): for prefect_future in flow_run_ctx.task_run_futures } for node in graph_json: - task_run = asyncio.run(orchestration.get_client().read_task_run(node["id"])) + task_run = asyncio.run( + orchestration.get_client().read_task_run(node[constants.ID]) + ) # Emit task datajob_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), job_id=task_run.task_key, ) - if str(datajob_urn) in self.datajob_to_emit: - datajob = self.datajob_to_emit[str(datajob_urn)] + if str(datajob_urn) in self.datajobs_to_emit: + datajob = self.datajobs_to_emit[str(datajob_urn)] else: - datajob = self.generate_datajob( + datajob = self._generate_datajob( flow_run_ctx=flow_run_ctx, task_key=task_run.task_key ) # Add upstrem urns - for each in node["upstream_dependencies"]: + for each in node[constants.UPSTREAM_DEPENDENCIES]: upstream_task_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), job_id=task_run_key_map[each["id"]], @@ -310,34 +467,9 @@ def emit_flow(self): datajob.upstream_urns.extend([upstream_task_urn]) datajob.emit(self.emitter) - self.run_datajob( + # self._run_dataflow(dataflow,flow_run_ctx) + self._run_datajob( datajob=datajob, flow_run_name=flow_run_ctx.flow_run.name, task_run=task_run, ) - - def emit_workspaces(self) -> None: - try: - 
asyncio.run(cloud.get_cloud_client().api_healthcheck()) - except Exception as e: - get_run_logger().info( - "Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'." - ) - return - workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) - for workspace in workspaces: - container_key = WorkspaceKey( - workspace_name=workspace.workspace_name, - platform="prefect", - instance=self.platform_instance, - env=self.env, - ) - container_work_units = gen_containers( - container_key=container_key, - name=workspace.workspace_name, - sub_types=["Workspace"], - description=workspace.workspace_description, - owner_urn=make_user_urn(workspace.account_name), - ) - for workunit in container_work_units: - self.emitter.emit(workunit.metadata) diff --git a/requirements-dev.txt b/requirements-dev.txt index 1c84ddc..8f50f4d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,5 +12,4 @@ mock; python_version < '3.8' mkdocs-gen-files interrogate coverage -pillow -acryl-datahub[datahub-rest] \ No newline at end of file +pillow \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 4ec3de6..db5c355 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ prefect>=2.0.0 +acryl-datahub[datahub-rest] \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 1119afd..23cf62b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -390,7 +390,6 @@ async def mock_task_run_future(): @pytest.fixture(scope="module") def mock_run_context(): - task_run_ctx = MagicMock() task_run_ctx.task.task_key = mock_transform_task_json["task_key"] task_run_ctx.task.name = mock_transform_task_json["name"] diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index d1acfd5..e5cf3e5 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -23,7 +23,10 @@ def test_emit_task(mock_emit, mock_run_context): task_run_ctx: TaskRunContext = mock_run_context[0] flow_run_ctx: 
FlowRunContext = mock_run_context[1] - expected_datajob_urn = f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),{task_run_ctx.task.task_key})" + expected_datajob_urn = ( + f"urn:li:dataJob:(urn:li:dataFlow:" + f"(prefect,{flow_run_ctx.flow.name},prod),{task_run_ctx.task.task_key})" + ) assert expected_datajob_urn in datahub_emitter.datajob_to_emit.keys() actual_datajob = datahub_emitter.datajob_to_emit[expected_datajob_urn] @@ -64,22 +67,26 @@ def test_emit_flow(mock_emit, mock_run_context, mock_prefect_client): assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInfo" assert ( mock_emitter.method_calls[4].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) assert mock_emitter.method_calls[5].args[0].aspectName == "dataJobInputOutput" assert ( mock_emitter.method_calls[5].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) assert mock_emitter.method_calls[6].args[0].aspectName == "ownership" assert ( mock_emitter.method_calls[6].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) assert mock_emitter.method_calls[7].args[0].aspectName == "globalTags" assert ( mock_emitter.method_calls[7].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) assert ( mock_emitter.method_calls[8].args[0].aspectName @@ -116,22 +123,26 @@ def test_emit_flow(mock_emit, 
mock_run_context, mock_prefect_client): assert mock_emitter.method_calls[12].args[0].aspectName == "dataJobInfo" assert ( mock_emitter.method_calls[12].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.load)" ) assert mock_emitter.method_calls[13].args[0].aspectName == "dataJobInputOutput" assert ( mock_emitter.method_calls[13].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.load)" ) assert mock_emitter.method_calls[14].args[0].aspectName == "ownership" assert ( mock_emitter.method_calls[14].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.load)" ) assert mock_emitter.method_calls[15].args[0].aspectName == "globalTags" assert ( mock_emitter.method_calls[15].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.load)" ) assert ( mock_emitter.method_calls[16].args[0].aspectName @@ -168,22 +179,26 @@ def test_emit_flow(mock_emit, mock_run_context, mock_prefect_client): assert mock_emitter.method_calls[20].args[0].aspectName == "dataJobInfo" assert ( mock_emitter.method_calls[20].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) assert mock_emitter.method_calls[21].args[0].aspectName == "dataJobInputOutput" assert ( mock_emitter.method_calls[21].args[0].entityUrn - == 
f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) assert mock_emitter.method_calls[22].args[0].aspectName == "ownership" assert ( mock_emitter.method_calls[22].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) assert mock_emitter.method_calls[23].args[0].aspectName == "globalTags" assert ( mock_emitter.method_calls[23].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) assert ( mock_emitter.method_calls[24].args[0].aspectName From c141aa04cac3bc4336071180f1d76fa9e2cc17fc Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 10:19:07 +0530 Subject: [PATCH 08/39] Code change as per PR review comment --- README.md | 111 +++++++----- docs/concept_mapping.md | 10 ++ example/etl_flow.py | 44 ----- example/etl_sub_flow.py | 52 ------ example/load.py | 7 - mkdocs.yml | 1 + prefect_datahub/datahub_emitter.py | 168 ++++++++++++------ tests/test_datahub_emitter.py | 264 ++++++++++++++++++----------- 8 files changed, 360 insertions(+), 297 deletions(-) create mode 100644 docs/concept_mapping.md delete mode 100644 example/etl_flow.py delete mode 100644 example/etl_sub_flow.py delete mode 100644 example/load.py diff --git a/README.md b/README.md index 4838c18..8d375c4 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# prefect-datahub +# Emit flows & tasks metadata to datahub rest with `prefect-datahub`

@@ -18,63 +18,47 @@

-Visit the full docs [here](https://shubhamjagtap639.github.io/prefect-datahub) to see additional examples and the API reference. +## Welcome! -Block used to emit prefect task and flow related metadata to Datahub REST +The `prefect-datahub` collection makes it easy to leverage the capabilities of datahub emitter in your flows, featuring support for ingesting metadata of flows, tasks & workspace to datahub gms rest. - +Successful deployment of Datahub locally will lead creation of datahub GMS service running on 'http://localhost:8080'. -## Resources +### Saving configurations to a block -For more tips on how to use tasks and flows in a Collection, check out [Using Collections](https://docs.prefect.io/collections/usage/)! -### Installation +This is a one-time activity, where you can save the configuration on the [Prefect block document store](https://docs.prefect.io/2.10.13/concepts/blocks/#saving-blocks). +While saving you can provide below configutions. Default value will get set if not provided while saving the configuration to block. -Install `prefect-datahub` with `pip`: - -```bash -pip install prefect-datahub -``` - -Requires an installation of Python 3.7+. - -We recommend using a Python virtual environment manager such as pipenv, conda or virtualenv. - -These tasks are designed to work with Prefect 2.0. For more information about how to use Prefect, please refer to the [Prefect documentation](https://docs.prefect.io/). 
- - +```python +from datahub_provider.entities import Dataset +from prefect import flow, task + +from prefect_datahub import DatahubEmitter + +datahub_emitter = DatahubEmitter.load("MY_BLOCK_NAME") + +@task(name="Transform", description="Transform the data") +def transform(data): + data = data.split(" ") + datahub_emitter.add_task( + inputs=[Dataset("snowflake", "mydb.schema.tableA")], + outputs=[Dataset("snowflake", "mydb.schema.tableC")], + ) + return data + +@flow(name="ETL flow", description="Extract transform load flow") +def etl(): + data = transform("This is data") + datahub_emitter.emit_flow() +``` + +## Resources + +For more tips on how to use tasks and flows in a Collection, check out [Using Collections](https://docs.prefect.io/collections/usage/)! + +### Installation + +Install `prefect-datahub` with `pip`: + +```bash +pip install prefect-datahub +``` + +Requires an installation of Python 3.7+. + +We recommend using a Python virtual environment manager such as pipenv, conda or virtualenv. + +These tasks are designed to work with Prefect 2.0. For more information about how to use Prefect, please refer to the [Prefect documentation](https://docs.prefect.io/). ### Feedback diff --git a/docs/concept_mapping.md b/docs/concept_mapping.md new file mode 100644 index 0000000..c5ffaed --- /dev/null +++ b/docs/concept_mapping.md @@ -0,0 +1,10 @@ +# Prefect and Datahub concept mapping + + +Prefect concepts are documented [here](https://docs.prefect.io/latest/concepts/), and datahub concepts are documented [here](https://datahubproject.io/docs/what-is-datahub/datahub-concepts). + +Prefect Concept | DataHub Concept | URN | Possible Values +--- | --- | --- | --- +[Flow](https://docs.prefect.io/2.10.13/concepts/flows/#flows) | [DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) | urn:li:dataFlow:(prefect, [platform-instance.]<flow-name>,prod) | <flow-name> is the user given a name like “etl”. 
if flow-name is not set by a user then prefect derive it from function-name annotated with @flow +[Flow Run](https://docs.prefect.io/latest/concepts/flows/#flow-runs) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) | urn:li:dataFlow:(prefect, [platform-instance.]<flow-name>,prod) | <flow-name> is the user given a name like “etl”. if flow-name is not set by a user then prefect derive it from function-name annotated with @flow +[Flow](https://docs.prefect.io/2.10.13/concepts/flows/#flows) | [DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) | urn:li:dataFlow:(prefect, [platform-instance.]<flow-name>,prod) | <flow-name> is the user given a name like “etl”. if flow-name is not set by a user then prefect derive it from function-name annotated with @flow diff --git a/example/etl_flow.py b/example/etl_flow.py deleted file mode 100644 index de421e9..0000000 --- a/example/etl_flow.py +++ /dev/null @@ -1,44 +0,0 @@ -from datahub_provider.entities import Dataset -from prefect import flow, task - -from prefect_datahub import DatahubEmitter - -datahub_emitter = DatahubEmitter.load("datahub-emitter-block") - - -@task(name="Extract", description="Extract the actual data") -def extract(): - data = "This is data" - datahub_emitter.emit_task() - return data - - -@task(description="Transform the actual data") -def transform(actual_data): - actual_data = actual_data.split(" ") - datahub_emitter.emit_task( - inputs=[Dataset("snowflake", "mydb.schema.tableA")], - outputs=[Dataset("snowflake", "mydb.schema.tableC")], - ) - return actual_data - - -@task(name="Load_task", description="Load the actual data") -def load(data): - datahub_emitter.emit_task() - print(data) - - -@flow(name="ETL flow", description="Extract transform load flow") -def etl(): - print("Flow started") - extract() - data = transform("This is data") - load(data) - datahub_emitter.emit_flow() - print("") - - -if __name__ == "__main__": 
- etl() - print("s") diff --git a/example/etl_sub_flow.py b/example/etl_sub_flow.py deleted file mode 100644 index e779f76..0000000 --- a/example/etl_sub_flow.py +++ /dev/null @@ -1,52 +0,0 @@ -from datahub_provider.entities import Dataset -from prefect import flow, task - -from prefect_datahub import DatahubEmitter - -datahub_emitter = DatahubEmitter.load("datahub-emitter-block") - - -@task(name="Extract", description="Extract the actual data") -def extract(): - data = "This is data" - datahub_emitter.emit_task() - return data - - -@task(description="Transform the actual data") -def transform(actual_data): - actual_data = actual_data.split(" ") - datahub_emitter.emit_task( - inputs=[Dataset("snowflake", "mydb.schema.tableA")], - outputs=[Dataset("snowflake", "mydb.schema.tableC")], - ) - return actual_data - - -@task(name="Load_task", description="Load the actual data") -def load(data): - datahub_emitter.emit_task() - print(data) - - -@flow(log_prints=True) -def tl(data): - print("Flow started") - data = transform(data) - load(data) - datahub_emitter.emit_flow() - print("") - - -@flow(log_prints=True) -def etl(): - print("Flow started") - data = extract() - tl(data) - datahub_emitter.emit_flow() - print("") - - -if __name__ == "__main__": - etl() - print("s") diff --git a/example/load.py b/example/load.py deleted file mode 100644 index e9d235e..0000000 --- a/example/load.py +++ /dev/null @@ -1,7 +0,0 @@ -from prefect_datahub import DatahubEmitter - -emitter = DatahubEmitter( - datahub_rest_url="http://localhost:8080", capture_tags_info=False -) - -emitter.save("datahub-emitter-block", overwrite=True) diff --git a/mkdocs.yml b/mkdocs.yml index 1187ed1..968d6c0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -76,5 +76,6 @@ nav: - Datahub Emitter: datahub_emitter.md - Blocks Catalog: blocks_catalog.md - Examples Catalog: examples_catalog.md + - Concept Mapping: concept_mapping.md diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 
df62ea6..a255d47 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -1,4 +1,4 @@ -"""Datahub Emitter classes used to emit prefect metadata to Datahub REST. """ +"""Datahub Emitter classes used to emit prefect metadata to Datahub REST.""" import asyncio from typing import Dict, List, Optional @@ -9,8 +9,10 @@ InstanceRunResult, ) from datahub.emitter.mce_builder import make_user_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import PlatformKey, gen_containers from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.metadata.schema_classes import BrowsePathsClass from datahub.utilities.urns.data_flow_urn import DataFlowUrn from datahub.utilities.urns.data_job_urn import DataJobUrn from datahub.utilities.urns.dataset_urn import DatasetUrn @@ -20,6 +22,7 @@ from prefect.client import cloud, orchestration from prefect.client.schemas import TaskRun from prefect.context import FlowRunContext, TaskRunContext +from prefect.settings import PREFECT_API_URL from pydantic import Field from prefect_datahub import constants @@ -34,15 +37,25 @@ class DatahubEmitter(Block): Block used to emit prefect task and flow related metadata to Datahub REST Attributes: - datahub_rest_url (str): The Datahub GMS Rest URL. - env (str): The environment that all assets produced by this orchestrator \ - belong to. For more detail and possible values refer \ + datahub_rest_url Optional(str) : Datahub GMS Rest URL. \ + Example: http://localhost:8080. + env Optional(str) : The environment that all assets produced by this \ + orchestrator belong to. For more detail and possible values refer \ https://datahubproject.io/docs/graphql/enums/#fabrictype. - platform_instance (str): The instance of the platform that all assets \ + platform_instance Optional(str) : The instance of the platform that all assets \ produced by this recipe belong to. 
For more detail please refer to \ https://datahubproject.io/docs/platform-instances/. Example: + Store value: + ```python + from prefect_datahub import DatahubEmitter + DatahubEmitter( + datahub_rest_url="http://localhost:8080", + env="PROD", + platform_instance="local_prefect" + ).save("BLOCK_NAME") + ``` Load a stored value: ```python from prefect_datahub import DatahubEmitter @@ -58,7 +71,7 @@ class DatahubEmitter(Block): datahub_rest_url: Optional[str] = Field( default="http://localhost:8080", title="Datahub rest url", - description="Datahub gms rest url.", + description="Datahub GMS Rest URL. Example: http://localhost:8080", ) env: Optional[str] = Field( @@ -84,9 +97,27 @@ def __init__(self, *args, **kwargs): self.emitter.test_connection() def _entities_to_urn_list(self, iolets: List[_Entity]) -> List[DatasetUrn]: + """ + Convert list of _entity to list of dataser urn + + Args: + iolets: The list of entities. + + Returns: + The list of Dataset URN. + """ return [DatasetUrn.create_from_string(let.urn) for let in iolets] - async def _get_flow_run_graph(self, flow_run_id): + async def _get_flow_run_graph(self, flow_run_id) -> List[Dict]: + """ + Fetch the flow run graph for provided flow run id + + Args: + flow_run_id: The flow run id. + + Returns: + The flow run graph in json format. + """ response = await orchestration.get_client()._client.get( f"/flow_runs/{flow_run_id}/graph" ) @@ -149,8 +180,7 @@ def _generate_datajob( return datajob elif task_key is not None: datajob = DataJob( - id=task_key, - flow_urn=dataflow_urn, + id=task_key, flow_urn=dataflow_urn, name=task_key.split(".")[-1] ) return datajob return None @@ -207,7 +237,7 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: return dataflow - def _run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: + def _emit_flow_run(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: """ Emit prefect flow run to datahub rest. 
Prefect flow run get mapped with datahub data process instance entity which get's generate from provided dataflow entity. @@ -252,7 +282,7 @@ def _run_dataflow(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> Non ), ) - def _run_datajob( + def _emit_task_run( self, datajob: DataJob, flow_run_name: str, task_run: TaskRun ) -> None: """ @@ -291,15 +321,16 @@ def _run_datajob( dpi_property_bag[key] = str(getattr(task_run, key)) dpi.properties.update(dpi_property_bag) - if task_run.state_name == constants.COMPLETE: - result = InstanceRunResult.SUCCESS - elif task_run.state_name == constants.FAILED: - result = InstanceRunResult.FAILURE - elif task_run.state_name == constants.CANCELLED: - result = InstanceRunResult.SKIPPED + state_result_map: Dict[str, str] = {} + state_result_map[constants.COMPLETE] = InstanceRunResult.SUCCESS + state_result_map[constants.FAILED] = InstanceRunResult.FAILURE + state_result_map[constants.CANCELLED] = InstanceRunResult.SKIPPED + + if task_run.state_name in state_result_map: + result = state_result_map[task_run.state_name] else: raise Exception( - f"Result should be either success or failure and it was " + f"State should be either complete, failed or cancelled and it was " f"{task_run.state_name}" ) @@ -316,11 +347,14 @@ def _run_datajob( result_type=constants.ORCHESTRATOR, ) - def _emit_workspaces(self) -> None: + def _emit_workspaces(self) -> Optional[str]: """ Emit prefect workspace metadata to datahub rest. - Prefect workspce get mapped with datahub container entity. - Workspace account name also get emit as owner of continer. + Prefect workspace get mapped with datahub container entity. + Workspace account name also get emit as owner of container. + + Returns: + The emitted workspace name. """ try: asyncio.run(cloud.get_cloud_client().api_healthcheck()) @@ -328,36 +362,46 @@ def _emit_workspaces(self) -> None: get_run_logger().info( "Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'." 
) - return + return None + if "workspaces" not in PREFECT_API_URL.value(): + get_run_logger().info( + "Cannot emit workspaces. Please login to prefect cloud using command " + "'prefect cloud login'." + ) + return None SUB_TYPE = "Workspace" + current_workspace_id = PREFECT_API_URL.value().split("/")[-1] workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) for workspace in workspaces: - container_key = WorkspaceKey( - workspace_name=workspace.workspace_name, - platform=constants.ORCHESTRATOR, - instance=self.platform_instance, - env=self.env, - ) - container_work_units = gen_containers( - container_key=container_key, - name=workspace.workspace_name, - sub_types=[SUB_TYPE], - description=workspace.workspace_description, - owner_urn=make_user_urn(workspace.account_name), - ) - for workunit in container_work_units: - self.emitter.emit(workunit.metadata) + if str(workspace.workspace_id) == current_workspace_id: + container_key = WorkspaceKey( + workspace_name=workspace.workspace_name, + platform=constants.ORCHESTRATOR, + instance=self.platform_instance, + env=self.env, + ) + container_work_units = gen_containers( + container_key=container_key, + name=workspace.workspace_name, + sub_types=[SUB_TYPE], + description=workspace.workspace_description, + owner_urn=make_user_urn(workspace.account_name), + ) + for workunit in container_work_units: + self.emitter.emit(workunit.metadata) + return workspace.workspace_name + return None - def emit_task( + def add_task( self, inputs: Optional[List[_Entity]] = None, outputs: Optional[List[_Entity]] = None, ) -> None: """ - Emit prefect task metadata to datahub rest. Prefect task get mapped with datahub - datajob entity. Assign provided inputs and outputs as datajob inlets and outlets - respectively. To emit task metadata it is compulsory to emit flow as well - otherwise task will not get emit. + Store prefect current running task metadata temporarily which later get emit + to datahub rest only if user calls emit_flow. 
Prefect task gets mapped with + datahub datajob entity. Assign provided inputs and outputs as datajob inlets + and outlets respectively. Args: inputs (list): The list of task inputs. @@ -376,7 +420,7 @@ def emit_task( @task(name="Transform", description="Transform the data") def transform(data): data = data.split(" ") - datahub_emitter.emit_task( + datahub_emitter.add_task( inputs=[Dataset("snowflake", "mydb.schema.tableA")], outputs=[Dataset("snowflake", "mydb.schema.tableC")], ) @@ -404,11 +448,11 @@ def etl(): def emit_flow(self) -> None: """ - Emit prefect flow metadata to datahub rest. Prefect flow get mapped with datahub - dataflow entity. Add upstream dependencies if present for each task. - Emit the prefect task run metadata as well. If user haven't called emit_task in - task function still emit_flow will emit task but without task name, description, - tags and properties. + Emit prefect current running flow metadata to datahub rest. Prefect flow gets + mapped with datahub dataflow entity. Add upstream dependencies if present for + each task. Emit the prefect task run metadata as well. If the user hasn't + called add_task in the task function still emit_flow will emit a task but + without task name, description,tags and properties. Emit the prefect workspace metadata as well. 
@@ -431,9 +475,20 @@ def etl(): """ flow_run_ctx = FlowRunContext.get() assert flow_run_ctx - # Emit flow + + # Emit workspace first + workspace_name = self._emit_workspaces() + + # Emit flow and flow run dataflow = self._generate_dataflow(flow_run_ctx=flow_run_ctx) dataflow.emit(self.emitter) + if workspace_name is not None: + mcp = MetadataChangeProposalWrapper( + entityUrn=str(dataflow.urn), + aspect=BrowsePathsClass(paths=[f"/{workspace_name}/{dataflow.name}"]), + ) + self.emitter.emit(mcp) + self._emit_flow_run(dataflow, flow_run_ctx) # Emit task, task run and add upstream task if present for each task graph_json = asyncio.run( @@ -462,14 +517,23 @@ def etl(): for each in node[constants.UPSTREAM_DEPENDENCIES]: upstream_task_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), - job_id=task_run_key_map[each["id"]], + job_id=task_run_key_map[each[constants.ID]], ) datajob.upstream_urns.extend([upstream_task_urn]) datajob.emit(self.emitter) + if workspace_name is not None: + mcp = MetadataChangeProposalWrapper( + entityUrn=str(datajob.urn), + aspect=BrowsePathsClass( + paths=[f"/{workspace_name}/{dataflow.name}/{datajob.name}"] + ), + ) + self.emitter.emit(mcp) - # self._run_dataflow(dataflow,flow_run_ctx) - self._run_datajob( + self._emit_task_run( datajob=datajob, flow_run_name=flow_run_ctx.flow_run.name, task_run=task_run, ) + + # Emit workspace diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index e5cf3e5..23c9fed 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -1,21 +1,52 @@ +import asyncio from unittest.mock import Mock, patch from datahub.api.entities.datajob import DataJob +from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub_provider.entities import Dataset from prefect.context import FlowRunContext, TaskRunContext -from prefect_datahub.datahub_emitter import DatahubEmitter +from prefect_datahub import constants +from prefect_datahub.datahub_emitter import 
DatahubEmitter, WorkspaceKey + + +def test_constants(): + assert constants.ORCHESTRATOR == "prefect" + + +def test_workspace_key(): + container_key = WorkspaceKey( + workspace_name="datahub", + platform="prefect", + env="PROD", + ) + assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" + + +def test_entities_to_urn_list(): + dataset_urn_list = DatahubEmitter()._entities_to_urn_list( + [Dataset("snowflake", "mydb.schema.tableA")] + ) + for dataset_urn in dataset_urn_list: + assert isinstance(dataset_urn, DatasetUrn) + + +def test_get_flow_run_graph(mock_prefect_client): + graph_json = asyncio.run( + DatahubEmitter()._get_flow_run_graph("c3b947e5-3fa1-4b46-a2e2-58d50c938f2e") + ) + assert isinstance(graph_json, list) @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) -def test_emit_task(mock_emit, mock_run_context): +def test_add_task(mock_emit, mock_run_context): mock_emitter = Mock() mock_emit.return_value = mock_emitter datahub_emitter = DatahubEmitter() inputs = [Dataset("snowflake", "mydb.schema.tableA")] outputs = [Dataset("snowflake", "mydb.schema.tableC")] - datahub_emitter.emit_task( + datahub_emitter.add_task( inputs=inputs, outputs=outputs, ) @@ -28,8 +59,8 @@ def test_emit_task(mock_emit, mock_run_context): f"(prefect,{flow_run_ctx.flow.name},prod),{task_run_ctx.task.task_key})" ) - assert expected_datajob_urn in datahub_emitter.datajob_to_emit.keys() - actual_datajob = datahub_emitter.datajob_to_emit[expected_datajob_urn] + assert expected_datajob_urn in datahub_emitter.datajobs_to_emit.keys() + actual_datajob = datahub_emitter.datajobs_to_emit[expected_datajob_urn] assert isinstance(actual_datajob, DataJob) assert str(actual_datajob.flow_urn) == "urn:li:dataFlow:(prefect,etl,prod)" assert actual_datajob.name == task_run_ctx.task.name @@ -47,7 +78,9 @@ def test_emit_task(mock_emit, mock_run_context): @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) -def test_emit_flow(mock_emit, 
mock_run_context, mock_prefect_client): +def test_emit_flow( + mock_emit, mock_run_context, mock_prefect_client, mock_prefect_cloud_client +): mock_emitter = Mock() mock_emit.return_value = mock_emitter @@ -58,216 +91,251 @@ def test_emit_flow(mock_emit, mock_run_context, mock_prefect_client): expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" - assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" - assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[2].args[0].aspectName == "ownership" - assert mock_emitter.method_calls[2].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[3].args[0].aspectName == "globalTags" - assert mock_emitter.method_calls[3].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[4].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[1].args[0].aspectName == "containerProperties" + assert ( + mock_emitter.method_calls[1].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert mock_emitter.method_calls[2].args[0].aspectName == "status" + assert ( + mock_emitter.method_calls[2].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert mock_emitter.method_calls[3].args[0].aspectName == "dataPlatformInstance" + assert ( + mock_emitter.method_calls[3].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert mock_emitter.method_calls[4].args[0].aspectName == "subTypes" assert ( mock_emitter.method_calls[4].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert mock_emitter.method_calls[5].args[0].aspectName == "ownership" + assert ( + mock_emitter.method_calls[5].args[0].entityUrn + == "urn:li:container:bf46b065c6816616f35e83d8be976c62" + ) + assert ( + mock_emitter.method_calls[5].args[0].aspect.owners[0].owner + == 
"urn:li:corpuser:shubhamjagtapgslabcom" + ) + + assert mock_emitter.method_calls[6].args[0].aspectName == "dataFlowInfo" + assert mock_emitter.method_calls[6].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[7].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[7].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[8].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[9].args[0].aspectName == "browsePaths" + assert mock_emitter.method_calls[9].args[0].entityUrn == expected_dataflow_urn + assert ( + mock_emitter.method_calls[13].args[0].aspectName + == "dataProcessInstanceProperties" + ) + assert ( + mock_emitter.method_calls[13].args[0].entityUrn + == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" + ) + assert ( + mock_emitter.method_calls[14].args[0].aspectName + == "dataProcessInstanceRelationships" + ) + assert ( + mock_emitter.method_calls[14].args[0].entityUrn + == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" + ) + assert ( + mock_emitter.method_calls[15].args[0].aspectName + == "dataProcessInstanceRunEvent" + ) + assert ( + mock_emitter.method_calls[15].args[0].entityUrn + == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" + ) + assert mock_emitter.method_calls[16].args[0].aspectName == "dataJobInfo" + assert ( + mock_emitter.method_calls[16].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) - assert mock_emitter.method_calls[5].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[17].args[0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[5].args[0].entityUrn + mock_emitter.method_calls[17].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) - assert 
mock_emitter.method_calls[6].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[18].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[6].args[0].entityUrn + mock_emitter.method_calls[18].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.extract)" + ) + assert mock_emitter.method_calls[19].args[0].aspectName == "globalTags" + assert ( + mock_emitter.method_calls[19].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) - assert mock_emitter.method_calls[7].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[20].args[0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[7].args[0].entityUrn + mock_emitter.method_calls[20].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) assert ( - mock_emitter.method_calls[8].args[0].aspectName + mock_emitter.method_calls[21].args[0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[8].args[0].entityUrn + mock_emitter.method_calls[21].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) assert ( - mock_emitter.method_calls[9].args[0].aspectName + mock_emitter.method_calls[22].args[0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[9].args[0].entityUrn + mock_emitter.method_calls[22].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) assert ( - mock_emitter.method_calls[10].args[0].aspectName + mock_emitter.method_calls[23].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[10].args[0].entityUrn + mock_emitter.method_calls[23].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) assert ( - mock_emitter.method_calls[11].args[0].aspectName + 
mock_emitter.method_calls[24].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[11].args[0].entityUrn + mock_emitter.method_calls[24].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) - assert mock_emitter.method_calls[12].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[25].args[0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[12].args[0].entityUrn + mock_emitter.method_calls[25].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) - assert mock_emitter.method_calls[13].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[26].args[0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[13].args[0].entityUrn + mock_emitter.method_calls[26].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) - assert mock_emitter.method_calls[14].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[27].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[14].args[0].entityUrn + mock_emitter.method_calls[27].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) - assert mock_emitter.method_calls[15].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[28].args[0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[15].args[0].entityUrn + mock_emitter.method_calls[28].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.load)" + ) + assert mock_emitter.method_calls[29].args[0].aspectName == "browsePaths" + assert ( + mock_emitter.method_calls[29].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) assert ( - 
mock_emitter.method_calls[16].args[0].aspectName + mock_emitter.method_calls[30].args[0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[16].args[0].entityUrn + mock_emitter.method_calls[30].args[0].entityUrn == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) assert ( - mock_emitter.method_calls[17].args[0].aspectName + mock_emitter.method_calls[31].args[0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[17].args[0].entityUrn + mock_emitter.method_calls[31].args[0].entityUrn == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) assert ( - mock_emitter.method_calls[18].args[0].aspectName + mock_emitter.method_calls[32].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[18].args[0].entityUrn + mock_emitter.method_calls[32].args[0].entityUrn == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) assert ( - mock_emitter.method_calls[19].args[0].aspectName + mock_emitter.method_calls[33].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[19].args[0].entityUrn + mock_emitter.method_calls[33].args[0].entityUrn == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) - assert mock_emitter.method_calls[20].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[34].args[0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[20].args[0].entityUrn + mock_emitter.method_calls[34].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) - assert mock_emitter.method_calls[21].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[35].args[0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[21].args[0].entityUrn + mock_emitter.method_calls[35].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," 
f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) - assert mock_emitter.method_calls[22].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[36].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[22].args[0].entityUrn + mock_emitter.method_calls[36].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) - assert mock_emitter.method_calls[23].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[37].args[0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[23].args[0].entityUrn + mock_emitter.method_calls[37].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) + assert mock_emitter.method_calls[38].args[0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[24].args[0].aspectName + mock_emitter.method_calls[38].args[0].entityUrn + == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," + f"{flow_run_ctx.flow.name},prod),__main__.transform)" + ) + assert ( + mock_emitter.method_calls[39].args[0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[24].args[0].entityUrn + mock_emitter.method_calls[39].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) assert ( - mock_emitter.method_calls[25].args[0].aspectName + mock_emitter.method_calls[40].args[0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[25].args[0].entityUrn + mock_emitter.method_calls[40].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) assert ( - mock_emitter.method_calls[26].args[0].aspectName + mock_emitter.method_calls[41].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[26].args[0].entityUrn + mock_emitter.method_calls[41].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) 
assert ( - mock_emitter.method_calls[27].args[0].aspectName + mock_emitter.method_calls[42].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[27].args[0].entityUrn + mock_emitter.method_calls[42].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) - - -@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) -def test_emit_workspace(mock_emit, mock_prefect_cloud_client): - mock_emitter = Mock() - mock_emit.return_value = mock_emitter - - datahub_emitter = DatahubEmitter() - datahub_emitter.emit_workspaces() - - assert mock_emitter.method_calls[1].args[0].aspectName == "containerProperties" - assert ( - mock_emitter.method_calls[1].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[2].args[0].aspectName == "status" - assert ( - mock_emitter.method_calls[2].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[3].args[0].aspectName == "dataPlatformInstance" - assert ( - mock_emitter.method_calls[3].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[4].args[0].aspectName == "subTypes" - assert ( - mock_emitter.method_calls[4].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[5].args[0].aspectName == "ownership" - assert ( - mock_emitter.method_calls[5].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert ( - mock_emitter.method_calls[5].args[0].aspect.owners[0].owner - == "urn:li:corpuser:shubhamjagtapgslabcom" - ) From 9404be16f6d27021061d06f612f439b9d2532e48 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 11:00:05 +0530 Subject: [PATCH 09/39] Test cases modified --- tests/conftest.py | 7 ++++++- tests/test_datahub_emitter.py | 7 ++++--- 2 files changed, 10 insertions(+), 4 
deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 23cf62b..f801049 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -468,6 +468,11 @@ def mock_prefect_cloud_client(): prefect_cloud_client_mock = MagicMock() prefect_cloud_client_mock.api_healthcheck.side_effect = mock_api_healthcheck prefect_cloud_client_mock.read_workspaces.side_effect = mock_read_workspaces - with patch("prefect_datahub.datahub_emitter.cloud") as mock_client: + with patch( + "prefect_datahub.datahub_emitter.cloud" + ) as mock_client, patch( + "prefect_datahub.datahub_emitter.PREFECT_API_URL.value", + return_value = "https://api.prefect.cloud/api/accounts/33e98cfe-ad06-4ceb-a500-c11148499f75/workspaces/157eb822-1b3b-4338-ae80-98edd5d00cb9" + ): mock_client.get_cloud_client.return_value = prefect_cloud_client_mock yield prefect_cloud_client_mock diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 23c9fed..914153c 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -23,15 +23,16 @@ def test_workspace_key(): assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" -def test_entities_to_urn_list(): +@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) +def test_entities_to_urn_list(mock_emit): dataset_urn_list = DatahubEmitter()._entities_to_urn_list( [Dataset("snowflake", "mydb.schema.tableA")] ) for dataset_urn in dataset_urn_list: assert isinstance(dataset_urn, DatasetUrn) - -def test_get_flow_run_graph(mock_prefect_client): +@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) +def test_get_flow_run_graph(mock_emit, mock_prefect_client): graph_json = asyncio.run( DatahubEmitter()._get_flow_run_graph("c3b947e5-3fa1-4b46-a2e2-58d50c938f2e") ) From 3cbae6de9ca444f8eafbaae7b797f0ec019090e0 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 11:33:39 +0530 Subject: [PATCH 10/39] code formatted --- .coveragerc | 4 ++++ tests/conftest.py | 7 
+++---- tests/test_datahub_emitter.py | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) create mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..ce04286 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,4 @@ +[run] +omit = + # omit this single file + prefect_datahub/constants.py diff --git a/tests/conftest.py b/tests/conftest.py index f801049..531b246 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -468,11 +468,10 @@ def mock_prefect_cloud_client(): prefect_cloud_client_mock = MagicMock() prefect_cloud_client_mock.api_healthcheck.side_effect = mock_api_healthcheck prefect_cloud_client_mock.read_workspaces.side_effect = mock_read_workspaces - with patch( - "prefect_datahub.datahub_emitter.cloud" - ) as mock_client, patch( + with patch("prefect_datahub.datahub_emitter.cloud") as mock_client, patch( "prefect_datahub.datahub_emitter.PREFECT_API_URL.value", - return_value = "https://api.prefect.cloud/api/accounts/33e98cfe-ad06-4ceb-a500-c11148499f75/workspaces/157eb822-1b3b-4338-ae80-98edd5d00cb9" + return_value="https://api.prefect.cloud/api/accounts/33e98cfe-ad06-4ceb-" + "a500-c11148499f75/workspaces/157eb822-1b3b-4338-ae80-98edd5d00cb9", ): mock_client.get_cloud_client.return_value = prefect_cloud_client_mock yield prefect_cloud_client_mock diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 914153c..9e6ea16 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -31,6 +31,7 @@ def test_entities_to_urn_list(mock_emit): for dataset_urn in dataset_urn_list: assert isinstance(dataset_urn, DatasetUrn) + @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) def test_get_flow_run_graph(mock_emit, mock_prefect_client): graph_json = asyncio.run( From d37ea71a05d753768ed63c1bb7139f293de3af45 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 11:37:34 +0530 Subject: [PATCH 11/39] test case modified --- .coveragerc | 4 ---- 
tests/test_datahub_emitter.py | 1 + 2 files changed, 1 insertion(+), 4 deletions(-) delete mode 100644 .coveragerc diff --git a/.coveragerc b/.coveragerc deleted file mode 100644 index ce04286..0000000 --- a/.coveragerc +++ /dev/null @@ -1,4 +0,0 @@ -[run] -omit = - # omit this single file - prefect_datahub/constants.py diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 9e6ea16..185d947 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -21,6 +21,7 @@ def test_workspace_key(): env="PROD", ) assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" + assert container_key.workspace_name == "datahub" @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) From 29ebd0e982e3848475f4ece637de4ed2537aab70 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 11:49:31 +0530 Subject: [PATCH 12/39] Code formatted --- prefect_datahub/constants.py | 47 --------- prefect_datahub/datahub_emitter.py | 162 ++++++++++++++++++----------- 2 files changed, 103 insertions(+), 106 deletions(-) delete mode 100644 prefect_datahub/constants.py diff --git a/prefect_datahub/constants.py b/prefect_datahub/constants.py deleted file mode 100644 index d73ecca..0000000 --- a/prefect_datahub/constants.py +++ /dev/null @@ -1,47 +0,0 @@ -ORCHESTRATOR = "prefect" - -# Flow and task common constants -VERSION = "version" -RETRIES = "retries" -TIMEOUT_SECONDS = "timeout_seconds" -LOG_PRINTS = "log_prints" -ON_COMPLETION = "on_completion" -ON_FAILURE = "on_failure" - -# Flow constants -FLOW_RUN_NAME = "flow_run_name" -TASK_RUNNER = "task_runner" -PERSIST_RESULT = "persist_result" -ON_CANCELLATION = "on_cancellation" -ON_CRASHED = "on_crashed" - -# Task constants -CACHE_EXPIRATION = "cache_expiration" -TASK_RUN_NAME = "task_run_name" -REFRESH_CACHE = "refresh_cache" -TASK_KEY = "task_key" - -# Flow run and task run common constants -ID = "id" -CREATED = "created" -UPDATED = "updated" -TAGS = "tags" 
-ESTIMATED_RUN_TIME = "estimated_run_time" -START_TIME = "start_time" -END_TIME = "end_time" -TOTAL_RUN_TIME = "total_run_time" -NEXT_SCHEDULED_START_TIME = "next_scheduled_start_time" - -# Fask run constants -CREATED_BY = "created_by" -AUTO_SCHEDULED = "auto_scheduled" - -# Task run constants -FLOW_RUN_ID = "flow_run_id" -RUN_COUNT = "run_count" -UPSTREAM_DEPENDENCIES = "upstream_dependencies" - -# States constants -COMPLETE = "Completed" -FAILED = "Failed" -CANCELLED = "Cancelled" diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index a255d47..5157706 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -25,7 +25,53 @@ from prefect.settings import PREFECT_API_URL from pydantic import Field -from prefect_datahub import constants +ORCHESTRATOR = "prefect" + +# Flow and task common constants +VERSION = "version" +RETRIES = "retries" +TIMEOUT_SECONDS = "timeout_seconds" +LOG_PRINTS = "log_prints" +ON_COMPLETION = "on_completion" +ON_FAILURE = "on_failure" + +# Flow constants +FLOW_RUN_NAME = "flow_run_name" +TASK_RUNNER = "task_runner" +PERSIST_RESULT = "persist_result" +ON_CANCELLATION = "on_cancellation" +ON_CRASHED = "on_crashed" + +# Task constants +CACHE_EXPIRATION = "cache_expiration" +TASK_RUN_NAME = "task_run_name" +REFRESH_CACHE = "refresh_cache" +TASK_KEY = "task_key" + +# Flow run and task run common constants +ID = "id" +CREATED = "created" +UPDATED = "updated" +TAGS = "tags" +ESTIMATED_RUN_TIME = "estimated_run_time" +START_TIME = "start_time" +END_TIME = "end_time" +TOTAL_RUN_TIME = "total_run_time" +NEXT_SCHEDULED_START_TIME = "next_scheduled_start_time" + +# Fask run constants +CREATED_BY = "created_by" +AUTO_SCHEDULED = "auto_scheduled" + +# Task run constants +FLOW_RUN_ID = "flow_run_id" +RUN_COUNT = "run_count" +UPSTREAM_DEPENDENCIES = "upstream_dependencies" + +# States constants +COMPLETE = "Completed" +FAILED = "Failed" +CANCELLED = "Cancelled" class 
WorkspaceKey(PlatformKey): @@ -142,7 +188,7 @@ def _generate_datajob( The datajob entity. """ dataflow_urn = DataFlowUrn.create_from_ids( - orchestrator=constants.ORCHESTRATOR, + orchestrator=ORCHESTRATOR, flow_id=flow_run_ctx.flow.name, env=self.env, platform_instance=self.platform_instance, @@ -159,16 +205,16 @@ def _generate_datajob( job_property_bag: Dict[str, str] = {} allowed_task_keys = [ - constants.VERSION, - constants.CACHE_EXPIRATION, - constants.TASK_RUN_NAME, - constants.RETRIES, - constants.TIMEOUT_SECONDS, - constants.LOG_PRINTS, - constants.REFRESH_CACHE, - constants.TASK_KEY, - constants.ON_COMPLETION, - constants.ON_FAILURE, + VERSION, + CACHE_EXPIRATION, + TASK_RUN_NAME, + RETRIES, + TIMEOUT_SECONDS, + LOG_PRINTS, + REFRESH_CACHE, + TASK_KEY, + ON_COMPLETION, + ON_FAILURE, ] for key in allowed_task_keys: if ( @@ -200,7 +246,7 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: orchestration.get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id) ) dataflow = DataFlow( - orchestrator=constants.ORCHESTRATOR, + orchestrator=ORCHESTRATOR, id=flow_run_ctx.flow.name, cluster=self.env, env=self.env, @@ -210,22 +256,22 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: dataflow.description = flow_run_ctx.flow.description dataflow.tags = flow.tags flow_property_bag: Dict[str, str] = {} - flow_property_bag[constants.ID] = str(flow.id) - flow_property_bag[constants.CREATED] = str(flow.created) - flow_property_bag[constants.UPDATED] = str(flow.updated) + flow_property_bag[ID] = str(flow.id) + flow_property_bag[CREATED] = str(flow.created) + flow_property_bag[UPDATED] = str(flow.updated) allowed_flow_keys = [ - constants.VERSION, - constants.FLOW_RUN_NAME, - constants.RETRIES, - constants.TASK_RUNNER, - constants.TIMEOUT_SECONDS, - constants.PERSIST_RESULT, - constants.LOG_PRINTS, - constants.ON_COMPLETION, - constants.ON_FAILURE, - constants.ON_CANCELLATION, - constants.ON_CRASHED, + VERSION, + 
FLOW_RUN_NAME, + RETRIES, + TASK_RUNNER, + TIMEOUT_SECONDS, + PERSIST_RESULT, + LOG_PRINTS, + ON_COMPLETION, + ON_FAILURE, + ON_CANCELLATION, + ON_CRASHED, ] for key in allowed_flow_keys: if ( @@ -258,17 +304,17 @@ def _emit_flow_run(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> No dpi_property_bag: Dict[str, str] = {} allowed_flow_run_keys = [ - constants.ID, - constants.CREATED, - constants.UPDATED, - constants.CREATED_BY, - constants.AUTO_SCHEDULED, - constants.ESTIMATED_RUN_TIME, - constants.START_TIME, - constants.TOTAL_RUN_TIME, - constants.NEXT_SCHEDULED_START_TIME, - constants.TAGS, - constants.RUN_COUNT, + ID, + CREATED, + UPDATED, + CREATED_BY, + AUTO_SCHEDULED, + ESTIMATED_RUN_TIME, + START_TIME, + TOTAL_RUN_TIME, + NEXT_SCHEDULED_START_TIME, + TAGS, + RUN_COUNT, ] for key in allowed_flow_run_keys: if hasattr(flow_run, key) and getattr(flow_run, key) is not None: @@ -304,17 +350,17 @@ def _emit_task_run( dpi_property_bag: Dict[str, str] = {} allowed_task_run_keys = [ - constants.ID, - constants.FLOW_RUN_ID, - constants.CREATED, - constants.UPDATED, - constants.ESTIMATED_RUN_TIME, - constants.START_TIME, - constants.END_TIME, - constants.TOTAL_RUN_TIME, - constants.NEXT_SCHEDULED_START_TIME, - constants.TAGS, - constants.RUN_COUNT, + ID, + FLOW_RUN_ID, + CREATED, + UPDATED, + ESTIMATED_RUN_TIME, + START_TIME, + END_TIME, + TOTAL_RUN_TIME, + NEXT_SCHEDULED_START_TIME, + TAGS, + RUN_COUNT, ] for key in allowed_task_run_keys: if hasattr(task_run, key) and getattr(task_run, key) is not None: @@ -322,9 +368,9 @@ def _emit_task_run( dpi.properties.update(dpi_property_bag) state_result_map: Dict[str, str] = {} - state_result_map[constants.COMPLETE] = InstanceRunResult.SUCCESS - state_result_map[constants.FAILED] = InstanceRunResult.FAILURE - state_result_map[constants.CANCELLED] = InstanceRunResult.SKIPPED + state_result_map[COMPLETE] = InstanceRunResult.SUCCESS + state_result_map[FAILED] = InstanceRunResult.FAILURE + state_result_map[CANCELLED] = 
InstanceRunResult.SKIPPED if task_run.state_name in state_result_map: result = state_result_map[task_run.state_name] @@ -344,7 +390,7 @@ def _emit_task_run( emitter=self.emitter, end_timestamp_millis=int(task_run.end_time.timestamp() * 1000), result=result, - result_type=constants.ORCHESTRATOR, + result_type=ORCHESTRATOR, ) def _emit_workspaces(self) -> Optional[str]: @@ -376,7 +422,7 @@ def _emit_workspaces(self) -> Optional[str]: if str(workspace.workspace_id) == current_workspace_id: container_key = WorkspaceKey( workspace_name=workspace.workspace_name, - platform=constants.ORCHESTRATOR, + platform=ORCHESTRATOR, instance=self.platform_instance, env=self.env, ) @@ -499,9 +545,7 @@ def etl(): for prefect_future in flow_run_ctx.task_run_futures } for node in graph_json: - task_run = asyncio.run( - orchestration.get_client().read_task_run(node[constants.ID]) - ) + task_run = asyncio.run(orchestration.get_client().read_task_run(node[ID])) # Emit task datajob_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), @@ -514,10 +558,10 @@ def etl(): flow_run_ctx=flow_run_ctx, task_key=task_run.task_key ) # Add upstrem urns - for each in node[constants.UPSTREAM_DEPENDENCIES]: + for each in node[UPSTREAM_DEPENDENCIES]: upstream_task_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), - job_id=task_run_key_map[each[constants.ID]], + job_id=task_run_key_map[each[ID]], ) datajob.upstream_urns.extend([upstream_task_urn]) datajob.emit(self.emitter) From 510324360516b61195e65c2856f788083ef748c1 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 12:16:16 +0530 Subject: [PATCH 13/39] Workspacekey test case removed --- prefect_datahub/datahub_emitter.py | 6 ++---- tests/test_datahub_emitter.py | 17 +---------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 5157706..85f8edb 100644 --- a/prefect_datahub/datahub_emitter.py +++ 
b/prefect_datahub/datahub_emitter.py @@ -74,10 +74,6 @@ CANCELLED = "Cancelled" -class WorkspaceKey(PlatformKey): - workspace_name: str - - class DatahubEmitter(Block): """ Block used to emit prefect task and flow related metadata to Datahub REST @@ -416,6 +412,8 @@ def _emit_workspaces(self) -> Optional[str]: ) return None SUB_TYPE = "Workspace" + class WorkspaceKey(PlatformKey): + workspace_name: str current_workspace_id = PREFECT_API_URL.value().split("/")[-1] workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) for workspace in workspaces: diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 185d947..8e3a0c5 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -6,22 +6,7 @@ from datahub_provider.entities import Dataset from prefect.context import FlowRunContext, TaskRunContext -from prefect_datahub import constants -from prefect_datahub.datahub_emitter import DatahubEmitter, WorkspaceKey - - -def test_constants(): - assert constants.ORCHESTRATOR == "prefect" - - -def test_workspace_key(): - container_key = WorkspaceKey( - workspace_name="datahub", - platform="prefect", - env="PROD", - ) - assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" - assert container_key.workspace_name == "datahub" +from prefect_datahub.datahub_emitter import DatahubEmitter @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) From 88d05518b888ffa3330ea9aec7ee52d3d4bd1a3b Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 12:24:15 +0530 Subject: [PATCH 14/39] Testcase for workspace key added --- prefect_datahub/datahub_emitter.py | 10 +++++++--- tests/test_datahub_emitter.py | 11 +++++++++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 85f8edb..9f4fd2c 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -132,7 +132,13 @@ class 
DatahubEmitter(Block): "https://datahubproject.io/docs/platform-instances/.", ) + class WorkspaceKey(PlatformKey): + workspace_name: str + def __init__(self, *args, **kwargs): + """ + Initialize datahub rest emitter + """ super().__init__(*args, **kwargs) self.datajobs_to_emit = {} self.emitter = DatahubRestEmitter(gms_server=self.datahub_rest_url) @@ -412,13 +418,11 @@ def _emit_workspaces(self) -> Optional[str]: ) return None SUB_TYPE = "Workspace" - class WorkspaceKey(PlatformKey): - workspace_name: str current_workspace_id = PREFECT_API_URL.value().split("/")[-1] workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) for workspace in workspaces: if str(workspace.workspace_id) == current_workspace_id: - container_key = WorkspaceKey( + container_key = self.WorkspaceKey( workspace_name=workspace.workspace_name, platform=ORCHESTRATOR, instance=self.platform_instance, diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 8e3a0c5..45f4393 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -9,6 +9,17 @@ from prefect_datahub.datahub_emitter import DatahubEmitter +@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) +def test_workspace_key(): + container_key = DatahubEmitter().WorkspaceKey( + workspace_name="datahub", + platform="prefect", + env="PROD", + ) + assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" + assert container_key.workspace_name == "datahub" + + @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) def test_entities_to_urn_list(mock_emit): dataset_urn_list = DatahubEmitter()._entities_to_urn_list( From 6cda94b362800e46bfcea4f96c1ac54ef2778600 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 12:28:39 +0530 Subject: [PATCH 15/39] Code formatted --- tests/test_datahub_emitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py 
index 45f4393..7472aaa 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -10,7 +10,7 @@ @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) -def test_workspace_key(): +def test_workspace_key(mock_emit): container_key = DatahubEmitter().WorkspaceKey( workspace_name="datahub", platform="prefect", From 3bfc4c1d2b7bf6d22e50a63586b644d43cf4852b Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 12:38:20 +0530 Subject: [PATCH 16/39] Test case added --- prefect_datahub/datahub_emitter.py | 9 +++++---- tests/test_datahub_emitter.py | 11 ----------- tests/test_workspace_key.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 15 deletions(-) create mode 100644 tests/test_workspace_key.py diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 9f4fd2c..cb4f268 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -74,6 +74,10 @@ CANCELLED = "Cancelled" +class WorkspaceKey(PlatformKey): + workspace_name: str + + class DatahubEmitter(Block): """ Block used to emit prefect task and flow related metadata to Datahub REST @@ -132,9 +136,6 @@ class DatahubEmitter(Block): "https://datahubproject.io/docs/platform-instances/.", ) - class WorkspaceKey(PlatformKey): - workspace_name: str - def __init__(self, *args, **kwargs): """ Initialize datahub rest emitter @@ -422,7 +423,7 @@ def _emit_workspaces(self) -> Optional[str]: workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) for workspace in workspaces: if str(workspace.workspace_id) == current_workspace_id: - container_key = self.WorkspaceKey( + container_key = WorkspaceKey( workspace_name=workspace.workspace_name, platform=ORCHESTRATOR, instance=self.platform_instance, diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 7472aaa..8e3a0c5 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -9,17 +9,6 @@ from 
prefect_datahub.datahub_emitter import DatahubEmitter -@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) -def test_workspace_key(mock_emit): - container_key = DatahubEmitter().WorkspaceKey( - workspace_name="datahub", - platform="prefect", - env="PROD", - ) - assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" - assert container_key.workspace_name == "datahub" - - @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) def test_entities_to_urn_list(mock_emit): dataset_urn_list = DatahubEmitter()._entities_to_urn_list( diff --git a/tests/test_workspace_key.py b/tests/test_workspace_key.py new file mode 100644 index 0000000..35b7723 --- /dev/null +++ b/tests/test_workspace_key.py @@ -0,0 +1,10 @@ +from prefect_datahub.datahub_emitter import WorkspaceKey + +def test_workspace_key(): + container_key = WorkspaceKey( + workspace_name="datahub", + platform="prefect", + env="PROD", + ) + assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" + assert container_key.workspace_name == "datahub" \ No newline at end of file From b61076f5ab3406f9ba7f8ee58d4c87a3c63731ae Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 16:02:37 +0530 Subject: [PATCH 17/39] Emit workspace code removed --- prefect_datahub/datahub_emitter.py | 83 ++++---------- tests/test_datahub_emitter.py | 174 +++++++++++++---------------- tests/test_workspace_key.py | 10 -- 3 files changed, 98 insertions(+), 169 deletions(-) delete mode 100644 tests/test_workspace_key.py diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index cb4f268..2c557c3 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -8,16 +8,13 @@ DataProcessInstance, InstanceRunResult, ) -from datahub.emitter.mce_builder import make_user_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.mcp_builder import PlatformKey, gen_containers from 
datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.metadata.schema_classes import BrowsePathsClass from datahub.utilities.urns.data_flow_urn import DataFlowUrn from datahub.utilities.urns.data_job_urn import DataJobUrn from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub_provider.entities import _Entity -from prefect import get_run_logger from prefect.blocks.core import Block from prefect.client import cloud, orchestration from prefect.client.schemas import TaskRun @@ -74,10 +71,6 @@ CANCELLED = "Cancelled" -class WorkspaceKey(PlatformKey): - workspace_name: str - - class DatahubEmitter(Block): """ Block used to emit prefect task and flow related metadata to Datahub REST @@ -157,6 +150,26 @@ def _entities_to_urn_list(self, iolets: List[_Entity]) -> List[DatasetUrn]: """ return [DatasetUrn.create_from_string(let.urn) for let in iolets] + def _get_workspace(self) -> Optional[str]: + """ + Fetch workspace name if present in configured prefect api url. + + Returns: + The workspace name. + """ + try: + asyncio.run(cloud.get_cloud_client().api_healthcheck()) + except Exception: + return None + if "workspaces" not in PREFECT_API_URL.value(): + return None + current_workspace_id = PREFECT_API_URL.value().split("/")[-1] + workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) + for workspace in workspaces: + if str(workspace.workspace_id) == current_workspace_id: + return workspace.workspace_name + return None + async def _get_flow_run_graph(self, flow_run_id) -> List[Dict]: """ Fetch the flow run graph for provided flow run id @@ -396,51 +409,6 @@ def _emit_task_run( result_type=ORCHESTRATOR, ) - def _emit_workspaces(self) -> Optional[str]: - """ - Emit prefect workspace metadata to datahub rest. - Prefect workspace get mapped with datahub container entity. - Workspace account name also get emit as owner of container. - - Returns: - The emitted workspace name. 
- """ - try: - asyncio.run(cloud.get_cloud_client().api_healthcheck()) - except Exception: - get_run_logger().info( - "Cannot emit workspaces. Please set correct 'PREFECT_API_KEY'." - ) - return None - if "workspaces" not in PREFECT_API_URL.value(): - get_run_logger().info( - "Cannot emit workspaces. Please login to prefect cloud using command " - "'prefect cloud login'." - ) - return None - SUB_TYPE = "Workspace" - current_workspace_id = PREFECT_API_URL.value().split("/")[-1] - workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) - for workspace in workspaces: - if str(workspace.workspace_id) == current_workspace_id: - container_key = WorkspaceKey( - workspace_name=workspace.workspace_name, - platform=ORCHESTRATOR, - instance=self.platform_instance, - env=self.env, - ) - container_work_units = gen_containers( - container_key=container_key, - name=workspace.workspace_name, - sub_types=[SUB_TYPE], - description=workspace.workspace_description, - owner_urn=make_user_urn(workspace.account_name), - ) - for workunit in container_work_units: - self.emitter.emit(workunit.metadata) - return workspace.workspace_name - return None - def add_task( self, inputs: Optional[List[_Entity]] = None, @@ -525,8 +493,7 @@ def etl(): flow_run_ctx = FlowRunContext.get() assert flow_run_ctx - # Emit workspace first - workspace_name = self._emit_workspaces() + workspace_name = self._get_workspace() # Emit flow and flow run dataflow = self._generate_dataflow(flow_run_ctx=flow_run_ctx) @@ -534,7 +501,7 @@ def etl(): if workspace_name is not None: mcp = MetadataChangeProposalWrapper( entityUrn=str(dataflow.urn), - aspect=BrowsePathsClass(paths=[f"/{workspace_name}/{dataflow.name}"]), + aspect=BrowsePathsClass(paths=[f"/prefect/prod/{workspace_name}"]), ) self.emitter.emit(mcp) self._emit_flow_run(dataflow, flow_run_ctx) @@ -571,9 +538,7 @@ def etl(): if workspace_name is not None: mcp = MetadataChangeProposalWrapper( entityUrn=str(datajob.urn), - aspect=BrowsePathsClass( - 
paths=[f"/{workspace_name}/{dataflow.name}/{datajob.name}"] - ), + aspect=BrowsePathsClass(paths=[f"/prefect/prod/{workspace_name}"]), ) self.emitter.emit(mcp) @@ -582,5 +547,3 @@ def etl(): flow_run_name=flow_run_ctx.flow_run.name, task_run=task_run, ) - - # Emit workspace diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 8e3a0c5..d76731e 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -26,6 +26,12 @@ def test_get_flow_run_graph(mock_emit, mock_prefect_client): assert isinstance(graph_json, list) +@patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) +def test__get_workspace(mock_emit, mock_prefect_cloud_client): + workspace_name = DatahubEmitter()._get_workspace() + assert workspace_name == "datahub" + + @patch("prefect_datahub.datahub_emitter.DatahubRestEmitter", autospec=True) def test_add_task(mock_emit, mock_run_context): mock_emitter = Mock() @@ -79,251 +85,221 @@ def test_emit_flow( expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" - assert mock_emitter.method_calls[1].args[0].aspectName == "containerProperties" - assert ( - mock_emitter.method_calls[1].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[2].args[0].aspectName == "status" - assert ( - mock_emitter.method_calls[2].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[3].args[0].aspectName == "dataPlatformInstance" - assert ( - mock_emitter.method_calls[3].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[4].args[0].aspectName == "subTypes" - assert ( - mock_emitter.method_calls[4].args[0].entityUrn - == "urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert mock_emitter.method_calls[5].args[0].aspectName == "ownership" - assert ( - mock_emitter.method_calls[5].args[0].entityUrn - == 
"urn:li:container:bf46b065c6816616f35e83d8be976c62" - ) - assert ( - mock_emitter.method_calls[5].args[0].aspect.owners[0].owner - == "urn:li:corpuser:shubhamjagtapgslabcom" - ) - - assert mock_emitter.method_calls[6].args[0].aspectName == "dataFlowInfo" - assert mock_emitter.method_calls[6].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[7].args[0].aspectName == "ownership" - assert mock_emitter.method_calls[7].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[8].args[0].aspectName == "globalTags" - assert mock_emitter.method_calls[8].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[9].args[0].aspectName == "browsePaths" - assert mock_emitter.method_calls[9].args[0].entityUrn == expected_dataflow_urn - assert ( - mock_emitter.method_calls[13].args[0].aspectName + assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" + assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[2].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[2].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[3].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[3].args[0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[4].args[0].aspectName == "browsePaths" + assert mock_emitter.method_calls[4].args[0].entityUrn == expected_dataflow_urn + assert ( + mock_emitter.method_calls[8].args[0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[13].args[0].entityUrn + mock_emitter.method_calls[8].args[0].entityUrn == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" ) assert ( - mock_emitter.method_calls[14].args[0].aspectName + mock_emitter.method_calls[9].args[0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[14].args[0].entityUrn + 
mock_emitter.method_calls[9].args[0].entityUrn == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" ) assert ( - mock_emitter.method_calls[15].args[0].aspectName + mock_emitter.method_calls[10].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[15].args[0].entityUrn + mock_emitter.method_calls[10].args[0].entityUrn == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" ) - assert mock_emitter.method_calls[16].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[11].args[0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[16].args[0].entityUrn + mock_emitter.method_calls[11].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) - assert mock_emitter.method_calls[17].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[12].args[0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[17].args[0].entityUrn + mock_emitter.method_calls[12].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) - assert mock_emitter.method_calls[18].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[13].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[18].args[0].entityUrn + mock_emitter.method_calls[13].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) - assert mock_emitter.method_calls[19].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[14].args[0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[19].args[0].entityUrn + mock_emitter.method_calls[14].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) - assert mock_emitter.method_calls[20].args[0].aspectName == "browsePaths" + assert 
mock_emitter.method_calls[15].args[0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[20].args[0].entityUrn + mock_emitter.method_calls[15].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.extract)" ) assert ( - mock_emitter.method_calls[21].args[0].aspectName + mock_emitter.method_calls[16].args[0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[21].args[0].entityUrn + mock_emitter.method_calls[16].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) assert ( - mock_emitter.method_calls[22].args[0].aspectName + mock_emitter.method_calls[17].args[0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[22].args[0].entityUrn + mock_emitter.method_calls[17].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) assert ( - mock_emitter.method_calls[23].args[0].aspectName + mock_emitter.method_calls[18].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[23].args[0].entityUrn + mock_emitter.method_calls[18].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) assert ( - mock_emitter.method_calls[24].args[0].aspectName + mock_emitter.method_calls[19].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[24].args[0].entityUrn + mock_emitter.method_calls[19].args[0].entityUrn == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" ) - assert mock_emitter.method_calls[25].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[20].args[0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[25].args[0].entityUrn + mock_emitter.method_calls[20].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) - assert mock_emitter.method_calls[26].args[0].aspectName == 
"dataJobInputOutput" + assert mock_emitter.method_calls[21].args[0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[26].args[0].entityUrn + mock_emitter.method_calls[21].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) - assert mock_emitter.method_calls[27].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[22].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[27].args[0].entityUrn + mock_emitter.method_calls[22].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) - assert mock_emitter.method_calls[28].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[23].args[0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[28].args[0].entityUrn + mock_emitter.method_calls[23].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) - assert mock_emitter.method_calls[29].args[0].aspectName == "browsePaths" + assert mock_emitter.method_calls[24].args[0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[29].args[0].entityUrn + mock_emitter.method_calls[24].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.load)" ) assert ( - mock_emitter.method_calls[30].args[0].aspectName + mock_emitter.method_calls[25].args[0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[30].args[0].entityUrn + mock_emitter.method_calls[25].args[0].entityUrn == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) assert ( - mock_emitter.method_calls[31].args[0].aspectName + mock_emitter.method_calls[26].args[0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[31].args[0].entityUrn + mock_emitter.method_calls[26].args[0].entityUrn == 
"urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) assert ( - mock_emitter.method_calls[32].args[0].aspectName + mock_emitter.method_calls[27].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[32].args[0].entityUrn + mock_emitter.method_calls[27].args[0].entityUrn == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) assert ( - mock_emitter.method_calls[33].args[0].aspectName + mock_emitter.method_calls[28].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[33].args[0].entityUrn + mock_emitter.method_calls[28].args[0].entityUrn == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" ) - assert mock_emitter.method_calls[34].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[29].args[0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[34].args[0].entityUrn + mock_emitter.method_calls[29].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) - assert mock_emitter.method_calls[35].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[30].args[0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[35].args[0].entityUrn + mock_emitter.method_calls[30].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) - assert mock_emitter.method_calls[36].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[31].args[0].aspectName == "ownership" assert ( - mock_emitter.method_calls[36].args[0].entityUrn + mock_emitter.method_calls[31].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) - assert mock_emitter.method_calls[37].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[32].args[0].aspectName == "globalTags" assert ( - 
mock_emitter.method_calls[37].args[0].entityUrn + mock_emitter.method_calls[32].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) - assert mock_emitter.method_calls[38].args[0].aspectName == "browsePaths" + assert mock_emitter.method_calls[33].args[0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[38].args[0].entityUrn + mock_emitter.method_calls[33].args[0].entityUrn == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," f"{flow_run_ctx.flow.name},prod),__main__.transform)" ) assert ( - mock_emitter.method_calls[39].args[0].aspectName + mock_emitter.method_calls[34].args[0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[39].args[0].entityUrn + mock_emitter.method_calls[34].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) assert ( - mock_emitter.method_calls[40].args[0].aspectName + mock_emitter.method_calls[35].args[0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[40].args[0].entityUrn + mock_emitter.method_calls[35].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) assert ( - mock_emitter.method_calls[41].args[0].aspectName + mock_emitter.method_calls[36].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[41].args[0].entityUrn + mock_emitter.method_calls[36].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) assert ( - mock_emitter.method_calls[42].args[0].aspectName + mock_emitter.method_calls[37].args[0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[42].args[0].entityUrn + mock_emitter.method_calls[37].args[0].entityUrn == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" ) diff --git a/tests/test_workspace_key.py b/tests/test_workspace_key.py deleted file mode 100644 index 35b7723..0000000 --- 
a/tests/test_workspace_key.py +++ /dev/null @@ -1,10 +0,0 @@ -from prefect_datahub.datahub_emitter import WorkspaceKey - -def test_workspace_key(): - container_key = WorkspaceKey( - workspace_name="datahub", - platform="prefect", - env="PROD", - ) - assert container_key.guid() == "bf46b065c6816616f35e83d8be976c62" - assert container_key.workspace_name == "datahub" \ No newline at end of file From cfed8587cae22abc845261c6ace1c46e1488afce Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:01:38 +0530 Subject: [PATCH 18/39] dataprocess instance id modified --- prefect_datahub/datahub_emitter.py | 38 ++++++++------ tests/test_datahub_emitter.py | 81 +++++++++++++----------------- 2 files changed, 56 insertions(+), 63 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 2c557c3..9dbc963 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -1,6 +1,7 @@ """Datahub Emitter classes used to emit prefect metadata to Datahub REST.""" import asyncio +from uuid import UUID from typing import Dict, List, Optional from datahub.api.entities.datajob import DataFlow, DataJob @@ -299,7 +300,7 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: return dataflow - def _emit_flow_run(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> None: + def _emit_flow_run(self, dataflow: DataFlow, flow_run_id: UUID) -> None: """ Emit prefect flow run to datahub rest. Prefect flow run get mapped with datahub data process instance entity which get's generate from provided dataflow entity. @@ -307,15 +308,19 @@ def _emit_flow_run(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> No Args: dataflow: The datahub dataflow entity used to create data process instance. - flow_run_ctx: The prefect current running flow run context. + flow_run_id: The prefect current running flow run id. 
""" flow_run = asyncio.run( orchestration.get_client().read_flow_run( - flow_run_id=flow_run_ctx.flow_run.id + flow_run_id=flow_run_id ) ) + if self.platform_instance is not None: + dpi_id = f"{self.platform_instance}.{flow_run.name}" + else: + dpi_id = flow_run.name dpi = DataProcessInstance.from_dataflow( - dataflow=dataflow, id=flow_run_ctx.flow_run.name + dataflow=dataflow, id=dpi_id ) dpi_property_bag: Dict[str, str] = {} @@ -340,12 +345,12 @@ def _emit_flow_run(self, dataflow: DataFlow, flow_run_ctx: FlowRunContext) -> No dpi.emit_process_start( emitter=self.emitter, start_timestamp_millis=int( - flow_run_ctx.flow_run.start_time.timestamp() * 1000 + flow_run.start_time.timestamp() * 1000 ), ) def _emit_task_run( - self, datajob: DataJob, flow_run_name: str, task_run: TaskRun + self, datajob: DataJob, flow_run_name: str, task_run_id: str ) -> None: """ Emit prefect task run to datahub rest. Prefect task run get mapped with datahub @@ -355,11 +360,16 @@ def _emit_task_run( Args: datajob: The datahub datajob entity used to create data process instance. flow_run_name: The prefect current running flow run name. - task_run: The prefect task run entity. + task_run_id: The prefect task run id. 
""" + task_run = asyncio.run(orchestration.get_client().read_task_run(task_run_id)) + if self.platform_instance is not None: + dpi_id = f"{self.platform_instance}.{flow_run_name}.{task_run.name}" + else: + dpi_id = f"{flow_run_name}.{task_run.name}" dpi = DataProcessInstance.from_datajob( datajob=datajob, - id=f"{flow_run_name}.{task_run.name}", + id=dpi_id, clone_inlets=True, clone_outlets=True, ) @@ -504,7 +514,7 @@ def etl(): aspect=BrowsePathsClass(paths=[f"/prefect/prod/{workspace_name}"]), ) self.emitter.emit(mcp) - self._emit_flow_run(dataflow, flow_run_ctx) + self._emit_flow_run(dataflow, flow_run_ctx.flow_run.id) # Emit task, task run and add upstream task if present for each task graph_json = asyncio.run( @@ -515,19 +525,16 @@ def etl(): for prefect_future in flow_run_ctx.task_run_futures } for node in graph_json: - task_run = asyncio.run(orchestration.get_client().read_task_run(node[ID])) - # Emit task datajob_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), - job_id=task_run.task_key, + job_id=task_run_key_map[node[ID]], ) if str(datajob_urn) in self.datajobs_to_emit: datajob = self.datajobs_to_emit[str(datajob_urn)] else: datajob = self._generate_datajob( - flow_run_ctx=flow_run_ctx, task_key=task_run.task_key + flow_run_ctx=flow_run_ctx, task_key=task_run_key_map[node[ID]] ) - # Add upstrem urns for each in node[UPSTREAM_DEPENDENCIES]: upstream_task_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), @@ -541,9 +548,8 @@ def etl(): aspect=BrowsePathsClass(paths=[f"/prefect/prod/{workspace_name}"]), ) self.emitter.emit(mcp) - self._emit_task_run( datajob=datajob, flow_run_name=flow_run_ctx.flow_run.name, - task_run=task_run, + task_run_id=node[ID] ) diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index d76731e..4c6a1df 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -78,12 +78,14 @@ def test_emit_flow( mock_emitter = Mock() mock_emit.return_value = mock_emitter 
- datahub_emitter = DatahubEmitter() + platform_instance = 'datahub_workspace' + + datahub_emitter = DatahubEmitter(platform_instance=platform_instance) datahub_emitter.emit_flow() flow_run_ctx: FlowRunContext = mock_run_context[1] - expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" + expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{platform_instance}.{flow_run_ctx.flow.name},prod)" assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn @@ -99,7 +101,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[8].args[0].entityUrn - == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" + == "urn:li:dataProcessInstance:a95d24db6abd98384fc1d4c8540098a4" ) assert ( mock_emitter.method_calls[9].args[0].aspectName @@ -107,7 +109,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[9].args[0].entityUrn - == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" + == "urn:li:dataProcessInstance:a95d24db6abd98384fc1d4c8540098a4" ) assert ( mock_emitter.method_calls[10].args[0].aspectName @@ -115,37 +117,32 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[10].args[0].entityUrn - == "urn:li:dataProcessInstance:1c61330602200cac15fe044b3698c176" + == "urn:li:dataProcessInstance:a95d24db6abd98384fc1d4c8540098a4" ) assert mock_emitter.method_calls[11].args[0].aspectName == "dataJobInfo" assert ( mock_emitter.method_calls[11].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) assert mock_emitter.method_calls[12].args[0].aspectName == "dataJobInputOutput" assert ( mock_emitter.method_calls[12].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.extract)" + == 
f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) assert mock_emitter.method_calls[13].args[0].aspectName == "ownership" assert ( mock_emitter.method_calls[13].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) assert mock_emitter.method_calls[14].args[0].aspectName == "globalTags" assert ( mock_emitter.method_calls[14].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) assert mock_emitter.method_calls[15].args[0].aspectName == "browsePaths" assert ( mock_emitter.method_calls[15].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.extract)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) assert ( mock_emitter.method_calls[16].args[0].aspectName @@ -153,7 +150,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[16].args[0].entityUrn - == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( mock_emitter.method_calls[17].args[0].aspectName @@ -161,7 +158,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[17].args[0].entityUrn - == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( mock_emitter.method_calls[18].args[0].aspectName @@ -169,7 +166,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[18].args[0].entityUrn - == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( mock_emitter.method_calls[19].args[0].aspectName @@ -177,37 +174,32 @@ def test_emit_flow( ) assert ( 
mock_emitter.method_calls[19].args[0].entityUrn - == "urn:li:dataProcessInstance:77a8ea575ff6976d37cd1a60caf98a95" + == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert mock_emitter.method_calls[20].args[0].aspectName == "dataJobInfo" assert ( mock_emitter.method_calls[20].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) assert mock_emitter.method_calls[21].args[0].aspectName == "dataJobInputOutput" assert ( mock_emitter.method_calls[21].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) assert mock_emitter.method_calls[22].args[0].aspectName == "ownership" assert ( mock_emitter.method_calls[22].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) assert mock_emitter.method_calls[23].args[0].aspectName == "globalTags" assert ( mock_emitter.method_calls[23].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) assert mock_emitter.method_calls[24].args[0].aspectName == "browsePaths" assert ( mock_emitter.method_calls[24].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.load)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) assert ( mock_emitter.method_calls[25].args[0].aspectName @@ -215,7 +207,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[25].args[0].entityUrn - == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( 
mock_emitter.method_calls[26].args[0].aspectName @@ -223,7 +215,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[26].args[0].entityUrn - == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( mock_emitter.method_calls[27].args[0].aspectName @@ -231,7 +223,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[27].args[0].entityUrn - == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( mock_emitter.method_calls[28].args[0].aspectName @@ -239,37 +231,32 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[28].args[0].entityUrn - == "urn:li:dataProcessInstance:6efec88dd6d26cb85e8592baf38e42b9" + == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert mock_emitter.method_calls[29].args[0].aspectName == "dataJobInfo" assert ( mock_emitter.method_calls[29].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) assert mock_emitter.method_calls[30].args[0].aspectName == "dataJobInputOutput" assert ( mock_emitter.method_calls[30].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) assert mock_emitter.method_calls[31].args[0].aspectName == "ownership" assert ( mock_emitter.method_calls[31].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) assert mock_emitter.method_calls[32].args[0].aspectName == "globalTags" assert ( mock_emitter.method_calls[32].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - 
f"{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) assert mock_emitter.method_calls[33].args[0].aspectName == "browsePaths" assert ( mock_emitter.method_calls[33].args[0].entityUrn - == f"urn:li:dataJob:(urn:li:dataFlow:(prefect," - f"{flow_run_ctx.flow.name},prod),__main__.transform)" + == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) assert ( mock_emitter.method_calls[34].args[0].aspectName @@ -277,7 +264,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[34].args[0].entityUrn - == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( mock_emitter.method_calls[35].args[0].aspectName @@ -285,7 +272,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[35].args[0].entityUrn - == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( mock_emitter.method_calls[36].args[0].aspectName @@ -293,7 +280,7 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[36].args[0].entityUrn - == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( mock_emitter.method_calls[37].args[0].aspectName @@ -301,5 +288,5 @@ def test_emit_flow( ) assert ( mock_emitter.method_calls[37].args[0].entityUrn - == "urn:li:dataProcessInstance:c4458dec616b26ad64e2c520614ef6b7" + == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) From f8bbf4974a6fc84631358ba9d36e3d5d94138639 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:06:34 +0530 Subject: [PATCH 19/39] Code formatted --- prefect_datahub/datahub_emitter.py | 17 +++++------------ tests/test_datahub_emitter.py | 6 ++++-- 2 files changed, 9 insertions(+), 14 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py 
b/prefect_datahub/datahub_emitter.py index 9dbc963..558c85d 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -1,8 +1,8 @@ """Datahub Emitter classes used to emit prefect metadata to Datahub REST.""" import asyncio -from uuid import UUID from typing import Dict, List, Optional +from uuid import UUID from datahub.api.entities.datajob import DataFlow, DataJob from datahub.api.entities.dataprocess.dataprocess_instance import ( @@ -18,7 +18,6 @@ from datahub_provider.entities import _Entity from prefect.blocks.core import Block from prefect.client import cloud, orchestration -from prefect.client.schemas import TaskRun from prefect.context import FlowRunContext, TaskRunContext from prefect.settings import PREFECT_API_URL from pydantic import Field @@ -311,17 +310,13 @@ def _emit_flow_run(self, dataflow: DataFlow, flow_run_id: UUID) -> None: flow_run_id: The prefect current running flow run id. """ flow_run = asyncio.run( - orchestration.get_client().read_flow_run( - flow_run_id=flow_run_id - ) + orchestration.get_client().read_flow_run(flow_run_id=flow_run_id) ) if self.platform_instance is not None: dpi_id = f"{self.platform_instance}.{flow_run.name}" else: dpi_id = flow_run.name - dpi = DataProcessInstance.from_dataflow( - dataflow=dataflow, id=dpi_id - ) + dpi = DataProcessInstance.from_dataflow(dataflow=dataflow, id=dpi_id) dpi_property_bag: Dict[str, str] = {} allowed_flow_run_keys = [ @@ -344,9 +339,7 @@ def _emit_flow_run(self, dataflow: DataFlow, flow_run_id: UUID) -> None: dpi.emit_process_start( emitter=self.emitter, - start_timestamp_millis=int( - flow_run.start_time.timestamp() * 1000 - ), + start_timestamp_millis=int(flow_run.start_time.timestamp() * 1000), ) def _emit_task_run( @@ -551,5 +544,5 @@ def etl(): self._emit_task_run( datajob=datajob, flow_run_name=flow_run_ctx.flow_run.name, - task_run_id=node[ID] + task_run_id=node[ID], ) diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 
4c6a1df..4cb9f89 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -78,14 +78,16 @@ def test_emit_flow( mock_emitter = Mock() mock_emit.return_value = mock_emitter - platform_instance = 'datahub_workspace' + platform_instance = "datahub_workspace" datahub_emitter = DatahubEmitter(platform_instance=platform_instance) datahub_emitter.emit_flow() flow_run_ctx: FlowRunContext = mock_run_context[1] - expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{platform_instance}.{flow_run_ctx.flow.name},prod)" + expected_dataflow_urn = ( + f"urn:li:dataFlow:(prefect,{platform_instance}.{flow_run_ctx.flow.name},prod)" + ) assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn From 089add5afdf34ba54d3fc7842204ea182a70d9a4 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:18:26 +0530 Subject: [PATCH 20/39] Code chnages as per review comment --- prefect_datahub/datahub_emitter.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 558c85d..f742807 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -386,19 +386,20 @@ def _emit_task_run( dpi_property_bag[key] = str(getattr(task_run, key)) dpi.properties.update(dpi_property_bag) - state_result_map: Dict[str, str] = {} - state_result_map[COMPLETE] = InstanceRunResult.SUCCESS - state_result_map[FAILED] = InstanceRunResult.FAILURE - state_result_map[CANCELLED] = InstanceRunResult.SKIPPED + state_result_map: Dict[str, str] = { + COMPLETE: InstanceRunResult.SUCCESS, + FAILED: InstanceRunResult.FAILURE, + CANCELLED: InstanceRunResult.SKIPPED, + } - if task_run.state_name in state_result_map: - result = state_result_map[task_run.state_name] - else: + if task_run.state_name not in state_result_map: raise Exception( f"State should be either 
complete, failed or cancelled and it was " f"{task_run.state_name}" ) + result = state_result_map[task_run.state_name] + dpi.emit_process_start( emitter=self.emitter, start_timestamp_millis=int(task_run.start_time.timestamp() * 1000), From 63181ba39b0ba06a61108dfe6ea7e7e18260e0b2 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:38:38 +0530 Subject: [PATCH 21/39] datajob tags test case added --- tests/conftest.py | 2 +- tests/test_datahub_emitter.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 531b246..e5ed64d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,7 +13,7 @@ "name": "transform", "description": "Transform the actual data", "task_key": "__main__.transform", - "tags": [], + "tags": ["etl flow task"], } mock_extract_task_run_json = { "id": "fa14a52b-d271-4c41-99cb-6b42ca7c070b", diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 4cb9f89..8e69c93 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -81,8 +81,10 @@ def test_emit_flow( platform_instance = "datahub_workspace" datahub_emitter = DatahubEmitter(platform_instance=platform_instance) + datahub_emitter.add_task() datahub_emitter.emit_flow() + task_run_ctx: TaskRunContext = mock_run_context[0] flow_run_ctx: FlowRunContext = mock_run_context[1] expected_dataflow_urn = ( @@ -255,6 +257,10 @@ def test_emit_flow( mock_emitter.method_calls[32].args[0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) + assert ( + mock_emitter.method_calls[32].args[0].aspect.tags[0].tag + == f"urn:li:tag:{task_run_ctx.task.tags[0]}" + ) assert mock_emitter.method_calls[33].args[0].aspectName == "browsePaths" assert ( mock_emitter.method_calls[33].args[0].entityUrn From c3cf5614aabd84f95389a66ff30541e03ae22707 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:46:42 +0530 Subject: [PATCH 22/39] temprarory change 
to check if PR checks getting passed or not --- prefect_datahub/datahub_emitter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index f742807..93704f2 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -265,7 +265,6 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: orchestrator=ORCHESTRATOR, id=flow_run_ctx.flow.name, cluster=self.env, - env=self.env, name=flow_run_ctx.flow.name, platform_instance=self.platform_instance, ) From c3bb7990a57d7204a15531106a1762d7aa46b65a Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:50:04 +0530 Subject: [PATCH 23/39] temp changes --- prefect_datahub/datahub_emitter.py | 1 - tests/test_datahub_emitter.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 93704f2..0863c6e 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -266,7 +266,6 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: id=flow_run_ctx.flow.name, cluster=self.env, name=flow_run_ctx.flow.name, - platform_instance=self.platform_instance, ) dataflow.description = flow_run_ctx.flow.description dataflow.tags = flow.tags diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 8e69c93..c634640 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -88,7 +88,7 @@ def test_emit_flow( flow_run_ctx: FlowRunContext = mock_run_context[1] expected_dataflow_urn = ( - f"urn:li:dataFlow:(prefect,{platform_instance}.{flow_run_ctx.flow.name},prod)" + f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" ) assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" From 63fb530452222ca32d7c6e229349e80a2f314950 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:54:19 +0530 
Subject: [PATCH 24/39] temp change --- prefect_datahub/datahub_emitter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 0863c6e..a3f5242 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -207,7 +207,6 @@ def _generate_datajob( orchestrator=ORCHESTRATOR, flow_id=flow_run_ctx.flow.name, env=self.env, - platform_instance=self.platform_instance, ) if task_run_ctx is not None: datajob = DataJob( From 4684ae1ba86d44861ea2cc692a0ea87b4076ee10 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 13 Jun 2023 17:56:28 +0530 Subject: [PATCH 25/39] Code formated --- tests/test_datahub_emitter.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index c634640..12fae0b 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -87,9 +87,7 @@ def test_emit_flow( task_run_ctx: TaskRunContext = mock_run_context[0] flow_run_ctx: FlowRunContext = mock_run_context[1] - expected_dataflow_urn = ( - f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" - ) + expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn From f42b6032ab66761ec846d63e31eaf9476539a62e Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Wed, 14 Jun 2023 10:28:05 +0530 Subject: [PATCH 26/39] Temporary changes reverted --- prefect_datahub/datahub_emitter.py | 3 +++ tests/test_datahub_emitter.py | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index a3f5242..f742807 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -207,6 +207,7 @@ def _generate_datajob( 
orchestrator=ORCHESTRATOR, flow_id=flow_run_ctx.flow.name, env=self.env, + platform_instance=self.platform_instance, ) if task_run_ctx is not None: datajob = DataJob( @@ -264,7 +265,9 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: orchestrator=ORCHESTRATOR, id=flow_run_ctx.flow.name, cluster=self.env, + env=self.env, name=flow_run_ctx.flow.name, + platform_instance=self.platform_instance, ) dataflow.description = flow_run_ctx.flow.description dataflow.tags = flow.tags diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 12fae0b..8e69c93 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -87,7 +87,9 @@ def test_emit_flow( task_run_ctx: TaskRunContext = mock_run_context[0] flow_run_ctx: FlowRunContext = mock_run_context[1] - expected_dataflow_urn = f"urn:li:dataFlow:(prefect,{flow_run_ctx.flow.name},prod)" + expected_dataflow_urn = ( + f"urn:li:dataFlow:(prefect,{platform_instance}.{flow_run_ctx.flow.name},prod)" + ) assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn From a3a86f1ba8d0ef863e19c3624cb838f852200d34 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Wed, 14 Jun 2023 16:51:58 +0530 Subject: [PATCH 27/39] Concept mapping doc added and code change as per review comment --- docs/concept_mapping.md | 6 ++++-- prefect_datahub/datahub_emitter.py | 16 +++++++++++++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/docs/concept_mapping.md b/docs/concept_mapping.md index c5ffaed..27a864f 100644 --- a/docs/concept_mapping.md +++ b/docs/concept_mapping.md @@ -6,5 +6,7 @@ Prefect concepts are documented [here](https://docs.prefect.io/latest/concepts/) Prefect Concept | DataHub Concept | URN | Possible Values --- | --- | --- | --- [Flow](https://docs.prefect.io/2.10.13/concepts/flows/#flows) | 
[DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) | urn:li:dataFlow:(prefect, [platform-instance.]<flow-name>,prod) | <flow-name> is the user given a name like “etl”. if flow-name is not set by a user then prefect derive it from function-name annotated with @flow -[Flow Run](https://docs.prefect.io/latest/concepts/flows/#flow-runs) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) | urn:li:dataFlow:(prefect, [platform-instance.]<flow-name>,prod) | <flow-name> is the user given a name like “etl”. if flow-name is not set by a user then prefect derive it from function-name annotated with @flow -[Flow](https://docs.prefect.io/2.10.13/concepts/flows/#flows) | [DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) | urn:li:dataFlow:(prefect, [platform-instance.]<flow-name>,prod) | <flow-name> is the user given a name like “etl”. if flow-name is not set by a user then prefect derive it from function-name annotated with @flow +[Flow Run](https://docs.prefect.io/latest/concepts/flows/#flow-runs) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) | urn:li:dataProcessInstance:GUID(prefect,[platform-instance.]<flow-run-name>,prod) | <flow-run-name> is an autogenerated string for a flow execution. Sample value “attentive-stingray” +[Task](https://docs.prefect.io/2.10.13/concepts/tasks/#tasks) | [DataJob](https://datahubproject.io/docs/generated/metamodel/entities/datajob/) | urn:li:dataJob:(urn:li:dataFlow:(prefect, [platform-instance].<flow-name>, prod), <task-key>) | <task-key> is an autogenerated string for a task. Sample value “\_\_main\_\_.transform”. 
+[Task Run](https://docs.prefect.io/latest/concepts/tasks/#tasks) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) | urn:li:dataProcessInstance:GUID(prefect,[platform-instance.]<flow-run-name>.<task-run-name>,prod) | <task-run-name> is an autogenerated string for a task execution. Sample value “Extract-0”. +[Task Tag](https://docs.prefect.io/latest/concepts/tasks/#tags) | [Tag](https://datahubproject.io/docs/generated/metamodel/entities/tag/) | urn:li:tag:<tag-name> | <tag-name> is a label assign to a task. diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index f742807..fb8dc17 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -1,6 +1,7 @@ """Datahub Emitter classes used to emit prefect metadata to Datahub REST.""" import asyncio +import traceback from typing import Dict, List, Optional from uuid import UUID @@ -16,6 +17,7 @@ from datahub.utilities.urns.data_job_urn import DataJobUrn from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub_provider.entities import _Entity +from prefect import get_run_logger from prefect.blocks.core import Block from prefect.client import cloud, orchestration from prefect.context import FlowRunContext, TaskRunContext @@ -160,8 +162,13 @@ def _get_workspace(self) -> Optional[str]: try: asyncio.run(cloud.get_cloud_client().api_healthcheck()) except Exception: + get_run_logger().debug(traceback.format_exc()) return None if "workspaces" not in PREFECT_API_URL.value(): + get_run_logger().debug( + "Cannot fetch workspace name. Please login to prefect cloud using " + "command 'prefect cloud login'." 
+ ) return None current_workspace_id = PREFECT_API_URL.value().split("/")[-1] workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) @@ -264,7 +271,6 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: dataflow = DataFlow( orchestrator=ORCHESTRATOR, id=flow_run_ctx.flow.name, - cluster=self.env, env=self.env, name=flow_run_ctx.flow.name, platform_instance=self.platform_instance, @@ -505,7 +511,9 @@ def etl(): if workspace_name is not None: mcp = MetadataChangeProposalWrapper( entityUrn=str(dataflow.urn), - aspect=BrowsePathsClass(paths=[f"/prefect/prod/{workspace_name}"]), + aspect=BrowsePathsClass( + paths=[f"/{ORCHESTRATOR}/{self.env}/{workspace_name}"] + ), ) self.emitter.emit(mcp) self._emit_flow_run(dataflow, flow_run_ctx.flow_run.id) @@ -539,7 +547,9 @@ def etl(): if workspace_name is not None: mcp = MetadataChangeProposalWrapper( entityUrn=str(datajob.urn), - aspect=BrowsePathsClass(paths=[f"/prefect/prod/{workspace_name}"]), + aspect=BrowsePathsClass( + paths=[f"/{ORCHESTRATOR}/{self.env}/{workspace_name}"] + ), ) self.emitter.emit(mcp) self._emit_task_run( From 06a1de56095fd996c838065be3f0a200ecf5d821 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Thu, 15 Jun 2023 12:10:38 +0530 Subject: [PATCH 28/39] Author name modified --- .cruft.json | 6 ++-- prefect_datahub/datahub_emitter.py | 47 ++++++++++++++++++------------ setup.py | 6 ++-- 3 files changed, 34 insertions(+), 25 deletions(-) diff --git a/.cruft.json b/.cruft.json index 565337a..4f23039 100644 --- a/.cruft.json +++ b/.cruft.json @@ -4,12 +4,12 @@ "checkout": null, "context": { "cookiecutter": { - "full_name": "Shubham Jagtap", + "full_name": "Arcyl Data", "email": "shubham.jagtap@gslab.com", - "github_organization": "shubhamjagtap639", + "github_organization": "PrefectHQ", "collection_name": "prefect-datahub", "collection_slug": "prefect_datahub", - "collection_short_description": "Block used to emit prefect task and flow related metadata to Datahub 
REST", + "collection_short_description": "Metadata emitter for datahub", "_copy_without_render": [ ".github/workflows/*.yml" ], diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index fb8dc17..eeb339f 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -20,6 +20,8 @@ from prefect import get_run_logger from prefect.blocks.core import Block from prefect.client import cloud, orchestration +from prefect.client.schemas import FlowRun, TaskRun, Workspace +from prefect.client.schemas.objects import Flow from prefect.context import FlowRunContext, TaskRunContext from prefect.settings import PREFECT_API_URL from pydantic import Field @@ -136,7 +138,7 @@ def __init__(self, *args, **kwargs): Initialize datahub rest emitter """ super().__init__(*args, **kwargs) - self.datajobs_to_emit = {} + self.datajobs_to_emit: Dict[str, DataJob] = {} self.emitter = DatahubRestEmitter(gms_server=self.datahub_rest_url) self.emitter.test_connection() @@ -145,7 +147,7 @@ def _entities_to_urn_list(self, iolets: List[_Entity]) -> List[DatasetUrn]: Convert list of _entity to list of dataser urn Args: - iolets: The list of entities. + iolets (list[_Entity]): The list of entities. Returns: The list of Dataset URN. @@ -171,18 +173,20 @@ def _get_workspace(self) -> Optional[str]: ) return None current_workspace_id = PREFECT_API_URL.value().split("/")[-1] - workspaces = asyncio.run(cloud.get_cloud_client().read_workspaces()) + workspaces: List[Workspace] = asyncio.run( + cloud.get_cloud_client().read_workspaces() + ) for workspace in workspaces: if str(workspace.workspace_id) == current_workspace_id: return workspace.workspace_name return None - async def _get_flow_run_graph(self, flow_run_id) -> List[Dict]: + async def _get_flow_run_graph(self, flow_run_id: str) -> List[Dict]: """ Fetch the flow run graph for provided flow run id Args: - flow_run_id: The flow run id. + flow_run_id (str): The flow run id. 
Returns: The flow run graph in json format. @@ -203,9 +207,10 @@ def _generate_datajob( Assign description, tags, and properties to created datajob. Args: - flow_run_ctx: The prefect current running flow run context. - task_run_ctx: The prefect current running task run context. - task_key: The task key. + flow_run_ctx (FlowRunContext): The prefect current running flow run context. + task_run_ctx (Optional[TaskRunContext]): The prefect current running task \ + run context. + task_key (Optional[str]): The task key. Returns: The datajob entity. @@ -260,12 +265,12 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: Assign description, tags, and properties to created dataflow. Args: - flow_run_ctx: The prefect current running flow run context. + flow_run_ctx (FlowRunContext): The prefect current running flow run context. Returns: The dataflow entity. """ - flow = asyncio.run( + flow: Flow = asyncio.run( orchestration.get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id) ) dataflow = DataFlow( @@ -312,10 +317,11 @@ def _emit_flow_run(self, dataflow: DataFlow, flow_run_id: UUID) -> None: Assign flow run properties to data process instance properties. Args: - dataflow: The datahub dataflow entity used to create data process instance. - flow_run_id: The prefect current running flow run id. + dataflow (DataFlow): The datahub dataflow entity used to create \ + data process instance. + flow_run_id (UUID): The prefect current running flow run id. """ - flow_run = asyncio.run( + flow_run: FlowRun = asyncio.run( orchestration.get_client().read_flow_run(flow_run_id=flow_run_id) ) if self.platform_instance is not None: @@ -357,11 +363,14 @@ def _emit_task_run( Assign task run properties to data process instance properties. Args: - datajob: The datahub datajob entity used to create data process instance. - flow_run_name: The prefect current running flow run name. - task_run_id: The prefect task run id. 
+ datajob (DataJob): The datahub datajob entity used to create \ + data process instance. + flow_run_name (str): The prefect current running flow run name. + task_run_id (str): The prefect task run id. """ - task_run = asyncio.run(orchestration.get_client().read_task_run(task_run_id)) + task_run: TaskRun = asyncio.run( + orchestration.get_client().read_task_run(task_run_id) + ) if self.platform_instance is not None: dpi_id = f"{self.platform_instance}.{flow_run_name}.{task_run.name}" else: @@ -431,8 +440,8 @@ def add_task( and outlets respectively. Args: - inputs (list): The list of task inputs. - outputs (list): The list of task outputs. + inputs (Optional[list]): The list of task inputs. + outputs (Optional[list]): The list of task outputs. Example: Emit the task metadata as show below: diff --git a/setup.py b/setup.py index 8852888..64d379b 100644 --- a/setup.py +++ b/setup.py @@ -13,12 +13,12 @@ setup( name="prefect-datahub", - description="Block used to emit prefect task and flow related metadata to Datahub REST", + description="Metadata emitter for datahub", license="Apache License 2.0", - author="Shubham Jagtap", + author="Acryl Data", author_email="shubham.jagtap@gslab.com", keywords="prefect", - url="https://github.com/shubhamjagtap639/prefect-datahub", + url="https://github.com/PrefectHQ/prefect-datahub", long_description=readme, long_description_content_type="text/markdown", version=versioneer.get_version(), From 54891a7d1f259e5fb01a1a86f73176c5ef3a9cda Mon Sep 17 00:00:00 2001 From: MohdSiddiqueBagwan Date: Fri, 16 Jun 2023 22:27:18 +0530 Subject: [PATCH 29/39] doc update --- README.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 8d375c4..88cad81 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Emit flows & tasks metadata to datahub rest with `prefect-datahub` +# Emit flows & tasks metadata to DataHub rest with `prefect-datahub`

@@ -20,28 +20,28 @@ ## Welcome! -The `prefect-datahub` collection makes it easy to leverage the capabilities of datahub emitter in your flows, featuring support for ingesting metadata of flows, tasks & workspace to datahub gms rest. +The `prefect-datahub` collection makes it easy to leverage the capabilities of DataHub emitter in your flows, featuring support for ingesting metadata of flows, tasks & workspace to DataHub gms rest. ## Getting Started -### Setup Datahub UI +### Setup DataHub UI -In order to use 'prefect-datahub' collection, you'll first need to deploy the new instance of Datahub. +In order to use 'prefect-datahub' collection, you'll first need to deploy the new instance of DataHub. -You can get the instructions on deploying the open source Datahub locally by navigating to the [apps page](https://datahubproject.io/docs/quickstart). +You can get the instructions on deploying the open source DataHub by navigating to the [apps page](https://datahubproject.io/docs/quickstart). -Successful deployment of Datahub locally will lead creation of datahub GMS service running on 'http://localhost:8080'. +Successful deployment of DataHub will lead creation of DataHub GMS service running on 'http://localhost:8080' if you have deployed it on local system. ### Saving configurations to a block This is a one-time activity, where you can save the configuration on the [Prefect block document store](https://docs.prefect.io/2.10.13/concepts/blocks/#saving-blocks). -While saving you can provide below configutions. Default value will get set if not provided while saving the configuration to block. +While saving you can provide below configurations. Default value will get set if not provided while saving the configuration to block. 
Config | Type | Default | Description --- | --- | --- | --- -datahub_rest_url | `str` | *http://localhost:8080* | Datahub GMS Rest url +datahub_rest_url | `str` | *http://localhost:8080* | DataHub GMS REST URL env | `str` | *PROD* | The environment that all assets produced by this orchestrator belong to. For more detail and possible values refer [here](https://datahubproject.io/docs/graphql/enums/#fabrictype). platform_instance | `str` | *None* | The instance of the platform that all assets produced by this recipe belong to. For more detail please refer [here](https://datahubproject.io/docs/platform-instances/). @@ -54,7 +54,7 @@ DatahubEmitter( ).save("BLOCK-NAME-PLACEHOLDER") ``` -Congrats! You can now load the saved block to use your credentials in your Python code: +Congrats! You can now load the saved block to use your credentials in your Flow code: ```python from prefect_datahub import DatahubEmitter From b96bd03cb080096ffff8fcbcdeaa9ea9bb7c1748 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Mon, 19 Jun 2023 13:23:59 +0530 Subject: [PATCH 30/39] code formatted and changes as per review comment --- README.md | 8 +- prefect_datahub/datahub_emitter.py | 127 +++++++++++++++++------------ tests/conftest.py | 12 ++- 3 files changed, 92 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index 88cad81..48630d2 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ DatahubEmitter( ).save("BLOCK-NAME-PLACEHOLDER") ``` -Congrats! You can now load the saved block to use your credentials in your Flow code: +Congrats! You can now load the saved block to use your configurations in your Flow code: ```python from prefect_datahub import DatahubEmitter @@ -71,7 +71,9 @@ DatahubEmitter.load("BLOCK-NAME-PLACEHOLDER") prefect block register -m prefect_datahub ``` -After installing `prefect-datahub` and [saving the configution](#saving-configurations-to-block), you can easily use it within your flows to help you emit metadata as show below! 
+### Load the saved block in prefect workflows + +After installing `prefect-datahub` and [saving the configution](#saving-configurations-to-a-block), you can easily use it within your prefect workflows to help you emit metadata event as show below! ```python from datahub_provider.entities import Dataset @@ -96,6 +98,8 @@ def etl(): datahub_emitter.emit_flow() ``` +**Note**: To emit the tasks, user compulsory need to emit flow. Otherwise nothing will get emit. + ## Resources For more tips on how to use tasks and flows in a Collection, check out [Using Collections](https://docs.prefect.io/collections/usage/)! diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index eeb339f..0fbd2ea 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -108,7 +108,7 @@ class DatahubEmitter(Block): _block_type_name = "datahub emitter" # replace this with a relevant logo; defaults to Prefect logo - _logo_url = "https://images.ctfassets.net/gm98wzqotmnx/08yCE6xpJMX9Kjl5VArDS/c2ede674c20f90b9b6edeab71feffac9/prefect-200x200.png?h=250" # noqa + _logo_url = "https://datahubproject.io/img/datahub-logo-color-mark.svg" # noqa _documentation_url = "https://shubhamjagtap639.github.io/prefect-datahub/datahub_emitter/#prefect-datahub.datahub_emitter.DatahubEmitter" # noqa datahub_rest_url: Optional[str] = Field( @@ -196,6 +196,22 @@ async def _get_flow_run_graph(self, flow_run_id: str) -> List[Dict]: ) return response.json() + def _emit_browsepath(self, urn: str, workspace_name: str) -> None: + """ + Emit browsepath for provided urn. Set path as orchestrator/env/workspace_name. 
+ + Args: + urn (str): The entity URN + workspace_name (str): The prefect cloud workspace name + """ + mcp = MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=BrowsePathsClass( + paths=[f"/{ORCHESTRATOR}/{self.env}/{workspace_name}"] + ), + ) + self.emitter.emit(mcp) + def _generate_datajob( self, flow_run_ctx: FlowRunContext, @@ -310,6 +326,58 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: return dataflow + def _emit_tasks( + self, + flow_run_ctx: FlowRunContext, + dataflow: DataFlow, + workspace_name: Optional[str] = None, + ): + """ + Emit prefect tasks metadata to datahub rest. Add upstream dependencies if + present for each task. + + Args: + flow_run_ctx (FlowRunContext): The prefect current running flow run context + dataflow (DataFlow): The datahub dataflow entity. + workspace_name Optional(str): The prefect cloud workpace name. + """ + graph_json = asyncio.run( + self._get_flow_run_graph(str(flow_run_ctx.flow_run.id)) + ) + task_run_key_map = { + str(prefect_future.task_run.id): prefect_future.task_run.task_key + for prefect_future in flow_run_ctx.task_run_futures + } + if graph_json: + get_run_logger().info("Emitting tasks to datahub...") + for node in graph_json: + datajob_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(dataflow.urn), + job_id=task_run_key_map[node[ID]], + ) + if str(datajob_urn) in self.datajobs_to_emit: + datajob = self.datajobs_to_emit[str(datajob_urn)] + else: + datajob = self._generate_datajob( + flow_run_ctx=flow_run_ctx, task_key=task_run_key_map[node[ID]] + ) + for each in node[UPSTREAM_DEPENDENCIES]: + upstream_task_urn = DataJobUrn.create_from_ids( + data_flow_urn=str(dataflow.urn), + job_id=task_run_key_map[each[ID]], + ) + datajob.upstream_urns.extend([upstream_task_urn]) + datajob.emit(self.emitter) + + if workspace_name is not None: + self._emit_browsepath(str(datajob.urn), workspace_name) + + self._emit_task_run( + datajob=datajob, + flow_run_name=flow_run_ctx.flow_run.name, + 
task_run_id=node[ID], + ) + def _emit_flow_run(self, dataflow: DataFlow, flow_run_id: UUID) -> None: """ Emit prefect flow run to datahub rest. Prefect flow run get mapped with datahub @@ -485,11 +553,9 @@ def etl(): def emit_flow(self) -> None: """ Emit prefect current running flow metadata to datahub rest. Prefect flow gets - mapped with datahub dataflow entity. Add upstream dependencies if present for - each task. Emit the prefect task run metadata as well. If the user hasn't - called add_task in the task function still emit_flow will emit a task but - without task name, description,tags and properties. - Emit the prefect workspace metadata as well. + mapped with datahub dataflow entity. If the user hasn't called add_task in + the task function still emit_flow will emit a task but without task name, + description,tags and properties. Example: @@ -516,53 +582,10 @@ def etl(): # Emit flow and flow run dataflow = self._generate_dataflow(flow_run_ctx=flow_run_ctx) + get_run_logger().info("Emitting flow to datahub...") dataflow.emit(self.emitter) if workspace_name is not None: - mcp = MetadataChangeProposalWrapper( - entityUrn=str(dataflow.urn), - aspect=BrowsePathsClass( - paths=[f"/{ORCHESTRATOR}/{self.env}/{workspace_name}"] - ), - ) - self.emitter.emit(mcp) + self._emit_browsepath(str(dataflow.urn), workspace_name) self._emit_flow_run(dataflow, flow_run_ctx.flow_run.id) - # Emit task, task run and add upstream task if present for each task - graph_json = asyncio.run( - self._get_flow_run_graph(str(flow_run_ctx.flow_run.id)) - ) - task_run_key_map = { - str(prefect_future.task_run.id): prefect_future.task_run.task_key - for prefect_future in flow_run_ctx.task_run_futures - } - for node in graph_json: - datajob_urn = DataJobUrn.create_from_ids( - data_flow_urn=str(dataflow.urn), - job_id=task_run_key_map[node[ID]], - ) - if str(datajob_urn) in self.datajobs_to_emit: - datajob = self.datajobs_to_emit[str(datajob_urn)] - else: - datajob = self._generate_datajob( - 
flow_run_ctx=flow_run_ctx, task_key=task_run_key_map[node[ID]] - ) - for each in node[UPSTREAM_DEPENDENCIES]: - upstream_task_urn = DataJobUrn.create_from_ids( - data_flow_urn=str(dataflow.urn), - job_id=task_run_key_map[each[ID]], - ) - datajob.upstream_urns.extend([upstream_task_urn]) - datajob.emit(self.emitter) - if workspace_name is not None: - mcp = MetadataChangeProposalWrapper( - entityUrn=str(datajob.urn), - aspect=BrowsePathsClass( - paths=[f"/{ORCHESTRATOR}/{self.env}/{workspace_name}"] - ), - ) - self.emitter.emit(mcp) - self._emit_task_run( - datajob=datajob, - flow_run_name=flow_run_ctx.flow_run.name, - task_run_id=node[ID], - ) + self._emit_tasks(flow_run_ctx, dataflow, workspace_name) diff --git a/tests/conftest.py b/tests/conftest.py index e5ed64d..75b7f43 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import asyncio import json +import logging from unittest.mock import MagicMock, patch from uuid import UUID @@ -389,7 +390,16 @@ async def mock_task_run_future(): @pytest.fixture(scope="module") -def mock_run_context(): +def mock_run_logger(): + with patch( + "prefect_datahub.datahub_emitter.get_run_logger", + return_value=logging.getLogger(), + ) as mock_logger: + yield mock_logger + + +@pytest.fixture(scope="module") +def mock_run_context(mock_run_logger): task_run_ctx = MagicMock() task_run_ctx.task.task_key = mock_transform_task_json["task_key"] task_run_ctx.task.name = mock_transform_task_json["name"] From 054ac1583528ca29390826398b93410f52b8735f Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 20 Jun 2023 11:04:02 +0530 Subject: [PATCH 31/39] Error handling added --- prefect_datahub/datahub_emitter.py | 57 ++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 15 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 0fbd2ea..4ce32f6 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -181,7 +181,7 @@ def 
_get_workspace(self) -> Optional[str]: return workspace.workspace_name return None - async def _get_flow_run_graph(self, flow_run_id: str) -> List[Dict]: + async def _get_flow_run_graph(self, flow_run_id: str) -> Optional[List[Dict]]: """ Fetch the flow run graph for provided flow run id @@ -191,9 +191,13 @@ async def _get_flow_run_graph(self, flow_run_id: str) -> List[Dict]: Returns: The flow run graph in json format. """ - response = await orchestration.get_client()._client.get( - f"/flow_runs/{flow_run_id}/graph" - ) + try: + response = await orchestration.get_client()._client.get( + f"/flow_runs/{flow_run_id}/graph" + ) + except Exception: + get_run_logger().debug(traceback.format_exc()) + return None return response.json() def _emit_browsepath(self, urn: str, workspace_name: str) -> None: @@ -275,7 +279,7 @@ def _generate_datajob( return datajob return None - def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: + def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> Optional[DataFlow]: """ Create dataflow entity using flow run ctx. Assign description, tags, and properties to created dataflow. @@ -286,9 +290,17 @@ def _generate_dataflow(self, flow_run_ctx: FlowRunContext) -> DataFlow: Returns: The dataflow entity. """ - flow: Flow = asyncio.run( - orchestration.get_client().read_flow(flow_id=flow_run_ctx.flow_run.flow_id) - ) + try: + flow: Flow = asyncio.run( + orchestration.get_client().read_flow( + flow_id=flow_run_ctx.flow_run.flow_id + ) + ) + except Exception: + get_run_logger().debug(traceback.format_exc()) + return None + assert flow + dataflow = DataFlow( orchestrator=ORCHESTRATOR, id=flow_run_ctx.flow.name, @@ -331,7 +343,7 @@ def _emit_tasks( flow_run_ctx: FlowRunContext, dataflow: DataFlow, workspace_name: Optional[str] = None, - ): + ) -> None: """ Emit prefect tasks metadata to datahub rest. Add upstream dependencies if present for each task. 
@@ -344,6 +356,9 @@ def _emit_tasks( graph_json = asyncio.run( self._get_flow_run_graph(str(flow_run_ctx.flow_run.id)) ) + if graph_json is None: + return + task_run_key_map = { str(prefect_future.task_run.id): prefect_future.task_run.task_key for prefect_future in flow_run_ctx.task_run_futures @@ -389,9 +404,15 @@ def _emit_flow_run(self, dataflow: DataFlow, flow_run_id: UUID) -> None: data process instance. flow_run_id (UUID): The prefect current running flow run id. """ - flow_run: FlowRun = asyncio.run( - orchestration.get_client().read_flow_run(flow_run_id=flow_run_id) - ) + try: + flow_run: FlowRun = asyncio.run( + orchestration.get_client().read_flow_run(flow_run_id=flow_run_id) + ) + except Exception: + get_run_logger().debug(traceback.format_exc()) + return + assert flow_run + if self.platform_instance is not None: dpi_id = f"{self.platform_instance}.{flow_run.name}" else: @@ -436,9 +457,15 @@ def _emit_task_run( flow_run_name (str): The prefect current running flow run name. task_run_id (str): The prefect task run id. 
""" - task_run: TaskRun = asyncio.run( - orchestration.get_client().read_task_run(task_run_id) - ) + try: + task_run: TaskRun = asyncio.run( + orchestration.get_client().read_task_run(task_run_id) + ) + except Exception: + get_run_logger().debug(traceback.format_exc()) + return + assert task_run + if self.platform_instance is not None: dpi_id = f"{self.platform_instance}.{flow_run_name}.{task_run.name}" else: From 7fc633bc4d2c34d8809da2fa0e75e7ee10302f46 Mon Sep 17 00:00:00 2001 From: MohdSiddiqueBagwan Date: Tue, 20 Jun 2023 13:05:31 +0530 Subject: [PATCH 32/39] remove unwanted if condition --- prefect_datahub/datahub_emitter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index 4ce32f6..d7180ac 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -363,8 +363,9 @@ def _emit_tasks( str(prefect_future.task_run.id): prefect_future.task_run.task_key for prefect_future in flow_run_ctx.task_run_futures } - if graph_json: - get_run_logger().info("Emitting tasks to datahub...") + + get_run_logger().info("Emitting tasks to datahub...") + for node in graph_json: datajob_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), @@ -610,9 +611,12 @@ def etl(): # Emit flow and flow run dataflow = self._generate_dataflow(flow_run_ctx=flow_run_ctx) get_run_logger().info("Emitting flow to datahub...") + dataflow.emit(self.emitter) + if workspace_name is not None: self._emit_browsepath(str(dataflow.urn), workspace_name) + self._emit_flow_run(dataflow, flow_run_ctx.flow_run.id) self._emit_tasks(flow_run_ctx, dataflow, workspace_name) From 026c8fd1051d9389f53bcd2241dd06cbbca5324b Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 20 Jun 2023 13:27:11 +0530 Subject: [PATCH 33/39] Remove urn and possible value cols from concepts mapping table --- docs/concept_mapping.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 
deletions(-) diff --git a/docs/concept_mapping.md b/docs/concept_mapping.md index 27a864f..b6d4055 100644 --- a/docs/concept_mapping.md +++ b/docs/concept_mapping.md @@ -3,10 +3,10 @@ Prefect concepts are documented [here](https://docs.prefect.io/latest/concepts/), and datahub concepts are documented [here](https://datahubproject.io/docs/what-is-datahub/datahub-concepts). -Prefect Concept | DataHub Concept | URN | Possible Values ---- | --- | --- | --- -[Flow](https://docs.prefect.io/2.10.13/concepts/flows/#flows) | [DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) | urn:li:dataFlow:(prefect, [platform-instance.]<flow-name>,prod) | <flow-name> is the user given a name like “etl”. if flow-name is not set by a user then prefect derive it from function-name annotated with @flow -[Flow Run](https://docs.prefect.io/latest/concepts/flows/#flow-runs) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) | urn:li:dataProcessInstance:GUID(prefect,[platform-instance.]<flow-run-name>,prod) | <flow-run-name> is an autogenerated string for a flow execution. Sample value “attentive-stingray” -[Task](https://docs.prefect.io/2.10.13/concepts/tasks/#tasks) | [DataJob](https://datahubproject.io/docs/generated/metamodel/entities/datajob/) | urn:li:dataJob:(urn:li:dataFlow:(prefect, [platform-instance].<flow-name>, prod), <task-key>) | <task-key> is an autogenerated string for a task. Sample value “\_\_main\_\_.transform”. -[Task Run](https://docs.prefect.io/latest/concepts/tasks/#tasks) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) | urn:li:dataProcessInstance:GUID(prefect,[platform-instance.]<flow-run-name>.<task-run-name>,prod) | <task-run-name> is an autogenerated string for a task execution. Sample value “Extract-0”. 
-[Task Tag](https://docs.prefect.io/latest/concepts/tasks/#tags) | [Tag](https://datahubproject.io/docs/generated/metamodel/entities/tag/) | urn:li:tag:<tag-name> | <tag-name> is a label assign to a task. +Prefect Concept | DataHub Concept +--- | --- +[Flow](https://docs.prefect.io/2.10.13/concepts/flows/#flows) | [DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) +[Flow Run](https://docs.prefect.io/latest/concepts/flows/#flow-runs) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) +[Task](https://docs.prefect.io/2.10.13/concepts/tasks/#tasks) | [DataJob](https://datahubproject.io/docs/generated/metamodel/entities/datajob/) +[Task Run](https://docs.prefect.io/latest/concepts/tasks/#tasks) | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) +[Task Tag](https://docs.prefect.io/latest/concepts/tasks/#tags) | [Tag](https://datahubproject.io/docs/generated/metamodel/entities/tag/) From b44a9a30465db8fa280a39a4a0df2d837ed8256c Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 27 Jun 2023 17:05:22 +0530 Subject: [PATCH 34/39] file reformatted --- prefect_datahub/datahub_emitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/prefect_datahub/datahub_emitter.py b/prefect_datahub/datahub_emitter.py index d7180ac..c574c49 100644 --- a/prefect_datahub/datahub_emitter.py +++ b/prefect_datahub/datahub_emitter.py @@ -363,9 +363,9 @@ def _emit_tasks( str(prefect_future.task_run.id): prefect_future.task_run.task_key for prefect_future in flow_run_ctx.task_run_futures } - + get_run_logger().info("Emitting tasks to datahub...") - + for node in graph_json: datajob_urn = DataJobUrn.create_from_ids( data_flow_urn=str(dataflow.urn), From 2af3209128cf5facc6bcd2803e1460433420c4ff Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 27 Jun 2023 19:38:25 +0530 Subject: [PATCH 35/39] Code changes to support python3.7 
--- tests/test_datahub_emitter.py | 140 +++++++++++++++++----------------- 1 file changed, 70 insertions(+), 70 deletions(-) diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 8e69c93..6760ea5 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -91,210 +91,210 @@ def test_emit_flow( f"urn:li:dataFlow:(prefect,{platform_instance}.{flow_run_ctx.flow.name},prod)" ) - assert mock_emitter.method_calls[1].args[0].aspectName == "dataFlowInfo" - assert mock_emitter.method_calls[1].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[2].args[0].aspectName == "ownership" - assert mock_emitter.method_calls[2].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[3].args[0].aspectName == "globalTags" - assert mock_emitter.method_calls[3].args[0].entityUrn == expected_dataflow_urn - assert mock_emitter.method_calls[4].args[0].aspectName == "browsePaths" - assert mock_emitter.method_calls[4].args[0].entityUrn == expected_dataflow_urn - assert ( - mock_emitter.method_calls[8].args[0].aspectName + assert mock_emitter.method_calls[1][1][0].aspectName == "dataFlowInfo" + assert mock_emitter.method_calls[1][1][0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[2][1][0].aspectName == "ownership" + assert mock_emitter.method_calls[2][1][0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[3][1][0].aspectName == "globalTags" + assert mock_emitter.method_calls[3][1][0].entityUrn == expected_dataflow_urn + assert mock_emitter.method_calls[4][1][0].aspectName == "browsePaths" + assert mock_emitter.method_calls[4][1][0].entityUrn == expected_dataflow_urn + assert ( + mock_emitter.method_calls[8][1][0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[8].args[0].entityUrn + mock_emitter.method_calls[8][1][0].entityUrn == "urn:li:dataProcessInstance:a95d24db6abd98384fc1d4c8540098a4" ) assert ( - 
mock_emitter.method_calls[9].args[0].aspectName + mock_emitter.method_calls[9][1][0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[9].args[0].entityUrn + mock_emitter.method_calls[9][1][0].entityUrn == "urn:li:dataProcessInstance:a95d24db6abd98384fc1d4c8540098a4" ) assert ( - mock_emitter.method_calls[10].args[0].aspectName + mock_emitter.method_calls[10][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[10].args[0].entityUrn + mock_emitter.method_calls[10][1][0].entityUrn == "urn:li:dataProcessInstance:a95d24db6abd98384fc1d4c8540098a4" ) - assert mock_emitter.method_calls[11].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[11][1][0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[11].args[0].entityUrn + mock_emitter.method_calls[11][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) - assert mock_emitter.method_calls[12].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[12][1][0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[12].args[0].entityUrn + mock_emitter.method_calls[12][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) - assert mock_emitter.method_calls[13].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[13][1][0].aspectName == "ownership" assert ( - mock_emitter.method_calls[13].args[0].entityUrn + mock_emitter.method_calls[13][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) - assert mock_emitter.method_calls[14].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[14][1][0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[14].args[0].entityUrn + mock_emitter.method_calls[14][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) - assert mock_emitter.method_calls[15].args[0].aspectName == "browsePaths" + 
assert mock_emitter.method_calls[15][1][0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[15].args[0].entityUrn + mock_emitter.method_calls[15][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.extract)" ) assert ( - mock_emitter.method_calls[16].args[0].aspectName + mock_emitter.method_calls[16][1][0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[16].args[0].entityUrn + mock_emitter.method_calls[16][1][0].entityUrn == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( - mock_emitter.method_calls[17].args[0].aspectName + mock_emitter.method_calls[17][1][0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[17].args[0].entityUrn + mock_emitter.method_calls[17][1][0].entityUrn == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( - mock_emitter.method_calls[18].args[0].aspectName + mock_emitter.method_calls[18][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[18].args[0].entityUrn + mock_emitter.method_calls[18][1][0].entityUrn == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( - mock_emitter.method_calls[19].args[0].aspectName + mock_emitter.method_calls[19][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[19].args[0].entityUrn + mock_emitter.method_calls[19][1][0].entityUrn == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) - assert mock_emitter.method_calls[20].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[20][1][0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[20].args[0].entityUrn + mock_emitter.method_calls[20][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) - assert mock_emitter.method_calls[21].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[21][1][0].aspectName == 
"dataJobInputOutput" assert ( - mock_emitter.method_calls[21].args[0].entityUrn + mock_emitter.method_calls[21][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) - assert mock_emitter.method_calls[22].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[22][1][0].aspectName == "ownership" assert ( - mock_emitter.method_calls[22].args[0].entityUrn + mock_emitter.method_calls[22][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) - assert mock_emitter.method_calls[23].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[23][1][0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[23].args[0].entityUrn + mock_emitter.method_calls[23][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) - assert mock_emitter.method_calls[24].args[0].aspectName == "browsePaths" + assert mock_emitter.method_calls[24][1][0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[24].args[0].entityUrn + mock_emitter.method_calls[24][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.load)" ) assert ( - mock_emitter.method_calls[25].args[0].aspectName + mock_emitter.method_calls[25][1][0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[25].args[0].entityUrn + mock_emitter.method_calls[25][1][0].entityUrn == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( - mock_emitter.method_calls[26].args[0].aspectName + mock_emitter.method_calls[26][1][0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[26].args[0].entityUrn + mock_emitter.method_calls[26][1][0].entityUrn == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( - mock_emitter.method_calls[27].args[0].aspectName + mock_emitter.method_calls[27][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[27].args[0].entityUrn + 
mock_emitter.method_calls[27][1][0].entityUrn == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( - mock_emitter.method_calls[28].args[0].aspectName + mock_emitter.method_calls[28][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[28].args[0].entityUrn + mock_emitter.method_calls[28][1][0].entityUrn == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) - assert mock_emitter.method_calls[29].args[0].aspectName == "dataJobInfo" + assert mock_emitter.method_calls[29][1][0].aspectName == "dataJobInfo" assert ( - mock_emitter.method_calls[29].args[0].entityUrn + mock_emitter.method_calls[29][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) - assert mock_emitter.method_calls[30].args[0].aspectName == "dataJobInputOutput" + assert mock_emitter.method_calls[30][1][0].aspectName == "dataJobInputOutput" assert ( - mock_emitter.method_calls[30].args[0].entityUrn + mock_emitter.method_calls[30][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) - assert mock_emitter.method_calls[31].args[0].aspectName == "ownership" + assert mock_emitter.method_calls[31][1][0].aspectName == "ownership" assert ( - mock_emitter.method_calls[31].args[0].entityUrn + mock_emitter.method_calls[31][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) - assert mock_emitter.method_calls[32].args[0].aspectName == "globalTags" + assert mock_emitter.method_calls[32][1][0].aspectName == "globalTags" assert ( - mock_emitter.method_calls[32].args[0].entityUrn + mock_emitter.method_calls[32][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) assert ( - mock_emitter.method_calls[32].args[0].aspect.tags[0].tag + mock_emitter.method_calls[32][1][0].aspect.tags[0].tag == f"urn:li:tag:{task_run_ctx.task.tags[0]}" ) - assert mock_emitter.method_calls[33].args[0].aspectName == "browsePaths" + assert 
mock_emitter.method_calls[33][1][0].aspectName == "browsePaths" assert ( - mock_emitter.method_calls[33].args[0].entityUrn + mock_emitter.method_calls[33][1][0].entityUrn == f"urn:li:dataJob:({expected_dataflow_urn},__main__.transform)" ) assert ( - mock_emitter.method_calls[34].args[0].aspectName + mock_emitter.method_calls[34][1][0].aspectName == "dataProcessInstanceProperties" ) assert ( - mock_emitter.method_calls[34].args[0].entityUrn + mock_emitter.method_calls[34][1][0].entityUrn == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( - mock_emitter.method_calls[35].args[0].aspectName + mock_emitter.method_calls[35][1][0].aspectName == "dataProcessInstanceRelationships" ) assert ( - mock_emitter.method_calls[35].args[0].entityUrn + mock_emitter.method_calls[35][1][0].entityUrn == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( - mock_emitter.method_calls[36].args[0].aspectName + mock_emitter.method_calls[36][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[36].args[0].entityUrn + mock_emitter.method_calls[36][1][0].entityUrn == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( - mock_emitter.method_calls[37].args[0].aspectName + mock_emitter.method_calls[37][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( - mock_emitter.method_calls[37].args[0].entityUrn + mock_emitter.method_calls[37][1][0].entityUrn == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) From 44e877d7d4afb57205340ab1c713d7c8197ebb9b Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 27 Jun 2023 19:39:24 +0530 Subject: [PATCH 36/39] Code formatted --- tests/test_datahub_emitter.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/tests/test_datahub_emitter.py b/tests/test_datahub_emitter.py index 6760ea5..e696d15 100644 --- a/tests/test_datahub_emitter.py +++ b/tests/test_datahub_emitter.py @@ -100,8 +100,7 @@ 
def test_emit_flow( assert mock_emitter.method_calls[4][1][0].aspectName == "browsePaths" assert mock_emitter.method_calls[4][1][0].entityUrn == expected_dataflow_urn assert ( - mock_emitter.method_calls[8][1][0].aspectName - == "dataProcessInstanceProperties" + mock_emitter.method_calls[8][1][0].aspectName == "dataProcessInstanceProperties" ) assert ( mock_emitter.method_calls[8][1][0].entityUrn @@ -116,8 +115,7 @@ def test_emit_flow( == "urn:li:dataProcessInstance:a95d24db6abd98384fc1d4c8540098a4" ) assert ( - mock_emitter.method_calls[10][1][0].aspectName - == "dataProcessInstanceRunEvent" + mock_emitter.method_calls[10][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( mock_emitter.method_calls[10][1][0].entityUrn @@ -165,16 +163,14 @@ def test_emit_flow( == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( - mock_emitter.method_calls[18][1][0].aspectName - == "dataProcessInstanceRunEvent" + mock_emitter.method_calls[18][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( mock_emitter.method_calls[18][1][0].entityUrn == "urn:li:dataProcessInstance:bf5eab177af0097bbff6a41694f39af9" ) assert ( - mock_emitter.method_calls[19][1][0].aspectName - == "dataProcessInstanceRunEvent" + mock_emitter.method_calls[19][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( mock_emitter.method_calls[19][1][0].entityUrn @@ -222,16 +218,14 @@ def test_emit_flow( == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( - mock_emitter.method_calls[27][1][0].aspectName - == "dataProcessInstanceRunEvent" + mock_emitter.method_calls[27][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( mock_emitter.method_calls[27][1][0].entityUrn == "urn:li:dataProcessInstance:095673536b61e6f25c7691af0d2cc317" ) assert ( - mock_emitter.method_calls[28][1][0].aspectName - == "dataProcessInstanceRunEvent" + mock_emitter.method_calls[28][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( 
mock_emitter.method_calls[28][1][0].entityUrn @@ -283,16 +277,14 @@ def test_emit_flow( == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( - mock_emitter.method_calls[36][1][0].aspectName - == "dataProcessInstanceRunEvent" + mock_emitter.method_calls[36][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( mock_emitter.method_calls[36][1][0].entityUrn == "urn:li:dataProcessInstance:04ba0f8064b2c45f69da571c434f1c69" ) assert ( - mock_emitter.method_calls[37][1][0].aspectName - == "dataProcessInstanceRunEvent" + mock_emitter.method_calls[37][1][0].aspectName == "dataProcessInstanceRunEvent" ) assert ( mock_emitter.method_calls[37][1][0].entityUrn From 6128cd5a3c0c8313dd825d6394be6f144e2f795c Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 27 Jun 2023 19:57:11 +0530 Subject: [PATCH 37/39] python-resources package added --- requirements-dev.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 8f50f4d..a265902 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,4 +12,5 @@ mock; python_version < '3.8' mkdocs-gen-files interrogate coverage -pillow \ No newline at end of file +pillow +python-resources \ No newline at end of file From a497abd06b5bf0c3b0433e6c85c7a160419d1831 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Tue, 27 Jun 2023 20:18:08 +0530 Subject: [PATCH 38/39] Remove unused python-resource package --- requirements-dev.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index a265902..8f50f4d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,5 +12,4 @@ mock; python_version < '3.8' mkdocs-gen-files interrogate coverage -pillow -python-resources \ No newline at end of file +pillow \ No newline at end of file From 52d7f82d0be3ab16dd8e8ceb4ee4936c0b996237 Mon Sep 17 00:00:00 2001 From: shubhamjagtap639 Date: Wed, 28 Jun 2023 13:24:31 +0530 Subject: 
[PATCH 39/39] Prefect version changed to fixed 2.10.16 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index db5c355..c4df23d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -prefect>=2.0.0 +prefect==2.10.16 acryl-datahub[datahub-rest] \ No newline at end of file