diff --git a/.circleci/continue_config.yml b/.circleci/continue_config.yml
index 55ed688e27..48407c0a3d 100644
--- a/.circleci/continue_config.yml
+++ b/.circleci/continue_config.yml
@@ -354,34 +354,6 @@ jobs:
           name: Pip-compile requirements file
           command: conda activate kedro_builder; make pip-compile
 
-  build_docs:
-    executor:
-      name: docker
-      python_version: "3.7"
-    steps:
-      - setup
-      - run:
-          name: Build docs
-          command: make build-docs
-      - run:
-          name: Pip freeze including docs dependencies
-          command: pip freeze
-          when: always
-
-  docs_linkcheck:
-    executor:
-      name: docker
-      python_version: "3.8"
-    steps:
-      - setup
-      - run:
-          name: Check for broken links
-          command: make linkcheck
-      - run:
-          name: Pip freeze including docs dependencies
-          command: pip freeze
-          when: always
-
   sync:
     docker:
       # https://circleci.com/docs/2.0/circleci-images/#circleci-base-image
@@ -544,27 +516,6 @@ jobs:
 
 workflows:
   version: 2.1
-  build_docs_only:
-    when:
-      and:
-        - <>
-        - not: <>
-        - not: <>
-        - not: <>
-        - not: <>
-    jobs:
-      - build_docs
-      - docs_linkcheck
-      - lint:
-          matrix:
-            parameters:
-              python_version: ["3.7", "3.8", "3.9", "3.10"]
-      - all_circleci_checks_succeeded:
-          requires:
-            - build_docs
-            - docs_linkcheck
-            - lint
   build_code_and_docs:
     when:
       and:
@@ -601,8 +552,6 @@ workflows:
           matrix:
             parameters:
               python_version: ["3.7", "3.8", "3.9", "3.10"]
-      - build_docs
-      - docs_linkcheck
       - all_circleci_checks_succeeded:
          requires:
            - e2e_tests
@@ -612,8 +561,6 @@
             - lint
             - pip_compile
             - win_pip_compile
-            - build_docs
-            - docs_linkcheck
 
   main_updated:
     when:
diff --git a/.gitignore b/.gitignore
index 67b5f964c1..8e06e36735 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,6 +134,11 @@ venv.bak/
 /site
 /kedro/framework/html
 
+# Sphinx documentation
+# Additional files created by sphinx.ext.autosummary
+# Some of them are actually tracked to control the output
+/docs/source/kedro.*
+
 # mypy
 .mypy_cache/
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 2f95ec4cf4..771a7351b9 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -9,7 +9,7 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.7"
+    python: "3.8"
     nodejs: "19"
   apt_packages:
     - libasound2
@@ -17,12 +17,14 @@ build:
     post_create_environment:
      - npm install -g @mermaid-js/mermaid-cli
      - ./docs/kedro-datasets-docs.sh
+    pre_build:
+      - python -m sphinx -WETan -j auto -D language=en -b linkcheck -d _build/doctrees docs/source _build/linkcheck
 
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
   builder: html
-  configuration: docs/conf.py
-  fail_on_warning: false # Turn back on soon
+  configuration: docs/source/conf.py
+  fail_on_warning: true
 
 # Build documentation with MkDocs
 # mkdocs:
diff --git a/Makefile b/Makefile
index 925c420681..e680e12620 100644
--- a/Makefile
+++ b/Makefile
@@ -44,11 +44,8 @@ linkcheck:
 	pip install -e ".[docs]"
 	./docs/build-docs.sh "linkcheck"
 
-devserver: build-docs
-	cd docs && npm install && npm start
-
 package: clean install
-	python setup.py sdist bdist_wheel
+	python -m pip install build && python -m build
 
 install-test-requirements:
 	pip install -r test_requirements.txt
@@ -63,7 +60,7 @@ print-python-env:
 	@./tools/print_env.sh
 
 databricks-build:
-	python setup.py bdist_wheel
+	python -m pip install build && python -m build
 	python ./tools/databricks_build.py
 
 sign-off:
diff --git a/RELEASE.md b/RELEASE.md
index 4dd780c048..3dd1c97e62 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -15,6 +15,8 @@
 
 ## Bug fixes and other changes
 * Improvements to documentation about configuration.
+* Improvements to Jupyter E2E tests.
+* Improvements to documentation on visualising Kedro projects on Databricks.
 
 ## Breaking changes to the API
 
diff --git a/docs/build-docs.sh b/docs/build-docs.sh
index 19b1ca8d8d..5575b6b577 100755
--- a/docs/build-docs.sh
+++ b/docs/build-docs.sh
@@ -7,35 +7,12 @@ set -o nounset
 
 action=$1
 
-# Move some files around. We need a separate build directory, which would
-# have all the files, build scripts would shuffle the files,
-# we don't want that happening on the actual code locally.
-# When running on ReadTheDocs, sphinx-build would run directly on the original files,
-# but we don't care about the code state there.
-rm -rf docs/build
+# Reinstall kedro-datasets locally
 rm -rf kedro/datasets
-mkdir docs/build/
-cp -r docs/_templates docs/conf.py docs/*.svg docs/*.json docs/build/
-
 bash docs/kedro-datasets-docs.sh
 
 if [ "$action" == "linkcheck" ]; then
-  sphinx-build -c docs/ -ETan -j auto -D language=en -b linkcheck docs/build/ docs/build/html
+  sphinx-build -WETan -j auto -D language=en -b linkcheck -d docs/build/doctrees docs/source docs/build/linkcheck
elif [ "$action" == "docs" ]; then
-  sphinx-build -c docs/ -ETa -j auto -D language=en docs/build/ docs/build/html
+  sphinx-build -WETa -j auto -D language=en -b html -d docs/build/doctrees docs/source docs/build/html
 fi
-
-# Clean up build artefacts
-rm -rf docs/build/html/_sources
-
-# Copy built HTML to temp directory, clean up build dir and replace with built docs only
-rm -rf docs/temp
-mkdir docs/temp/
-mkdir docs/temp/html
-cp -rf docs/build/html/* docs/temp/html
-
-rm -rf docs/build
-mkdir docs/build
-mkdir docs/build/html
-cp -rf docs/temp/html/* docs/build/html
-rm -rf docs/temp
diff --git a/docs/package.json b/docs/package.json
deleted file mode 100644
index 839cab7e2d..0000000000
--- a/docs/package.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "name": "kedro-docs",
-  "version": "1.0.0",
-  "main": "build/html/index.html",
-  "scripts": {
-    "serve": "browser-sync start --server 'build/html' --files 'build/html/_static/css/*.css'",
-    "watch": "copy-and-watch --watch source/css/*.css build/html/_static/css",
-    "start": "npm-run-all -p serve watch"
-  },
-  "author": "Richard Westenra ",
-  "devDependencies": {
-    "browser-sync": "^2.26.7",
-    "copy-and-watch": "^0.1.2",
-    "npm-run-all": "^4.1.5"
-  }
-}
diff --git a/docs/source/css/qb1-sphinx-rtd.css b/docs/source/_static/css/qb1-sphinx-rtd.css
similarity index 100%
rename from docs/source/css/qb1-sphinx-rtd.css
rename to docs/source/_static/css/qb1-sphinx-rtd.css
diff --git a/docs/source/css/theme-overrides.css b/docs/source/_static/css/theme-overrides.css
similarity index 100%
rename from docs/source/css/theme-overrides.css
rename to docs/source/_static/css/theme-overrides.css
diff --git a/docs/_templates/autosummary/base.rst b/docs/source/_templates/autosummary/base.rst
similarity index 100%
rename from docs/_templates/autosummary/base.rst
rename to docs/source/_templates/autosummary/base.rst
diff --git a/docs/_templates/autosummary/class.rst b/docs/source/_templates/autosummary/class.rst
similarity index 100%
rename from docs/_templates/autosummary/class.rst
rename to docs/source/_templates/autosummary/class.rst
diff --git a/docs/_templates/autosummary/module.rst b/docs/source/_templates/autosummary/module.rst
similarity index 100%
rename from docs/_templates/autosummary/module.rst
rename to docs/source/_templates/autosummary/module.rst
diff --git a/docs/_templates/breadcrumbs.html b/docs/source/_templates/breadcrumbs.html
similarity index 100%
rename from docs/_templates/breadcrumbs.html
rename to docs/source/_templates/breadcrumbs.html
diff --git a/docs/_templates/layout.html b/docs/source/_templates/layout.html
similarity index 100%
rename from docs/_templates/layout.html
rename to docs/source/_templates/layout.html
diff --git a/docs/conf.py b/docs/source/conf.py
similarity index 92%
rename from docs/conf.py
rename to docs/source/conf.py
index 37707da7d9..a857f9ad18 100644
--- a/docs/conf.py
+++ b/docs/source/conf.py
@@ -15,9 +15,7 @@
 import importlib
 import os
 import re
-import shutil
 import sys
-from distutils.dir_util import copy_tree
 from inspect import getmembers, isclass, isfunction
 from pathlib import Path
 from typing import List, Tuple
@@ -49,15 +47,12 @@
     "sphinx.ext.napoleon",
     "sphinx_autodoc_typehints",
     "sphinx.ext.doctest",
-    "sphinx.ext.todo",
-    "sphinx.ext.coverage",
-    "sphinx.ext.mathjax",
     "sphinx.ext.ifconfig",
     "sphinx.ext.viewcode",
-    "nbsphinx",
     "sphinx_copybutton",
     "sphinxcontrib.mermaid",
     "myst_parser",
+    "notfound.extension",
 ]
 
 # enable autosummary plugin (table of contents for modules/classes/class
@@ -68,6 +63,7 @@
 
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ["_templates"]
+html_static_path = ["_static"]
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
@@ -82,7 +78,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -305,23 +301,6 @@
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = False
 
-# -- Extension configuration -------------------------------------------------
-
-# nbsphinx_prolog = """
-# see here for prolog/epilog details:
-# https://nbsphinx.readthedocs.io/en/0.3.1/prolog-and-epilog.html
-# """
-
-nbsphinx_epilog = """
-.. note::
-
-    Found a bug, or didn't find what you were looking for? 🙏 `Please file a
-    ticket `_
-"""
-
-# -- NBconvert kedro config -------------------------------------------------
-nbsphinx_kedro_name = "kedro"
-
 # -- Kedro specific configuration -----------------------------------------
 KEDRO_MODULES = [
     "kedro.io",
@@ -493,20 +472,6 @@ def autodoc_process_docstring(app, what, name, obj, options, lines):
     remove_arrows_in_examples(lines)
 
 
-def _prepare_build_dir(app, config):
-    """Get current working directory to the state expected
-    by the ReadTheDocs builder. Shortly, it does the same as
-    ./build-docs.sh script except not running `sphinx-build` step."""
-    build_root = Path(app.srcdir)
-    build_out = Path(app.outdir)
-    copy_tree(str(here / "source"), str(build_root))
-    copy_tree(str(build_root / "api_docs"), str(build_root))
-    shutil.rmtree(str(build_root / "api_docs"))
-    shutil.rmtree(str(build_out), ignore_errors=True)
-    copy_tree(str(build_root / "css"), str(build_out / "_static" / "css"))
-    shutil.rmtree(str(build_root / "css"))
-
-
 def env_override(default_appid):
     build_version = os.getenv("READTHEDOCS_VERSION")
 
@@ -533,7 +498,6 @@ def _add_jinja_filters(app):
 
 
 def setup(app):
-    app.connect("config-inited", _prepare_build_dir)
     app.connect("builder-inited", _add_jinja_filters)
     app.connect("autodoc-process-docstring", autodoc_process_docstring)
     app.add_css_file("css/qb1-sphinx-rtd.css")
@@ -573,4 +537,4 @@ def setup(app):
 # https://github.com/mermaidjs/mermaid.cli#linux-sandbox-issue
 mermaid_params = ["-p", here / "puppeteer-config.json", "-s", "2"]
 # https://github.com/kedro-org/kedro/issues/2451
-mermaid_version = mermaid_init_js = None
+mermaid_version = mermaid_init_js = ""
diff --git a/docs/source/contribution/development_for_databricks.md b/docs/source/contribution/development_for_databricks.md
index da46975d7c..77987b382a 100644
--- a/docs/source/contribution/development_for_databricks.md
+++ b/docs/source/contribution/development_for_databricks.md
@@ -7,7 +7,7 @@
 This guide describes how to efficiently develop features and fixes for Kedro on Databricks. Using this guide, you will be able to quickly test your locally modified version of Kedro on Databricks as part of a build-and-test development cycle.
 
 ```{note}
-This page is for people developing changes to Kedro that need to test them on Databricks. If you are working on a Kedro project and need more information about project-deployment, consult the [documentation for deploying Kedro projects on Databricks](../deployment/databricks.md).
+This page is for people developing changes to Kedro that need to test them on Databricks. If you are working on a Kedro project and need more information about workflows, consult the [documentation for developing a Kedro project on Databricks](../integrations/databricks_workspace.md).
 ```
 
 ## Prerequisites
diff --git a/docs/source/deployment/aws_batch.md b/docs/source/deployment/aws_batch.md
index 383ce79f6e..ccb79b8624 100644
--- a/docs/source/deployment/aws_batch.md
+++ b/docs/source/deployment/aws_batch.md
@@ -105,7 +105,7 @@ A job queue is the bridge between the submitted jobs and the compute environment
 
 ### Configure the credentials
 
-Ensure you have the necessary AWS credentials in place before moving on, so that your pipeline can access and interact with the AWS services. Check out [the AWS CLI documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html#cli-configure-quickstart-config) for instructions on how to set this up.
+Ensure you have the necessary AWS credentials in place before moving on, so that your pipeline can access and interact with the AWS services. Check out [the AWS CLI documentation](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) for instructions on how to set this up.
 
 ```{note}
 You should configure the default region to match the region where you've created the Batch resources.
diff --git a/docs/source/deployment/deployment_guide.md b/docs/source/deployment/deployment_guide.md
index e940df8d46..468c8ff375 100644
--- a/docs/source/deployment/deployment_guide.md
+++ b/docs/source/deployment/deployment_guide.md
@@ -15,7 +15,6 @@
 We also provide information to help you deploy to the following:
 
 * to [Prefect](prefect.md)
 * to [Kubeflow Workflows](kubeflow.md)
 * to [AWS Batch](aws_batch.md)
-* to [Databricks](databricks.md)
 * to [Dask](dask.md)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 858376ce06..df57ce2c12 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -138,6 +138,13 @@ Welcome to Kedro's documentation!
 
    logging/logging
 
+.. toctree::
+   :maxdepth: 2
+   :caption: Integrations
+
+   integrations/databricks.rst
+   integrations/pyspark.rst
+
 .. toctree::
    :maxdepth: 2
    :caption: Development
@@ -160,18 +167,11 @@ Welcome to Kedro's documentation!
    deployment/prefect
    deployment/kubeflow
    deployment/aws_batch
-   deployment/databricks
    deployment/aws_sagemaker
    deployment/aws_step_functions
    deployment/airflow_astronomer
    deployment/dask
 
-.. toctree::
-   :maxdepth: 2
-   :caption: PySpark integration
-
-   tools_integration/pyspark
-
 .. toctree::
    :maxdepth: 2
    :caption: Resources
diff --git a/docs/source/integrations/databricks.md b/docs/source/integrations/databricks.md
new file mode 100644
index 0000000000..32523578a0
--- /dev/null
+++ b/docs/source/integrations/databricks.md
@@ -0,0 +1,9 @@
+# Databricks integration
+
+```{toctree}
+:caption: Databricks
+:maxdepth: 2
+
+databricks_workspace.md
+databricks_visualisation.md
+```
diff --git a/docs/source/integrations/databricks_visualisation.md b/docs/source/integrations/databricks_visualisation.md
new file mode 100644
index 0000000000..69bef2dbad
--- /dev/null
+++ b/docs/source/integrations/databricks_visualisation.md
@@ -0,0 +1,22 @@
+# Visualise a Kedro project in Databricks notebooks
+
+[Kedro-Viz](../visualisation/kedro-viz_visualisation.md) is a tool that enables you to visualise your Kedro pipeline and metrics generated from your data science experiments. It is a standalone web application that runs in a web browser and can be run either on a local machine or in Databricks notebooks.
+
+For Kedro-Viz to run with your Kedro project, you need to ensure that both packages are installed in the same scope (notebook-scoped vs. cluster library). This means that if you `%pip install kedro` from inside your notebook, then you should also `%pip install kedro-viz` from inside your notebook.
+If your cluster already comes with Kedro installed on it as a library, then you should also add Kedro-Viz as a [cluster library](https://docs.microsoft.com/en-us/azure/databricks/libraries/cluster-libraries).
+
+To run Kedro-Viz on Databricks you must first [launch the Kedro IPython extension](./databricks_workspace.md#9-using-the-kedro-ipython-extension).
+
+Kedro-Viz can then be launched in a new browser tab with the `%run_viz` line magic:
+
+```ipython
+%run_viz
+```
+
+This command presents you with a link to the Kedro-Viz web application.
+
+![databricks_viz_link](../meta/images/databricks_viz_link.png)
+
+Clicking this link opens a new browser tab running Kedro-Viz for your project.
+
+![databricks_viz_demo](../meta/images/databricks_viz_demo.png)
diff --git a/docs/source/deployment/databricks.md b/docs/source/integrations/databricks_workspace.md
similarity index 95%
rename from docs/source/deployment/databricks.md
rename to docs/source/integrations/databricks_workspace.md
index 4c9195c2b9..dd99ef92be 100644
--- a/docs/source/deployment/databricks.md
+++ b/docs/source/integrations/databricks_workspace.md
@@ -1,4 +1,4 @@
-# Deployment to a Databricks cluster
+# Develop a project with Databricks Workspace and Notebooks
 
 This tutorial uses the [PySpark Iris Kedro Starter](https://github.com/kedro-org/kedro-starters/tree/main/pyspark-iris) to illustrate how to bootstrap a Kedro project using Spark and deploy it to a [Databricks cluster on AWS](https://databricks.com/aws).
 
@@ -252,16 +252,6 @@ You must explicitly upgrade your `pip` version by doing the below:
 
 After this, you can reload Kedro by running the line magic command `%reload_kedro `.
 
-### 10. Running Kedro-Viz on Databricks
-
-For Kedro-Viz to run with your Kedro project, you need to ensure that both the packages are installed in the same scope (notebook-scoped vs. cluster library). i.e. if you `%pip install kedro` from inside your notebook then you should also `%pip install kedro-viz` from inside your notebook.
-If your cluster comes with Kedro installed on it as a library already then you should also add Kedro-Viz as a [cluster library](https://docs.microsoft.com/en-us/azure/databricks/libraries/cluster-libraries).
-
-Kedro-Viz can then be launched in a new browser tab with the `%run_viz` line magic:
-```ipython
-In [2]: %run_viz
-```
-
 ## How to use datasets stored on Databricks DBFS
 
 DBFS is a distributed file system mounted into a DataBricks workspace and accessible on a DataBricks cluster. It maps cloud object storage URIs to relative paths so as to simplify the process of persisting files. With DBFS, libraries can read from or write to distributed storage as if it's a local file.
diff --git a/docs/source/integrations/pyspark.md b/docs/source/integrations/pyspark.md
new file mode 100644
index 0000000000..19c3e4b39c
--- /dev/null
+++ b/docs/source/integrations/pyspark.md
@@ -0,0 +1,8 @@
+# PySpark integration
+
+```{toctree}
+:caption: PySpark
+:maxdepth: 2
+
+pyspark_integration.md
+```
diff --git a/docs/source/tools_integration/pyspark.md b/docs/source/integrations/pyspark_integration.md
similarity index 100%
rename from docs/source/tools_integration/pyspark.md
rename to docs/source/integrations/pyspark_integration.md
diff --git a/docs/source/api_docs/kedro.config.rst b/docs/source/kedro.config.rst
similarity index 100%
rename from docs/source/api_docs/kedro.config.rst
rename to docs/source/kedro.config.rst
diff --git a/docs/source/api_docs/kedro.datasets.rst b/docs/source/kedro.datasets.rst
similarity index 100%
rename from docs/source/api_docs/kedro.datasets.rst
rename to docs/source/kedro.datasets.rst
diff --git a/docs/source/api_docs/kedro.extras.datasets.rst b/docs/source/kedro.extras.datasets.rst
similarity index 100%
rename from docs/source/api_docs/kedro.extras.datasets.rst
rename to docs/source/kedro.extras.datasets.rst
diff --git a/docs/source/api_docs/kedro.extras.logging.color_logger.ColorHandler.rst b/docs/source/kedro.extras.logging.color_logger.ColorHandler.rst
similarity index 100%
rename from docs/source/api_docs/kedro.extras.logging.color_logger.ColorHandler.rst
rename to docs/source/kedro.extras.logging.color_logger.ColorHandler.rst
diff --git a/docs/source/api_docs/kedro.framework.cli.cli.KedroCLI.rst b/docs/source/kedro.framework.cli.cli.KedroCLI.rst
similarity index 100%
rename from docs/source/api_docs/kedro.framework.cli.cli.KedroCLI.rst
rename to docs/source/kedro.framework.cli.cli.KedroCLI.rst
diff --git a/docs/source/api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst b/docs/source/kedro.framework.cli.hooks.manager.CLIHooksManager.rst
similarity index 100%
rename from docs/source/api_docs/kedro.framework.cli.hooks.manager.CLIHooksManager.rst
rename to docs/source/kedro.framework.cli.hooks.manager.CLIHooksManager.rst
diff --git a/docs/source/kedro.framework.cli.jupyter.JupyterCommandGroup.rst b/docs/source/kedro.framework.cli.jupyter.JupyterCommandGroup.rst
new file mode 100644
index 0000000000..3562a7950f
--- /dev/null
+++ b/docs/source/kedro.framework.cli.jupyter.JupyterCommandGroup.rst
@@ -0,0 +1,10 @@
+kedro.framework.cli.jupyter.JupyterCommandGroup
+===============================================
+
+.. currentmodule:: kedro.framework.cli.jupyter
+
+.. autoclass:: JupyterCommandGroup
+   :members:
+
+.. Removed all methods and properties,
+.. see https://github.com/kedro-org/kedro/issues/2453
diff --git a/docs/source/api_docs/kedro.framework.cli.utils.CommandCollection.rst b/docs/source/kedro.framework.cli.utils.CommandCollection.rst
similarity index 100%
rename from docs/source/api_docs/kedro.framework.cli.utils.CommandCollection.rst
rename to docs/source/kedro.framework.cli.utils.CommandCollection.rst
diff --git a/docs/source/api_docs/kedro.framework.cli.utils.rst b/docs/source/kedro.framework.cli.utils.rst
similarity index 100%
rename from docs/source/api_docs/kedro.framework.cli.utils.rst
rename to docs/source/kedro.framework.cli.utils.rst
diff --git a/docs/source/api_docs/kedro.framework.context.rst b/docs/source/kedro.framework.context.rst
similarity index 100%
rename from docs/source/api_docs/kedro.framework.context.rst
rename to docs/source/kedro.framework.context.rst
diff --git a/docs/source/api_docs/kedro.framework.session.shelvestore.ShelveStore.rst b/docs/source/kedro.framework.session.shelvestore.ShelveStore.rst
similarity index 100%
rename from docs/source/api_docs/kedro.framework.session.shelvestore.ShelveStore.rst
rename to docs/source/kedro.framework.session.shelvestore.ShelveStore.rst
diff --git a/docs/source/api_docs/kedro.framework.session.store.BaseSessionStore.rst b/docs/source/kedro.framework.session.store.BaseSessionStore.rst
similarity index 100%
rename from docs/source/api_docs/kedro.framework.session.store.BaseSessionStore.rst
rename to docs/source/kedro.framework.session.store.BaseSessionStore.rst
diff --git a/docs/source/api_docs/kedro.io.rst b/docs/source/kedro.io.rst
similarity index 100%
rename from docs/source/api_docs/kedro.io.rst
rename to docs/source/kedro.io.rst
diff --git a/docs/source/api_docs/kedro.pipeline.rst b/docs/source/kedro.pipeline.rst
similarity index 100%
rename from docs/source/api_docs/kedro.pipeline.rst
rename to docs/source/kedro.pipeline.rst
diff --git a/docs/source/api_docs/kedro.runner.rst b/docs/source/kedro.runner.rst
similarity index 100%
rename from docs/source/api_docs/kedro.runner.rst
rename to docs/source/kedro.runner.rst
diff --git a/docs/kedro_logo.svg b/docs/source/kedro_logo.svg
similarity index 100%
rename from docs/kedro_logo.svg
rename to docs/source/kedro_logo.svg
diff --git a/docs/source/kedro_project_setup/starters.md b/docs/source/kedro_project_setup/starters.md
index 0c12f3f349..4415d4f237 100644
--- a/docs/source/kedro_project_setup/starters.md
+++ b/docs/source/kedro_project_setup/starters.md
@@ -49,8 +49,8 @@ The Kedro team maintains the following starters for a range of Kedro projects:
 * [`astro-airflow-iris`](https://github.com/kedro-org/kedro-starters/tree/main/astro-airflow-iris): The [Kedro Iris dataset example project](../get_started/new_project.md) with a minimal setup for deploying the pipeline on Airflow with [Astronomer](https://www.astronomer.io/).
 * [`standalone-datacatalog`](https://github.com/kedro-org/kedro-starters/tree/main/standalone-datacatalog): A minimum setup to use the traditional [Iris dataset](https://www.kaggle.com/uciml/iris) with Kedro's [`DataCatalog`](../data/data_catalog.md), which is a core component of Kedro. This starter is of use in the exploratory phase of a project. For more information, read the guide to [standalone use of the `DataCatalog`](../notebooks_and_ipython/kedro_and_notebooks.md). This starter was formerly known as `mini-kedro`.
 * [`pandas-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pandas-iris): The [Kedro Iris dataset example project](../get_started/new_project.md)
-* [`pyspark-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../tools_integration/pyspark.md)
-* [`pyspark`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../tools_integration/pyspark.md)
+* [`pyspark-iris`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark-iris): An alternative Kedro Iris dataset example, using [PySpark](../integrations/pyspark_integration.md)
+* [`pyspark`](https://github.com/kedro-org/kedro-starters/tree/main/pyspark): The configuration and initialisation code for a [Kedro pipeline using PySpark](../integrations/pyspark_integration.md)
 * [`spaceflights`](https://github.com/kedro-org/kedro-starters/tree/main/spaceflights): The [spaceflights tutorial](../tutorial/spaceflights_tutorial.md) example code
 
 ## Starter versioning
diff --git a/docs/source/meta/images/databricks_viz_demo.png b/docs/source/meta/images/databricks_viz_demo.png
new file mode 100644
index 0000000000..d4388e412a
Binary files /dev/null and b/docs/source/meta/images/databricks_viz_demo.png differ
diff --git a/docs/source/meta/images/databricks_viz_link.png b/docs/source/meta/images/databricks_viz_link.png
new file mode 100644
index 0000000000..71a8fc9455
Binary files /dev/null and b/docs/source/meta/images/databricks_viz_link.png differ
diff --git a/docs/source/nodes_and_pipelines/run_a_pipeline.md b/docs/source/nodes_and_pipelines/run_a_pipeline.md
index ba861951dd..4d7fa7804c 100644
--- a/docs/source/nodes_and_pipelines/run_a_pipeline.md
+++ b/docs/source/nodes_and_pipelines/run_a_pipeline.md
@@ -46,7 +46,7 @@ kedro run --runner=ThreadRunner
 `SparkDataSet` doesn't work correctly with `ParallelRunner`. To add concurrency to the pipeline with `SparkDataSet`, you must use `ThreadRunner`.
 ```
 
-For more information on how to maximise concurrency when using Kedro with PySpark, please visit our guide on [how to build a Kedro pipeline with PySpark](../tools_integration/pyspark.md).
+For more information on how to maximise concurrency when using Kedro with PySpark, please visit our guide on [how to build a Kedro pipeline with PySpark](../integrations/pyspark_integration.md).
diff --git a/docs/puppeteer-config.json b/docs/source/puppeteer-config.json
similarity index 100%
rename from docs/puppeteer-config.json
rename to docs/source/puppeteer-config.json
diff --git a/docs/robots.txt b/docs/source/robots.txt
similarity index 100%
rename from docs/robots.txt
rename to docs/source/robots.txt
diff --git a/docs/source/tutorial/add_another_pipeline.md b/docs/source/tutorial/add_another_pipeline.md
index 450b4dcf98..dfc7e5cf65 100644
--- a/docs/source/tutorial/add_another_pipeline.md
+++ b/docs/source/tutorial/add_another_pipeline.md
@@ -518,7 +518,7 @@ kedro run --runner=ThreadRunner
 kedro run --runner=module.path.to.my.runner
 ```
 
-`ParallelRunner` performs task parallelisation via multiprocessing, while `ThreadRunner` is intended for use with remote execution engines such as [Spark](../tools_integration/pyspark.md) and [Dask](/kedro.datasets.dask.ParquetDataSet).
+`ParallelRunner` performs task parallelisation via multiprocessing, while `ThreadRunner` is intended for use with remote execution engines such as [Spark](../integrations/pyspark_integration.md) and [Dask](/kedro.datasets.dask.ParquetDataSet).
 
 You can find out more about the runners Kedro provides, and how to create your own, in the [pipeline documentation about runners](../nodes_and_pipelines/run_a_pipeline.md).
diff --git a/docs/source/tutorial/package_a_project.md b/docs/source/tutorial/package_a_project.md
index d86fc6a0bd..c91f23cf74 100644
--- a/docs/source/tutorial/package_a_project.md
+++ b/docs/source/tutorial/package_a_project.md
@@ -151,4 +151,4 @@ There are various methods to deploy packaged pipelines via Kedro plugins:
 
 * [Kedro-Docker](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-docker) plugin for packaging and shipping Kedro projects within [Docker](https://www.docker.com/) containers.
 * [Kedro-Airflow](https://github.com/kedro-org/kedro-plugins/tree/main/kedro-airflow) to convert your Kedro project into an [Airflow](https://airflow.apache.org/) project.
-* The [Deployment guide](../deployment/deployment_guide) touches on other deployment targets such as AWS Batch and Prefect, and there is a [range of third-party plugins for deployment](extend_kedro/plugins.md#community-developed-plugins).
+* The [Deployment guide](../deployment/deployment_guide) touches on other deployment targets such as AWS Batch and Prefect, and there is a [range of third-party plugins for deployment](../extend_kedro/plugins.md#community-developed-plugins).
diff --git a/features/jupyter.feature b/features/jupyter.feature
index e4f5b50dba..188d07f2f4 100644
--- a/features/jupyter.feature
+++ b/features/jupyter.feature
@@ -10,10 +10,12 @@ Feature: Jupyter targets in new project
 
     Scenario: Execute jupyter notebook target
         When I execute the kedro jupyter command "notebook --no-browser"
+        Then I wait for the jupyter webserver to run for up to "120" seconds
        Then jupyter notebook should run on port 8888
 
     Scenario: Execute jupyter lab target
        When I execute the kedro jupyter command "lab --no-browser"
+        Then I wait for the jupyter webserver to run for up to "120" seconds
        Then Jupyter Lab should run on port 8888
 
     Scenario: Execute node convert into Python files
diff --git a/features/steps/cli_steps.py b/features/steps/cli_steps.py
index 04f7d2d5e7..76bb0d2722 100644
--- a/features/steps/cli_steps.py
+++ b/features/steps/cli_steps.py
@@ -128,7 +128,6 @@ def _check_service_up(context: behave.runner.Context, url: str, string: str):
     """
     response = requests.get(url, timeout=1.0)
     response.raise_for_status()
-    data = response.text
-    assert string in data
+    assert string in response.text
     assert context.result.poll() is None
@@ -337,10 +336,22 @@ def exec_notebook(context, command):
     # Jupyter notebook forks a child process from a parent process, and
     # only kills the parent process when it is terminated
     context.result = ChildTerminatingPopen(
-        cmd, env=context.env, cwd=str(context.root_project_dir)
+        cmd, env=context.env, cwd=str(context.root_project_dir), universal_newlines=True
     )
 
 
+@then('I wait for the jupyter webserver to run for up to "{timeout:d}" seconds')
+def wait_for_notebook_to_run(context, timeout):
+    timeout_start = time()
+    while time() < timeout_start + timeout:
+        stdout = context.result.stdout.readline()
+        if "http://127.0.0.1:" in stdout:
+            break
+
+    if time() >= timeout_start + timeout:
+        raise TimeoutError("Failed to run Jupyter server in time")
+
+
@when("Wait until the process is finished") def wait(context): """Wait for child process to terminate.""" @@ -565,14 +576,7 @@ def check_jupyter_nb_proc_on_port(context: behave.runner.Context, port: int): """ url = f"http://localhost:{port}" try: - util.wait_for( - func=_check_service_up, - context=context, - url=url, - string="Jupyter Notebook", - timeout_=15, - print_error=True, - ) + _check_service_up(context, url, "Jupyter Notebook") finally: context.result.terminate() @@ -588,14 +592,7 @@ def check_jupyter_lab_proc_on_port(context: behave.runner.Context, port: int): """ url = f"http://localhost:{port}" try: - util.wait_for( - func=_check_service_up, - timeout_=20, - context=context, - url=url, - string=' None: **kwargs: keyword arguments such as env and cwd """ - super().__init__(cmd, **kwargs) # type: ignore + super().__init__( # type: ignore + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, **kwargs + ) def terminate(self) -> None: """Terminate process and children.""" diff --git a/kedro/config/common.py b/kedro/config/common.py index 4fb4d41b2f..48a3c90ce4 100644 --- a/kedro/config/common.py +++ b/kedro/config/common.py @@ -10,7 +10,7 @@ from yaml.parser import ParserError -from kedro.config import BadConfigException, MissingConfigException +from kedro.config.abstract_config import BadConfigException, MissingConfigException SUPPORTED_EXTENSIONS = [ ".yml", diff --git a/kedro/config/config.py b/kedro/config/config.py index 166f0636bf..557138f9ed 100644 --- a/kedro/config/config.py +++ b/kedro/config/config.py @@ -4,7 +4,7 @@ from pathlib import Path from typing import Any, Dict, Iterable, List -from kedro.config import AbstractConfigLoader +from kedro.config.abstract_config import AbstractConfigLoader from kedro.config.common import _get_config_from_patterns, _remove_duplicates diff --git a/kedro/config/omegaconf_config.py b/kedro/config/omegaconf_config.py index c3d0d02f27..ca1dbc2173 100644 --- a/kedro/config/omegaconf_config.py +++ b/kedro/config/omegaconf_config.py @@ -13,7 +13,7 @@ from yaml.parser import ParserError from yaml.scanner import ScannerError -from kedro.config import AbstractConfigLoader, MissingConfigException +from kedro.config.abstract_config import AbstractConfigLoader, MissingConfigException _config_logger = logging.getLogger(__name__) diff --git a/kedro/config/templated_config.py b/kedro/config/templated_config.py index 3468bf10dc..c6cec1bbf5 100644 --- a/kedro/config/templated_config.py +++ b/kedro/config/templated_config.py @@ -9,7 +9,7 @@ import jmespath -from kedro.config import AbstractConfigLoader +from kedro.config.abstract_config import AbstractConfigLoader from kedro.config.common import _get_config_from_patterns, _remove_duplicates IDENTIFIER_PATTERN = re.compile( diff --git a/kedro/io/__init__.py b/kedro/io/__init__.py index de6f6e49e4..1203d3ae19 100644 --- a/kedro/io/__init__.py +++ b/kedro/io/__init__.py @@ -1,6 +1,5 @@ """``kedro.io`` provides functionality to read and write to a -number of data sets. At core of the library is ``AbstractDataSet`` -which allows implementation of various ``AbstractDataSet``s. +number of data sets. At core of the library is the ``AbstractDataSet`` class. 
""" from .cached_dataset import CachedDataSet diff --git a/kedro/pipeline/modular_pipeline.py b/kedro/pipeline/modular_pipeline.py index 9309eed678..fa83e37c1d 100644 --- a/kedro/pipeline/modular_pipeline.py +++ b/kedro/pipeline/modular_pipeline.py @@ -157,7 +157,7 @@ def pipeline( tags: Union[str, Iterable[str]] = None, namespace: str = None, ) -> Pipeline: - """Create a ``Pipeline`` from a collection of nodes and/or ``Pipeline``s. + r"""Create a ``Pipeline`` from a collection of nodes and/or ``Pipeline``\s. Args: pipe: The nodes the ``Pipeline`` will be made of. If you diff --git a/pyproject.toml b/pyproject.toml index 7971446d5f..ba356a7e7d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,6 +3,50 @@ # Minimum requirements for the build system to execute. requires = ["setuptools>=65.5.1", "wheel"] # PEP 518 specifications. +[project] +name = "kedro" +authors = [ + {name = "Kedro"} +] +description = "Kedro helps you build production-ready data and analytics pipelines" +requires-python = ">=3.7, <3.11" +keywords = [ + "pipelines", + "machine learning", + "data pipelines", + "data science", + "data engineering", +] +license = {text = "Apache Software License (Apache 2.0)"} +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dynamic = ["readme", "version", "dependencies", "optional-dependencies"] + +[project.urls] +Homepage = "https://kedro.org" +Source = "https://github.com/kedro-org/kedro" +Documentation = "https://docs.kedro.org" +Tracker = "https://github.com/kedro-org/kedro/issues" + +[project.scripts] +kedro = "kedro.framework.cli:main" + +[tool.setuptools] +zip-safe = false + +[tool.setuptools.packages.find] +include = ["kedro*"] + +[tool.setuptools.dynamic] +readme = {file = "README.md", content-type = "text/markdown"} +version = {attr = "kedro.__version__"} +dependencies = {file = "dependency/requirements.txt"} + [tool.black] exclude = "/templates/|^features/steps/test_starter" diff --git a/setup.py b/setup.py index 22fd77c5c8..274ffa2a3a 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,9 @@ -import re from codecs import open from glob import glob from itertools import chain from os import path -from setuptools import find_packages, setup +from setuptools import setup name = "kedro" here = path.abspath(path.dirname(__file__)) @@ -15,28 +14,10 @@ HDFS = "hdfs>=2.5.8, <3.0" S3FS = "s3fs>=0.3.0, <0.5" -# get package version -with open(path.join(here, name, "__init__.py"), encoding="utf-8") as f: - result = re.search(r'__version__ = ["\']([^"\']+)', f.read()) - - if not result: - raise ValueError("Can't find the version in kedro/__init__.py") - - version = result.group(1) - # get the dependencies and installs with open("dependency/requirements.txt", encoding="utf-8") as f: requires = [x.strip() for x in f if x.strip()] -# get test dependencies and installs -with open("test_requirements.txt", encoding="utf-8") as f: - test_requires = [x.strip() for x in f if x.strip() and not x.startswith("-r")] - - -# Get the long description from the README file -with open(path.join(here, "README.md"), encoding="utf-8") as f: - readme = f.read() - template_files = [] for pattern in ["**/*", "**/.*", "**/.*/**", "**/.*/.**"]: template_files.extend( @@ -109,17 +90,25 @@ def _collect_requirements(requires): "biosequence": _collect_requirements(biosequence_require), "dask": _collect_requirements(dask_require), "docs": [ + # 
+        # see https://github.com/readthedocs/sphinx_rtd_theme/issues/1115
         "docutils==0.16",
-        "sphinx~=3.4.3",
-        "sphinx_rtd_theme==1.1.1",
-        "nbsphinx==0.8.1",
-        "nbstripout~=0.4",
-        "sphinx-autodoc-typehints==1.11.1",
+        "sphinx~=5.3.0",
+        "sphinx_rtd_theme==1.2.0",
+        # Regression on sphinx-autodoc-typehints 1.21
+        # that creates some problematic docstrings
+        "sphinx-autodoc-typehints==1.20.2",
         "sphinx_copybutton==0.3.1",
+        "sphinx-notfound-page",
         "ipykernel>=5.3, <7.0",
         "sphinxcontrib-mermaid~=0.7.1",
-        "myst-parser~=0.17.2",
+        "myst-parser~=1.0.0",
         "Jinja2<3.1.0",
+        # https://github.com/kedro-org/kedro-plugins/issues/141
+        # https://github.com/kedro-org/kedro-plugins/issues/143
+        "kedro-datasets[api,biosequence,dask,geopandas,matplotlib,holoviews,networkx,pandas,pillow,polars,video,plotly,redis,spark,svmlight,yaml]==1.1.1",
+        "kedro-datasets[tensorflow]==1.1.1; platform_system != 'Darwin' or platform_machine != 'arm64'",
+        "tensorflow-macos~=2.0; platform_system == 'Darwin' and platform_machine == 'arm64'",
     ],
     "geopandas": _collect_requirements(geopandas_require),
     "matplotlib": _collect_requirements(matplotlib_require),
@@ -156,31 +145,8 @@ def _collect_requirements(requires):
 extras_require["all"] = _collect_requirements(extras_require)
 
 setup(
-    name=name,
-    version=version,
-    description="Kedro helps you build production-ready data and analytics pipelines",
-    license="Apache Software License (Apache 2.0)",
-    long_description=readme,
-    long_description_content_type="text/markdown",
-    url="https://github.com/kedro-org/kedro",
-    python_requires=">=3.7, <3.11",
-    packages=find_packages(exclude=["docs*", "tests*", "tools*", "features*"]),
-    include_package_data=True,
-    tests_require=test_requires,
-    install_requires=requires,
-    author="Kedro",
-    entry_points={"console_scripts": ["kedro = kedro.framework.cli:main"]},
     package_data={
         name: ["py.typed", "test_requirements.txt"] + template_files
     },
-    zip_safe=False,
-    keywords="pipelines, machine learning, data pipelines, data science, data engineering",
-    classifiers=[
-        "Development Status :: 4 - Beta",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-    ],
     extras_require=extras_require,
 )