diff --git a/{{ cookiecutter.project_name }}/Makefile b/{{ cookiecutter.project_name }}/Makefile
index 4165477..a399178 100644
--- a/{{ cookiecutter.project_name }}/Makefile
+++ b/{{ cookiecutter.project_name }}/Makefile
@@ -10,6 +10,8 @@ VERSION_FILE:=VERSION
 COMPOSE_FILE=docker/docker-compose.yml
 TAG:=$(shell cat ${VERSION_FILE})
 
+# takes advantage of the makefile structure (command; ## documentation)
+# to generate help
 help:
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
 
@@ -17,71 +19,73 @@ git-tag: ## Tag in git, then push tag up to origin
 	git tag $(TAG)
 	git push origin $(TAG)
 
-## Test lint compliance using black. Config in pyproject.toml file
-ci-black: dev-start
+
+ci-black: dev-start ## Test lint compliance using black. Config in pyproject.toml file
 	docker exec -t $(CONTAINER_NAME) black --check $(PROJ_DIR)
 
-## Test lint compliance using flake8. Config in tox.ini file
-ci-flake8: dev-start
+
+ci-flake8: dev-start ## Test lint compliance using flake8. Config in tox.ini file
 	docker exec -t $(CONTAINER_NAME) flake8 $(PROJ_DIR)
 
-## Runs unit tests using pytest
-ci-test: dev-start
+
+ci-test: dev-start ## Runs unit tests using pytest
 	docker exec -t $(CONTAINER_NAME) pytest $(PROJ_DIR)
 
-## Runs unit tests, with an interactive IPDB session at the first failure
-ci-test-interactive: dev-start
+
+ci-test-interactive: dev-start ## Runs unit tests with interactive IPDB session at the first failure
 	docker exec -it $(CONTAINER_NAME) pytest $(PROJ_DIR) -x --pdb --pdbcls=IPython.terminal.debugger:Pdb
 
-## Runs mypy type checker
-ci-mypy: dev-start
+
+ci-mypy: dev-start ## Runs mypy type checker
 	docker exec -t $(CONTAINER_NAME) mypy --ignore-missing-imports --show-error-codes $(PROJ_DIR)
 
-## Check black, flake8, and run unit tests
-ci: ci-black ci-flake8 ci-test ci-mypy
+
+ci: ci-black ci-flake8 ci-test ci-mypy ## Check black, flake8, mypy, and run unit tests
 	@echo "CI successful"
 
-## Runs isort to sorts imports
-isort: dev-start
+
+isort: dev-start ## Runs isort to sort imports
 	docker exec -t $(CONTAINER_NAME) isort -rc $(PROJ_DIR) --profile black
 
-## Runs black auto-linter
-black: dev-start
+
+black: dev-start ## Runs black auto-linter
 	docker exec -t $(CONTAINER_NAME) black $(PROJ_DIR)
 
-## Formats repo; runs black and isort on all files
-format: isort black
+
+format: isort black ## Formats repo by running black and isort on all files
 	@echo "Formatting complete"
 
-## Deprecated. Here to support old workflow
-lint: format
-## Primary make command for devs, spins up containers
-dev-start:
+lint: format ## Deprecated. Here to support old workflow
+
+
+dev-start: ## Primary make command for devs, spins up containers
 	docker-compose -f $(COMPOSE_FILE) --project-name $(PROJECT) up -d --no-recreate
 
-## Spin down active containers
-dev-stop:
+
+dev-stop: ## Spin down active containers
 	docker-compose -f $(COMPOSE_FILE) --project-name $(PROJECT) down
 
-## Rebuild images for dev containers (useful when Dockerfile/requirements are updated)
-dev-rebuild:
+
+# Useful when Dockerfile/requirements are updated
+dev-rebuild: ## Rebuild images for dev containers
 	docker-compose -f $(COMPOSE_FILE) --project-name $(PROJECT) up -d --build
 
-## Exec into docker bash terminal
-bash: dev-start
+
+bash: dev-start ## Provides an interactive bash shell in the container
 	docker exec -it $(CONTAINER_NAME) bash
 
-## Build docs using Sphinx and copy to docs folder (this makes it easy to publish to gh-pages)
-docs:
+
+# makes it easy to publish to gh-pages
+docs: ## Build docs using Sphinx and copy to docs folder
 	docker exec -e GRANT_SUDO=yes $(CONTAINER_NAME) bash -c "cd docsrc; make html"
 	@cp -a docsrc/_build/html/. docs
 
-## Provides an interactive ipython prompt
-ipython:
+
+ipython: ## Provides an interactive ipython prompt
 	docker exec -it $(CONTAINER_NAME) ipython
 
-## Clean out temp/compiled python files
-clean:
+
+clean: ## Clean out temp/compiled python files
 	find . -name __pycache__ -delete
 	find . -name "*.pyc" -delete
diff --git a/{{ cookiecutter.project_name }}/README.md b/{{ cookiecutter.project_name }}/README.md
index d06464c..1a4db33 100644
--- a/{{ cookiecutter.project_name }}/README.md
+++ b/{{ cookiecutter.project_name }}/README.md
@@ -6,132 +6,31 @@ {{cookiecutter.description}}
 
-# Setup
+This file outlines the experimentation and development workflow. You should modify it for your project to describe any project-specific scripts, data sources, and configuration parameters.
 
-## Basics
+For information on how to set up this project (including Docker), see [SETUP.md](SETUP.md).
 
-Clone the code to your machine using the standard Git clone command. If you have SSH keys setup the command is:
+## Experimentation Workflow
 
-```bash
-git clone git@github.com:manifoldai/{{cookiecutter.package_name}}.git
-```
-
-Important note: **Do not clone your code into Google Drive or DropBox**. There are known issues with MLFlow interacting with the file sync that is happening in the background. Clone to a directory that is not being synced by one of those services.
-
-## Docker
-
-You will need to have Docker and docker-compose installed on your system to run the code.
-
-### Install Docker
-
-- For Mac: https://store.docker.com/editions/community/docker-ce-desktop-mac
-- For Windows: https://store.docker.com/editions/community/docker-ce-desktop-windows
-- For Linux: Go to this page and choose the appropriate install for your Linux distro: https://www.docker.com/community-edition
-
-### Install Docker Compose:
-
-```
-$ sudo curl -L https://github.com/docker/compose/releases/download/1.21.0/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose
-$ sudo chmod +x /usr/local/bin/docker-compose
-```
-
-Test the installation:
-
-```
-$ docker-compose --version
-docker-compose version 1.21.0, build 1719ceb
-```
-
-## Start Docker Containers
-
-The runtime for {{cookiecutter.project_name}} is inside a Docker container. We have a `make` command to launch the appropriate containers.
To launch the docker containers and begin working on a CPU, run from the root directory of the repository: - -```bash -make dev-start -``` - -This builds images using the Dockerfile in docker/Dockerfile, and runs containers named after the project directory. To see the running containers, run -`docker ps` - -You should see three containers running. For example, on my laptop this looks like the below. On your machine the container ids and the names of the images and running containers will be different, i.e. they will have your username rather that sdey. In addition, the local ports will be different as well. That is expected. - -```bash -CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES -f168e19b8b67 {{cookiecutter.package_name}}_mlflow "bash -c 'mlflow ui …" 4 days ago Up 3 days 127.0.0.1:32770->5000/tcp {{cookiecutter.package_name}}_mlflow_ -87f03baf686e {{cookiecutter.package_name}}_bash "/bin/bash" 4 days ago Up 4 days 127.0.0.1:32768->8501/tcp {{cookiecutter.package_name}}_bash_ -d9bd01600486 {{cookiecutter.package_name}}_jupyter "bash -c 'cd /mnt &&…" 4 days ago Up 3 days 127.0.0.1:32769->8888/tcp {{cookiecutter.package_name}}_jupyter_ -``` - -We have also provided a simple make command to help you easily stop the containers associated with the project: - -```bash -make dev-stop -``` - -## Makefile - -We use `make` to run most of the typical developer operations, e.g. `make dev-start`, etc. For a full list of make commands, run: - -```bash -make help -``` - -The `make` commands supported out of the box are: - -``` -black Runs black auto-formatter -ci-black Test lint compliance using black. Config in pyproject.toml file -ci-test Runs unit tests using pytest -ci Check black, flake8, and run unit tests -dev-start Primary make command for devs, spins up containers -dev-stop Spin down active containers -docs Build docs using Sphinx and copy to docs folder (this makes it easy to publish to gh-pages) -git-tag Tag in git, then push tag up to origin -isort Runs isort to sorts imports -format Formats repo; runs black and isort on all files -``` - -## Using the Containers - -The docker-compose is setup to mount the working directory of the repository into each of the containers. That means that all change you make in the git repository will automatically show up in the containers, and vice versa. - -The typical workflow is to do all text editing and git commands in the local host, but _run_ all the code inside the container -- either in the bash executor or the Jupyter notebook. As mentioned earlier, the containers are the _runtime_ -- they have a consistent operating system (Ubuntu), drivers, libraries, and dependencies. It ensures that the runtime is consistent across all developers and compute environments -- from your local laptop to the cloud. This is the purpose of containerization. If you would like to read more about benefits of containerization read [here](https://dzone.com/articles/5-key-benefits-docker-ci). - -Let's go over each of the three containers and how to use them. - -### Bash Executor - -This is the container that you should go into to run experiments, e.g. etl, training, evaluation, prediction. Use the command below to go into the container from the command line: - -```bash -docker exec -it /bin/bash -``` - -In the example above, that command would be: - -```bash -docker exec -it {{cookiecutter.package_name}}_bash-executer_ /bin/bash -``` - -### Jupyter Server - -This is the container that you should browse to in your web browser to access Jupyter notebooks. 
You should go to the local port that the Jupyter server is mapped to access the standard Jupyter notebook UI, e.g. `http://localhost:`. You can look up that port using the `docker ps` command. In the example above, the user should browse to `http://localhost:32769` to see the UI. - -### MLFlow Server +In this section we will go over the most common usage pattern in this repo -- running an ML experiment -- which includes training a model and evaluating its performance. Let's walk through an example step by step. -We are using MLFlow to log all of our experiments. You can read more about MLFlow and it's benefits on their [website](https://mlflow.org/). Similar to the Jupyter notebook, you should go to the port that the MLFlow server is mapped to access the MLFlow UI, e.g.`http://localhost:`. You can look up that port using the `docker ps` command. In the example above, the user should browse to `http://localhost:32770` to see the UI. - -# Experimentation Workflow - -In this section we will go over the most common usage pattern in this repo -- running an ML experiment -- which includes training a model and evaluating it's performance. Let's walk through an example step by step. - -## Step 1: Get the Data +### Step 1: Get the Data The first thing is to get the data in the right place. This sounds easy, but it is a common source of error. Download the data and copy it to the right place. Typically we put raw data in `data/raw/` and ETLed data in `data/processed/` -## Step 2: Create a Configuration File +### Step 2: EDA + +EDA and other exploratory work typically happens in notebooks. We recommend +organizing your notebooks within the `notebooks/` directory, using a file +structure and naming convention that works for your project (e.g., +individual-specific notebooks should be prefaced with that person's initials, +etc.). Note that the `notebooks/scratch/` subdirectory is by default not tracked +in git, which makes it a good place for one-off analyses, or as a staging area +to avoid polluting your `git status`. + +### Step 3: Create a Configuration File We have designed the code so that all experimentation is controlled by configuration files. You don't need to edit source code to run experiments. This is by design. @@ -139,21 +38,19 @@ There are a number of example configuration files checked into the repo [here](c You can read about the configuration file structure [here](configs/README.md). In the following we will work with the configuration file example `config_example.yml` located [here](configs/examples/config_example.yml). -## Step 3: ETL the data +### Step 4: ETL the data -Once you have the data in the right place and a configuration file, the next step is to prepare your data for training. We do this by invoking the etl script: +Once you have the data in the right place and a configuration file, the next step is to prepare your data for training. We do this by writing and invoking an ETL script: ```bash -python {{cookiecutter.package_name}}/scripts/etl.py configs/examples/config_example.yml +$ make bash +$ python {{cookiecutter.package_name}}/scripts/etl.py configs/examples/config_example.yml ``` -Note that this command should be run inside the bash executor container, not on your local host. While running this command you will see info logging to the screen that tells you what the code is doing and how it is splitting the data. +Note that we use `make bash` to ensure that the code runs in the jupyter container, not on your local host. 
+While running this command, you should see info logging to the screen that tells you what the code is doing and how it is splitting the data. -```bash ->>>> put example output here <<<< -``` - -## Step 4: Train and Evaluate a Model +### Step 5: Train and Evaluate a Model The most common script run in this repository is the `evaluate.py` script. It trains and evaluates a model. We run this script as follows: @@ -163,19 +60,11 @@ python {{cookiecutter.package_name}}/scripts/evaluate.py configs/examples/config By default, this script will retrain a model from scratch, but you can point it do use an already trained model. We'll cover that later in the README. For now, lets assume that we are training a model from scratch. When you run the script, you will see output like the following below: -```bash ->>>> put example output here <<<< -``` - The log shows what loss is being used, the model architecture, and many more things. All of these things are configurable from the config file. It's good practice to check that what is being run is what you expect. A common failure mode is human error in the configuration files leading to you to not run the experiment you expected. After training, the code runs a number of evaluation metrics and plots and logs these all to MLFlow. If you go to MLFlow after an evaluate run, you can see all the parameters, metrics, and artifacts logged for that run. As a best practice, you should only look at the models and figures that are persisted inside MLFlow. That is the source of truth and our experiment bookkeeping. During the evaluation phase the logging should look like: -```bash ->>>> put example output here <<<< -``` - -## Step 5: View Run in MLFlow +### Step 6: View Run in MLFlow Once you have run an experiment, you want to look at the results in MLFlow. The easiest way is to use the MLFlow web UI. If you open up the MLFlow UI on your browser at `http://localhost:` you should see the MLFlow UI like below. ![mlflow](docs/imgs/mlflow.png?raw=true "MLFlow UI") @@ -184,23 +73,23 @@ The experiments are on the left. Experiments are a MLFlow construct to group tog We log three things in a run: parameters, metrics, and artifacts. We go into each of them in more detail below. -### Parameters +#### Parameters These are the configuration parameters for a specific run, like model name, loss, batch size, etc. We log most of the important ones, but not everything. If you want the detailed information about the run you should look at the `config.yml` artifact for that run. That contains all of the information about that run. -### Metrics +#### Metrics These are top level summary metrics that tell you how the run performed. Most of them, unless otherwise noted, are computed on the test set. Typically, there is one number we like to look `mse` -- the mean squared error between the actual and prediction. This is our error metric of interest. Lower `mse` is better. -### Artifacts +#### Artifacts These are the file artifacts associated with a run. These include the logs, config file, and most importantly the error analysis figures. There are a number of interesting figures to look at. Note that the error analysis is typically always done on the test set, i.e. data that the model has never seen. ![mlflow](docs/imgs/mlflow_detail.png?raw=true "MLFlow Detail UI") -## Documentation +### Documentation The repo created is automatically setup with Sphinx to build HTML documentation. If you write your pydoc in Google Style format, the parser will automatically generate the documentation. 
See [Google Style Python Docstrings](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) for examples of such docstrings. @@ -214,11 +103,11 @@ The built documentation can be configured in the `./docsrc` directory. The two m We have a specific structure for our docs directory which builds inside `./docsrc/_build` and then copies the built documentation over to `./docs` -- which by default is under source control. This folder can be easily setup using GitHub Pages to serve up your documentation as a static site on GitHub. For more information on how to set that up, please look at the documentation on [GitHub pages](https://pages.github.com/) and this [article](https://www.docslikecode.com/articles/github-pages-python-sphinx/). -## Conclusion +### Conclusion This is the basic workflow! You can run this locally or on a cloud machine. When working on a cloud machine you will likely need to to ssh tunnelling, screen, and other tools to make sure you can view MLFLow and don't have issues with network pipes breaking. -# Project Structure +## Project Structure ``` ├── LICENSE @@ -243,9 +132,8 @@ This is the basic workflow! You can run this locally or on a cloud machine. When ├── experiments <- Where to store different model experiments, e.g., model pkls and analysis ├── logging.yml <- Logging config file ├── logs <- Logging directory -├── notebooks <- Jupyter notebooks. Naming convention is a number (for ordering), -│ the creator's initials, and a short `-` delimited description, e.g. -│ `1.0-jqp-initial-data-exploration`. +├── notebooks <- Jupyter notebooks. +│   └── scratch <- Notebooks not tracked by git ├── pull_request_template.md <- Pull request template for GitHub ├── pyproject.toml <- Config file used by black ├── tox.ini <- tox config file with settings for flake @@ -269,21 +157,20 @@ This is the basic workflow! You can run this locally or on a cloud machine. When --- -# Developer Workflow +## Developer Workflow Continued development in this repo is straightforward. The scaffolding is meant to be extensible -- you should add your own models, loss functions, feature engineering pipelines, etc. For brevity we are not putting information about the intended development workflow in this README. Look [here](docs/develop.md) for more information about the intended development workflow for this repo. + ## DVC Orbyter uses DVC to perform data versioning on datasets and artifacts. ### Adding a Remote to DVC - DVC uses remote repositories to store versions of data and make it easy to share with a team. If you're adding a single remote or the first of multiple, include the ``--default`` in the command as shown below. DVC determines the type of remote (file location on disk, s3 bucket, etc.) based on the `remote_url` argumentsw Run `dvc remote add --default ` - ### Tracking Files 1. `dvc add ` - track artifact in git @@ -294,4 +181,3 @@ Run `dvc remote add --default ` 1. `git checkout ` 2. `dvc pull` - sync local cache with remote cache based on current git HEAD 3. `dvc checkout` - link files in local cache to appropriate location in project repo. - diff --git a/{{ cookiecutter.project_name }}/SETUP.md b/{{ cookiecutter.project_name }}/SETUP.md new file mode 100644 index 0000000..101cf64 --- /dev/null +++ b/{{ cookiecutter.project_name }}/SETUP.md @@ -0,0 +1,108 @@ +# Setup + +This document describes how to set up your environment to work with this repository. + +## Get the repository + +Clone the code to your machine using the standard Git clone command. 
If you have SSH keys set up, the command is (where you should replace `manifoldai` if working in a different org): + +```bash +git clone git@github.com:manifoldai/{{cookiecutter.package_name}}.git +``` + +Important note: **Do not clone your code into Google Drive or DropBox**. There are known issues with MLFlow interacting with the file sync that is happening in the background. Clone to a directory that is not being synced by one of those services. + +## Docker + +You will need to have Docker and docker-compose installed on your system to run the code. + +### Install Docker + +- For Mac: https://store.docker.com/editions/community/docker-ce-desktop-mac +- For Windows: https://store.docker.com/editions/community/docker-ce-desktop-windows +- For Linux: Go to this page and choose the appropriate install for your Linux distro: https://www.docker.com/community-edition + +### Install Docker Compose: + +``` +$ sudo curl -L https://github.com/docker/compose/releases/download/1.21.0/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose +$ sudo chmod +x /usr/local/bin/docker-compose +``` + +Test the installation: + +``` +$ docker-compose --version +docker-compose version 1.21.0, build 1719ceb +``` + +## Start Docker Containers + +The runtime for {{cookiecutter.project_name}} is inside a Docker container. We have a `make` command to launch the appropriate containers. To launch the docker containers and begin working on a CPU, run from the root directory of the repository: + +```bash +make dev-start +``` + +This builds images using the Dockerfile in docker/Dockerfile, and runs containers named after the project directory. To see the running containers, run `docker ps`. + +You should see two containers running. Note that the local ports could be different on your machine: this is to be expected. + +```bash +$ docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +c5df791a2731 {{ cookiecutter.package_name }}_jupyter "bash -c 'cd /mnt &&…" 3 seconds ago Up 1 second 127.0.0.1:55003->8888/tcp {{ cookiecutter.package_name }}_jupyter_ +451fa5f876dc {{ cookiecutter.package_name }}_mlflow "bash -c 'mlflow ser…" 3 seconds ago Up 1 second 127.0.0.1:55004->5000/tcp {{ cookiecutter.package_name }}_mlflow_ +``` + +Note the local addresses 127.0.0.1:55003 and 127.0.0.1:55004, which tell us where to access the Jupyterlab server and the MLFlow server respectively (see below for more details). + +We have also provided `make dev-stop` and `make dev-rebuild` to stop all containers and to rebuild the containers respectively. + +## Makefile + +We use `make` to run most of the typical developer operations, e.g. `make dev-start`, etc. For a full list of make commands, run: + +```bash +make help +``` + +The `make` commands supported out of the box are: + +``` +bash Provides an interactive bash shell in the container +black Runs black auto-linter +ci-black Test lint compliance using black. 
Config in pyproject.toml file
+ci-mypy              Runs mypy type checker
+ci-test-interactive  Runs unit tests with interactive IPDB session at the first failure
+ci-test              Runs unit tests using pytest
+ci                   Check black, flake8, mypy, and run unit tests
+clean                Clean out temp/compiled python files
+dev-rebuild          Rebuild images for dev containers (useful when Dockerfile/requirements are updated)
+dev-start            Primary make command for devs, spins up containers
+dev-stop             Spin down active containers
+docs                 Build docs using Sphinx and copy to docs folder (this makes it easy to publish to gh-pages)
+format               Formats repo by running black and isort on all files
+git-tag              Tag in git, then push tag up to origin
+ipython              Provides an interactive ipython prompt
+isort                Runs isort to sort imports
+```
+
+## Using the Containers
+
+The `docker-compose.yml` file is set up to mount the working directory of the repository into each of the containers. That means that all changes you make in the git repository will automatically show up in the containers, and vice versa.
+
+The typical workflow is to do all text editing and git commands on the local host, but _run_ all the code inside the jupyter container, either in a notebook or through the shell. As mentioned earlier, the containers are the _runtime_ -- they have a consistent operating system (Ubuntu), drivers, libraries, and dependencies. This ensures that the runtime is consistent across all developers and compute environments -- from your local laptop to the cloud. This is the purpose of containerization. If you would like to read more about the benefits of containerization, read [here](https://dzone.com/articles/5-key-benefits-docker-ci).
+
+Let's go over the two containers and how to use them.
+
+### Jupyter Container
+
+This container will run all your code. It's running a Jupyterlab server on port 8888 of the container, which is mapped to a port on your local machine as described above.
+
+You should also use this container to run experiments, preprocessing, and any similar
+tasks. We typically run these through an interactive bash session (`make bash`).
+
+### MLFlow Server
+
+We use MLFlow to track and log all of our experiments. You can read more about MLFlow and its benefits on the [MLFlow website](https://mlflow.org/). Just like the Jupyterlab server, you can access it using the corresponding port in the output of `docker ps`.
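+
+## Finding the Mapped Ports
+
+Because the host ports are assigned dynamically, it is often quicker to ask Docker for them than to remember them. The commands below are one way to do this; the container name is illustrative, so copy the exact name from your own `docker ps` output:
+
+```bash
+# Show only this project's containers and their port mappings
+docker ps --filter "name={{ cookiecutter.package_name }}"
+
+# Print the host address mapped to Jupyterlab's port 8888 inside the container
+# (replace the container name with the one shown by `docker ps` on your machine)
+docker port {{ cookiecutter.package_name }}_jupyter_1 8888
+```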
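+
+## Typical Development Loop
+
+Once the containers are up, day-to-day work only needs the `make` targets listed above. One possible sequence (adapt it to your own habits):
+
+```bash
+# Auto-format the code (isort + black), then run the same checks CI runs
+make format
+make ci
+
+# If a unit test fails, re-run the suite with an interactive IPDB session at the failure
+make ci-test-interactive
+
+# Clean out temp/compiled python files
+make clean
+```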
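+
+## Running a First Experiment
+
+To sanity-check the full setup end to end, you can run the example experiment described in the [README](README.md). The paths below assume the example config that ships with the scaffold; substitute your own config file as needed:
+
+```bash
+# Open a shell inside the jupyter container
+make bash
+
+# Inside the container: ETL the data, then train and evaluate a model
+python {{cookiecutter.package_name}}/scripts/etl.py configs/examples/config_example.yml
+python {{cookiecutter.package_name}}/scripts/evaluate.py configs/examples/config_example.yml
+```
+
+When the run finishes, open the MLFlow UI (the host port mapped to 5000 in `docker ps`) to inspect the logged parameters, metrics, and artifacts.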
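+
+## Building the Documentation
+
+The Sphinx docs are built inside the container and copied to `docs/` so they can be served by GitHub Pages. To build them and preview the result locally (the preview port is arbitrary):
+
+```bash
+# Build the HTML docs and copy them to ./docs
+make docs
+
+# Preview the generated site from the host
+python -m http.server --directory docs 8000
+```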
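+
+## Cutting a Release
+
+The `git-tag` target reads the tag from the `VERSION` file at the repo root (see the Makefile), so tagging a release is a two-step process. For example, to tag `0.2.0` (use your own version number):
+
+```bash
+# Bump the version, then create the git tag and push it to origin
+echo "0.2.0" > VERSION
+make git-tag
+```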
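+
+## Setting Up DVC (Optional)
+
+If you plan to use DVC for data versioning (see the DVC section of the [README](README.md)), configuring the remote is also a one-time setup step. A sketch with a hypothetical S3 remote -- substitute your team's actual remote name and URL, and your own data file:
+
+```bash
+# Register a default DVC remote (the name and bucket below are placeholders)
+dvc remote add --default myremote s3://my-bucket/dvc-storage
+
+# Track a dataset and push it to the remote
+dvc add data/raw/my_dataset.csv
+git add data/raw/my_dataset.csv.dvc data/raw/.gitignore
+git commit -m "Track raw dataset with DVC"
+dvc push
+```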
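+
+## Working on a Remote Machine
+
+As noted in the README, when the containers run on a cloud VM you will typically need an SSH tunnel to reach the Jupyterlab and MLFlow UIs from your laptop. A sketch -- replace the ports with the ones shown by `docker ps` on the VM, and the host with your own address:
+
+```bash
+# Forward the remote Jupyterlab and MLFlow ports to local ports 8888 and 5000
+ssh -N -L 8888:127.0.0.1:55003 -L 5000:127.0.0.1:55004 you@your-cloud-vm
+```
+
+With the tunnel open, Jupyterlab is available at `http://localhost:8888` and MLFlow at `http://localhost:5000` on your laptop.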