forked from explosion/projects
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathproject.yml
103 lines (87 loc) · 4.83 KB
/
project.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Project metadata: shown in generated docs and CLI output.
title: "Prodigy annotation tool integration"
description: "This project shows how to integrate the [Prodigy](https://prodi.gy) annotation tool (requires **v1.11+**) into your spaCy project template to automatically **export annotations** you've created and **train your model** on the collected data. Note that in order to run this template, you'll need to install Prodigy separately into your environment. For details on how the data was created, check out this [project template](https://github.com/explosion/projects/tree/v3/tutorials/ner_fashion_brands) and [blog post](https://explosion.ai/blog/sense2vec-reloaded#annotation).
> ⚠️ **Important note:** The example in this project uses a separate step `db-in` to export the example annotations into your database, so you can easily run it end-to-end. In your own workflows, you can leave this out and access the given dataset you've annotated directly."
# Variables can be referenced across the project.yml using ${vars.var_name};
# nested keys are referenced as ${vars.section.key} (e.g. ${vars.files.train_file}).
vars:
  config: "config.cfg"  # training config, resolved as configs/${vars.config}
  name: "ner_fashion"   # model name passed to "spacy package"
  version: "0.0.0"      # package version passed to "spacy package"
  # Change this variable if you want to use the GPU (gpu_id: 0)
  gpu_id: -1
  # Example annotation files shipped with the project (under assets/)
  files:
    train_file: "fashion_brands_training.jsonl"
    eval_file: "fashion_brands_eval.jsonl"
  # Names of the Prodigy datasets the example data is loaded into via "db-in"
  prodigy:
    train_dataset: "fashion_brands_training"
    eval_dataset: "fashion_brands_eval"
# Directories the project relies on. The project CLI creates any of these
# that don't already exist.
directories:
  - "assets"
  - "training"
  - "configs"
  - "corpus"
  - "packages"
# Assets that should be downloaded or available in the directory. We're shipping
# them with the project, so they won't have to be downloaded. But the
# 'project assets' command still lets you verify that the checksums match.
assets:
  # NOTE: ${vars.files.train_file} / ${vars.files.eval_file} already end in
  # ".jsonl", so the extension must not be appended again here — otherwise the
  # declared asset paths ("...jsonl.jsonl") would never match the paths the
  # commands actually read ("assets/${vars.files.train_file}").
  - dest: "assets/${vars.files.train_file}"
    checksum: "63373dd656daa1fd3043ce166a59474c"
    description: "JSONL-formatted training data exported from Prodigy, annotated with `FASHION_BRAND` entities (1235 examples)"
  - dest: "assets/${vars.files.eval_file}"
    checksum: "5113dc04e03f079525edd8df3f4f39e3"
    description: "JSONL-formatted development data exported from Prodigy, annotated with `FASHION_BRAND` entities (500 examples)"
# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a command's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  # End-to-end spaCy route: load the example data into Prodigy, export it to
  # spaCy's binary format, then train with "spacy train".
  all:
    - db-in
    - data-to-spacy
    - train_spacy
  # Alternative route: load the example data, then train directly with
  # "prodigy train" (no intermediate .spacy corpus files).
  all_prodigy:
    - db-in
    - train_prodigy
# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
  # Only needed so the example runs end-to-end; see the note in "description".
  - name: "db-in"
    help: "Load data into prodigy (only for example purposes)"
    script:
      - "python -m prodigy db-in ${vars.prodigy.train_dataset} assets/${vars.files.train_file}"
      - "python -m prodigy db-in ${vars.prodigy.eval_dataset} assets/${vars.files.eval_file}"
    deps:
      - "assets/${vars.files.train_file}"
      - "assets/${vars.files.eval_file}"
  - name: "data-to-spacy"
    help: "Merge your annotations and create data in spaCy's binary format"
    script:
      - "python -m prodigy data-to-spacy corpus/ --ner ${vars.prodigy.train_dataset},eval:${vars.prodigy.eval_dataset}"
    outputs:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
  - name: "train_spacy"
    help: "Train a named entity recognition model with spaCy"
    script:
      - "python -m spacy train configs/${vars.config} --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --gpu-id ${vars.gpu_id}"
    deps:
      - "corpus/train.spacy"
      - "corpus/dev.spacy"
    outputs:
      - "training/model-best"
  - name: "train_prodigy"
    help: "Train a named entity recognition model with Prodigy"
    script:
      - "python -m prodigy train training/ --ner ${vars.prodigy.train_dataset},eval:${vars.prodigy.eval_dataset} --config configs/${vars.config} --gpu-id ${vars.gpu_id}"
    outputs:
      - "training/model-best"
  # No outputs: this command is for inspecting the accuracy curve, not for
  # producing cacheable artifacts.
  - name: "train_curve"
    help: "Train the model with Prodigy by using different portions of training examples to evaluate if more annotations can potentially improve the performance"
    script:
      - "python -m prodigy train-curve --ner ${vars.prodigy.train_dataset},eval:${vars.prodigy.eval_dataset} --config configs/${vars.config} --gpu-id ${vars.gpu_id} --show-plot"
  # Name quoted for consistency with the other command names above.
  - name: "package"
    help: "Package the trained model so it can be installed"
    script:
      - "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --force"
    deps:
      - "training/model-best"
    outputs_no_cache:
      - "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"