Add separate textcat_multilabel project
adrianeboyd committed Feb 22, 2021
1 parent 62598b3 commit 88d1628
Showing 13 changed files with 4,776 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pipelines/textcat_multilabel_demo/.gitignore
@@ -0,0 +1,3 @@
corpus
packages
training
49 changes: 49 additions & 0 deletions pipelines/textcat_multilabel_demo/README.md
@@ -0,0 +1,49 @@
<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->

# 🪐 spaCy Project: Demo Textcat (Text Classification)

A minimal demo textcat_multilabel project for spaCy v3.

## 📋 project.yml

The [`project.yml`](project.yml) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation](https://spacy.io/usage/projects).

### ⏯ Commands

The following commands are defined by the project. They
can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run).
Commands are only re-run if their inputs have changed.

| Command | Description |
| --- | --- |
| `convert` | Convert the data to spaCy's binary format |
| `train` | Train the textcat model |
| `evaluate` | Evaluate the model and export metrics |
| `package` | Package the trained model as a pip package |
| `visualize-model` | Visualize the model's output interactively using Streamlit |

### ⏭ Workflows

The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed.

| Workflow | Steps |
| --- | --- |
| `all` | `convert` &rarr; `train` &rarr; `evaluate` &rarr; `package` |

### 🗂 Assets

The following assets are defined by the project. They can
be fetched by running [`spacy project assets`](https://spacy.io/api/cli#project-assets)
in the project directory.

| File | Source | Description |
| --- | --- | --- |
| [`assets/cooking-train.jsonl`](assets/cooking-train.jsonl) | Local | Demo training data |
| [`assets/cooking-dev.jsonl`](assets/cooking-dev.jsonl) | Local | Demo development data |

<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
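
As a rough sketch (not part of this commit's README), the same commands and workflows can also be driven programmatically, mirroring the smoke test included at the bottom of this commit. The project path below is an assumption about where the project has been cloned:

```python
# Minimal sketch: run the "all" workflow (convert -> train -> evaluate -> package)
# from Python instead of the `spacy project run` CLI. Mirrors the included test.
from pathlib import Path

from spacy.cli.project.assets import project_assets
from spacy.cli.project.run import project_run

root = Path("pipelines/textcat_multilabel_demo")  # adjust to your checkout
project_assets(root)      # verify/fetch the assets listed in project.yml
project_run(root, "all")  # commands are skipped if their inputs are unchanged
```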
6 changes: 6 additions & 0 deletions pipelines/textcat_multilabel_demo/assets/.gitattributes
@@ -0,0 +1,6 @@
# This is needed to ensure that text-based assets included with project
# templates and cloned via Git end up with consistent line endings and
# the same checksums. It will prevent Git from converting line endings.
# Otherwise, a user cloning assets on Windows may end up with a different
# checksum due to different line endings.
* -text
428 changes: 428 additions & 0 deletions pipelines/textcat_multilabel_demo/assets/CC_BY-SA-4.0.txt

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions pipelines/textcat_multilabel_demo/assets/README.md
@@ -0,0 +1,9 @@
### Data Source

* https://cooking.stackexchange.com. The meta IDs link to the
original question as `https://cooking.stackexchange.com/questions/ID`, e.g.,
`https://cooking.stackexchange.com/questions/2` for the first instance.
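
For illustration, a small sketch of resolving a record back to its source question. The `"meta"`/`"id"` key names are assumptions here; check the JSONL schema before relying on them:

```python
# Hedged sketch: print the source URL for the first record in the dev set.
import srsly

record = next(iter(srsly.read_jsonl("assets/cooking-dev.jsonl")))
qid = record.get("meta", {}).get("id")  # assumed key names
if qid is not None:
    print(f"https://cooking.stackexchange.com/questions/{qid}")
```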

### Data License

* CC BY-SA 4.0 ([`CC_BY-SA-4.0.txt`](CC_BY-SA-4.0.txt))
2,000 changes: 2,000 additions & 0 deletions pipelines/textcat_multilabel_demo/assets/cooking-dev.jsonl

Large diffs are not rendered by default.

2,000 changes: 2,000 additions & 0 deletions pipelines/textcat_multilabel_demo/assets/cooking-train.jsonl

Large diffs are not rendered by default.

140 changes: 140 additions & 0 deletions pipelines/textcat_multilabel_demo/configs/config.cfg
@@ -0,0 +1,140 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat_multilabel"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.textcat_multilabel]
factory = "textcat_multilabel"
threshold = 0.5

[components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
nO = null

[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.textcat_multilabel.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000,2000,1000,1000,1000,1000]
attrs = ["ORTH","LOWER","PREFIX","SUFFIX","SHAPE","ID"]
include_static_vectors = false

[components.textcat_multilabel.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 64
window_size = 1
maxout_pieces = 3
depth = 2

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1000
max_epochs = 0
max_steps = 2000
eval_frequency = 100
frozen_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null
cats_score = 1.0

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
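
As a rough sketch (not part of this commit), the config above can be loaded with spaCy's config utilities to build the untrained pipeline; actual training is driven by the `train` command in project.yml:

```python
# Sketch: build the (untrained) pipeline described by config.cfg.
# Training itself runs via `spacy train` in the project's "train" command.
from spacy import util

config = util.load_config(
    "configs/config.cfg",
    overrides={"paths.train": "corpus/train.spacy", "paths.dev": "corpus/dev.spacy"},
)
nlp = util.load_model_from_config(config, auto_fill=True)
print(nlp.pipe_names)  # expected: ['textcat_multilabel']
```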
92 changes: 92 additions & 0 deletions pipelines/textcat_multilabel_demo/project.yml
@@ -0,0 +1,92 @@
title: "Demo Textcat (Text Classification)"
description: "A minimal demo textcat_multilabel project for spaCy v3."
# Variables can be referenced across the project.yml using ${vars.var_name}
vars:
name: "textcat_multilabel_demo"
# Supported languages: all except ja, ko, th, vi, and zh, which would require
# custom tokenizer settings in config.cfg
lang: "en"
# Set your GPU ID, -1 is CPU
gpu_id: -1
version: "0.0.0"
train: "cooking-train.jsonl"
dev: "cooking-dev.jsonl"
config: "config.cfg"

# These are the directories that the project needs. The project CLI will make
# sure that they always exist.
directories: ["assets", "corpus", "configs", "training", "scripts", "packages"]

# Assets that should be downloaded or available in the directory. We're shipping
# them with the project, so they won't have to be downloaded.
assets:
- dest: "assets/${vars.train}"
description: "Training data from cooking.stackexchange.com"
- dest: "assets/${vars.dev}"
description: "Development data from cooking.stackexchange.com"

# Workflows are sequences of commands (see below) executed in order. You can
# run them via "spacy project run [workflow]". If a commands's inputs/outputs
# haven't changed, it won't be re-run.
workflows:
  all:
    - convert
    - train
    - evaluate
    - package

# Project commands, specified in a style similar to CI config files (e.g. Azure
# pipelines). The name is the command name that lets you trigger the command
# via "spacy project run [command] [path]". The help message is optional and
# shown when executing "spacy project run [optional command] [path] --help".
commands:
- name: "convert"
help: "Convert the data to spaCy's binary format"
script:
- "python scripts/convert.py ${vars.lang} assets/${vars.train} corpus/train.spacy"
- "python scripts/convert.py ${vars.lang} assets/${vars.dev} corpus/dev.spacy"
deps:
- "assets/${vars.train}"
- "assets/${vars.dev}"
- "scripts/convert.py"
outputs:
- "corpus/train.spacy"
- "corpus/dev.spacy"

- name: "train"
help: "Train the textcat model"
script:
- "python -m spacy train configs/${vars.config} --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --nlp.lang ${vars.lang} --gpu-id ${vars.gpu_id}"
deps:
- "configs/${vars.config}"
- "corpus/train.spacy"
- "corpus/dev.spacy"
outputs:
- "training/model-best"

- name: "evaluate"
help: "Evaluate the model and export metrics"
script:
- "python -m spacy evaluate training/model-best corpus/dev.spacy --output training/metrics.json"
deps:
- "corpus/dev.spacy"
- "training/model-best"
outputs:
- "training/metrics.json"

- name: package
help: "Package the trained model as a pip package"
script:
- "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --force"
deps:
- "training/model-best"
outputs_no_cache:
- "packages/en_${vars.name}-${vars.version}/dist/en_${vars.name}-${vars.version}.tar.gz"

- name: visualize-model
help: Visualize the model's output interactively using Streamlit
script:
- "streamlit run scripts/visualize_model.py training/model-best \"How can I get chewy chocolate chip cookies?\n<p>My chocolate chips cookies are always too crisp. How can I get chewy cookies, like those of Starbucks?</p>\n<hr/>\n<p>Thank you to everyone who has answered. So far the tip that had the biggest impact was to chill and rest the dough, however I also increased the brown sugar ratio and increased a bit the butter. Also adding maple syrup helped. </p>\""
deps:
- "scripts/visualize_model.py"
- "training/model-best"
2 changes: 2 additions & 0 deletions pipelines/textcat_multilabel_demo/requirements.txt
@@ -0,0 +1,2 @@
spacy-streamlit>=1.0.0a0
streamlit
23 changes: 23 additions & 0 deletions pipelines/textcat_multilabel_demo/scripts/convert.py
@@ -0,0 +1,23 @@
"""Convert textcat annotation from JSONL to spaCy v3 .spacy format."""
import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin


def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    docs = []
    for line in srsly.read_jsonl(input_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        docs.append(doc)
    db = DocBin(docs=docs)
    db.to_disk(output_path)


if __name__ == "__main__":
    typer.run(convert)
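
A short sketch of the shape `convert()` expects on the way in and produces on the way out; the example categories are illustrative, not the dataset's actual label set:

```python
# Input: JSONL with one record per line, e.g. (illustrative labels):
#   {"text": "How can I get chewy cookies?", "cats": {"baking": 1.0, "equipment": 0.0}}
# Output: a DocBin (.spacy file) whose docs carry the cats as annotations.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
db = DocBin().from_disk("corpus/dev.spacy")  # written by convert()
doc = next(db.get_docs(nlp.vocab))
print(doc.text[:60], doc.cats)
```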
14 changes: 14 additions & 0 deletions pipelines/textcat_multilabel_demo/scripts/visualize_model.py
@@ -0,0 +1,14 @@
import spacy_streamlit
import typer


def main(models: str, default_text: str):
    models = [name.strip() for name in models.split(",")]
    spacy_streamlit.visualize(models, default_text, visualizers=["textcat"])


if __name__ == "__main__":
    try:
        typer.run(main)
    except SystemExit:
        pass
@@ -0,0 +1,10 @@
from spacy.cli.project.run import project_run
from spacy.cli.project.assets import project_assets
from pathlib import Path


def test_textcat_multilabel_demo_project():
    root = Path(__file__).parent
    project_assets(root)
    project_run(root, "all", capture=True)
    project_run(root, "package", capture=True)
