aimhubio · osoblanco · Dec 19, 2022 · Dec 19, 2022 · Jan 31, 2023 · alberttorosyan
diff --git a/aim_spacy/base_logger.py b/aim_spacy/base_logger.py
@@ -75,13 +75,14 @@ def aim_log_step(info: Optional[Dict[str, Any]]):
                     if isinstance(other_scores, dict):
                         for score_name, loss_value in other_scores.items():
                             if not isinstance(loss_value, dict):
-                                aim_run.track(loss_value, name=loss_name, context={'type':f'other_scores_{score_name}'}, epoch=epoch, step=step)
+                                if not score_name.endswith('_desc'): 
+                                    aim_run.track(loss_value, name=loss_name, context={'type':f'other_scores_{score_name}'}, epoch=epoch, step=step)
 
                     if model_log_interval is not None:
                         if (info["step"] % model_log_interval == 0 and info["step"] != 0):
-
-                            displacy_input = dict(docs=logging_handler.data, style=experiment_type, caption=f'Visualization at step: {info["step"]}')                   
-                            aim_run.track(aim_displacy(**displacy_input), step=step, epoch=epoch, name='Parsing', context={'type': experiment_type})                           
+                            if viz_path is not None:
+                                displacy_input = dict(docs=logging_handler.data, style=experiment_type, caption=f'Visualization at step: {info["step"]}')                   
+                                aim_run.track(aim_displacy(**displacy_input), step=step, epoch=epoch, name='Parsing', context={'type': experiment_type})                           
 
 
             def aim_finalize():

diff --git a/examples/textcat_demo/.gitignore b/examples/textcat_demo/.gitignore
@@ -0,0 +1,3 @@
+corpus
+packages
+training
diff --git a/examples/textcat_demo/README.md b/examples/textcat_demo/README.md
@@ -0,0 +1,49 @@
+<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->
+
+# 🪐 spaCy Project: Demo Textcat (Text Classification)
+
+A minimal demo textcat project for spaCy v3. The demo data comes from the [tutorials/textcat_docs_issues](https://github.com/explosion/projects/tree/v3/tutorials/textcat_docs_issues) project.
+
+## 📋 project.yml
+
+The [`project.yml`](project.yml) defines the data assets required by the
+project, as well as the available commands and workflows. For details, see the
+[spaCy projects documentation](https://spacy.io/usage/projects).
+
+### ⏯ Commands
+
+The following commands are defined by the project. They
+can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run).
+Commands are only re-run if their inputs have changed.
+
+| Command | Description |
+| --- | --- |
+| `convert` | Convert the data to spaCy's binary format |
+| `train` | Train the textcat model |
+| `evaluate` | Evaluate the model and export metrics |
+| `package` | Package the trained model as a pip package |
+| `visualize-model` | Visualize the model's output interactively using Streamlit |
+
+### ⏭ Workflows
+
+The following workflows are defined by the project. They
+can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run)
+and will run the specified commands in order. Commands are only re-run if their
+inputs have changed.
+
+| Workflow | Steps |
+| --- | --- |
+| `all` | `convert` &rarr; `train` &rarr; `evaluate` &rarr; `package` |
+
+### 🗂 Assets
+
+The following assets are defined by the project. They can
+be fetched by running [`spacy project assets`](https://spacy.io/api/cli#project-assets)
+in the project directory.
+
+| File | Source | Description |
+| --- | --- | --- |
+| [`assets/docs_issues_training.jsonl`](assets/docs_issues_training.jsonl) | Local | Demo training data |
+| [`assets/docs_issues_eval.jsonl`](assets/docs_issues_eval.jsonl) | Local | Demo development data |
+
+<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
diff --git a/examples/textcat_demo/assets/.gitattributes b/examples/textcat_demo/assets/.gitattributes
@@ -0,0 +1,6 @@
+# This is needed to ensure that text-based assets included with project
+# templates and cloned via Git end up with consistent line endings and
+# the same checksums. It will prevent Git from converting line endings.
+# Otherwise, a user cloning assets on Windows may end up with a different
+# checksum due to different line endings.
+*	-text
diff --git a/examples/textcat_demo/assets/docs_issues_eval.jsonl b/examples/textcat_demo/assets/docs_issues_eval.jsonl
diff --git a/examples/textcat_demo/assets/docs_issues_training.jsonl b/examples/textcat_demo/assets/docs_issues_training.jsonl
diff --git a/examples/textcat_demo/configs/config.cfg b/examples/textcat_demo/configs/config.cfg
@@ -0,0 +1,146 @@
+[paths]
+train = null
+dev = null
+vectors = null
+init_tok2vec = null
+
+[system]
+seed = 0
+gpu_allocator = null
+
+[nlp]
+lang = "en"
+pipeline = ["textcat"]
+disabled = []
+before_creation = null
+after_creation = null
+after_pipeline_creation = null
+batch_size = 1000
+tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+[components]
+
+[components.textcat]
+factory = "textcat"
+threshold = 0.5
+
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v2"
+nO = null
+
+[components.textcat.model.linear_model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = true
+ngram_size = 1
+no_output_layer = false
+nO = null
+
+[components.textcat.model.tok2vec]
+@architectures = "spacy.Tok2Vec.v2"
+
+[components.textcat.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = 64
+rows = [2000,2000,1000,1000,1000,1000]
+attrs = ["ORTH","LOWER","PREFIX","SUFFIX","SHAPE","ID"]
+include_static_vectors = false
+
+[components.textcat.model.tok2vec.encode]
+@architectures = "spacy.MaxoutWindowEncoder.v2"
+width = 64
+window_size = 1
+maxout_pieces = 3
+depth = 2
+
+[corpora]
+
+[corpora.dev]
+@readers = "spacy.Corpus.v1"
+path = ${paths.dev}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[corpora.train]
+@readers = "spacy.Corpus.v1"
+path = ${paths.train}
+gold_preproc = false
+max_length = 0
+limit = 0
+augmenter = null
+
+[training]
+seed = ${system.seed}
+gpu_allocator = ${system.gpu_allocator}
+dropout = 0.1
+accumulate_gradient = 1
+patience = 1000
+max_epochs = 0
+max_steps = 1000
+eval_frequency = 100
+frozen_components = []
+dev_corpus = "corpora.dev"
+train_corpus = "corpora.train"
+before_to_disk = null
+
+[training.batcher]
+@batchers = "spacy.batch_by_words.v1"
+discard_oversize = false
+tolerance = 0.2
+get_length = null
+
+[training.batcher.size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+t = 0.0
+
+[training.logger]
+@loggers = "spacy.AimLogger.v1"
+repo = "./"
+experiment_name = "text_categorizer"
+viz_path = null
+model_log_interval = 400
+image_size = "3000,500"
+experiment_type = "text"
+
+
+[training.optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = false
+eps = 0.00000001
+learn_rate = 0.001
+
+[training.score_weights]
+cats_score_desc = null
+cats_micro_p = null
+cats_micro_r = null
+cats_micro_f = null
+cats_macro_p = null
+cats_macro_r = null
+cats_macro_f = null
+cats_macro_auc = null
+cats_f_per_type = null
+cats_macro_auc_per_type = null
+cats_score = 1.0
+
+[pretraining]
+
+[initialize]
+vectors = ${paths.vectors}
+init_tok2vec = ${paths.init_tok2vec}
+vocab_data = null
+lookups = null
+before_init = null
+after_init = null
+
+[initialize.components]
+
+[initialize.tokenizer]
diff --git a/examples/textcat_demo/project.lock b/examples/textcat_demo/project.lock
@@ -0,0 +1,36 @@
+convert:
+  cmd: python -m spacy run convert
+  script:
+    - python scripts/convert.py en assets/docs_issues_training.jsonl corpus/train.spacy
+    - python scripts/convert.py en assets/docs_issues_eval.jsonl corpus/dev.spacy
+  deps:
+    - path: assets/docs_issues_training.jsonl
+      md5: 0483964180f039843a6959622ea9f1cc
+    - path: assets/docs_issues_eval.jsonl
+      md5: bf73b2bfef415815b2f1284848fc3308
+    - path: scripts/convert.py
+      md5: 35ee9ecddb27d1dbf74d9bc36fb44d0b
+  outs:
+    - path: corpus/train.spacy
+      md5: d5d8d4f725f6a973449fb293855ddac3
+    - path: corpus/dev.spacy
+      md5: a0cfe05230bd0a98c0a449d80fa1f89b
+  spacy_version: 3.4.4
+  spacy_git_version: Unknown
+train:
+  cmd: python -m spacy run train
+  script:
+    - python -m spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy
+      --paths.dev corpus/dev.spacy --nlp.lang en --gpu-id -1
+  deps:
+    - path: configs/config.cfg
+      md5: fac86ad75c9caddb861e19c11bf31035
+    - path: corpus/train.spacy
+      md5: d5d8d4f725f6a973449fb293855ddac3
+    - path: corpus/dev.spacy
+      md5: a0cfe05230bd0a98c0a449d80fa1f89b
+  outs:
+    - path: training/model-best
+      md5: 6825558e999426cd8778090f2ca2d156
+  spacy_version: 3.4.4
+  spacy_git_version: Unknown
diff --git a/examples/textcat_demo/project.yml b/examples/textcat_demo/project.yml
@@ -0,0 +1,92 @@
+title: "Demo Textcat (Text Classification)"
+description: "A minimal demo textcat project for spaCy v3. The demo data comes from the [tutorials/textcat_docs_issues](https://github.com/explosion/projects/tree/v3/tutorials/textcat_docs_issues) project."
+# Variables can be referenced across the project.yml using ${vars.var_name}
+vars:
+  name: "textcat_demo"
+  # Supported languages: all except ja, ko, th, vi, and zh, which would require
+  # custom tokenizer settings in config.cfg
+  lang: "en"
+  # Set your GPU ID, -1 is CPU
+  gpu_id: -1
+  version: "0.0.0"
+  train: "docs_issues_training.jsonl"
+  dev: "docs_issues_eval.jsonl"
+  config: "config.cfg"
+
+# These are the directories that the project needs. The project CLI will make
+# sure that they always exist.
+directories: ["assets", "corpus", "configs", "training", "scripts", "packages"]
+
+# Assets that should be downloaded or available in the directory. We're shipping
+# them with the project, so they won't have to be downloaded.
+assets:
+  - dest: "assets/${vars.train}"
+    description: "Demo training data"
+  - dest: "assets/${vars.dev}"
+    description: "Demo development data"
+
+# Workflows are sequences of commands (see below) executed in order. You can
+# run them via "spacy project run [workflow]". If a command's inputs/outputs
+# haven't changed, it won't be re-run.
+workflows:
+  all:
+    - convert
+    - train
+    - evaluate
+    - package
+
+# Project commands, specified in a style similar to CI config files (e.g. Azure
+# pipelines). The name is the command name that lets you trigger the command
+# via "spacy project run [command] [path]". The help message is optional and
+# shown when executing "spacy project run [optional command] [path] --help".
+commands:
+  - name: "convert"
+    help: "Convert the data to spaCy's binary format"
+    script:
+      - "python scripts/convert.py ${vars.lang} assets/${vars.train} corpus/train.spacy"
+      - "python scripts/convert.py ${vars.lang} assets/${vars.dev} corpus/dev.spacy"
+    deps:
+      - "assets/${vars.train}"
+      - "assets/${vars.dev}"
+      - "scripts/convert.py"
+    outputs:
+      - "corpus/train.spacy"
+      - "corpus/dev.spacy"
+
+  - name: "train"
+    help: "Train the textcat model"
+    script:
+      - "python -m spacy train configs/${vars.config} --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy --nlp.lang ${vars.lang} --gpu-id ${vars.gpu_id}"
+    deps:
+      - "configs/${vars.config}"
+      - "corpus/train.spacy"
+      - "corpus/dev.spacy"
+    outputs:
+      - "training/model-best"
+
+  - name: "evaluate"
+    help: "Evaluate the model and export metrics"
+    script:
+      - "python -m spacy evaluate training/model-best corpus/dev.spacy --output training/metrics.json"
+    deps:
+      - "corpus/dev.spacy"
+      - "training/model-best"
+    outputs:
+      - "training/metrics.json"
+
+  - name: package
+    help: "Package the trained model as a pip package"
+    script:
+      - "python -m spacy package training/model-best packages --name ${vars.name} --version ${vars.version} --force"
+    deps:
+      - "training/model-best"
+    outputs_no_cache:
+      - "packages/${vars.lang}_${vars.name}-${vars.version}/dist/${vars.lang}_${vars.name}-${vars.version}.tar.gz"
+
+  - name: visualize-model
+    help: Visualize the model's output interactively using Streamlit
+    script:
+      - "streamlit run scripts/visualize_model.py training/model-best \"provision Portland K8s cluster\""
+    deps:
+      - "scripts/visualize_model.py"
+      - "training/model-best"
diff --git a/examples/textcat_demo/requirements.txt b/examples/textcat_demo/requirements.txt
@@ -0,0 +1,2 @@
+spacy-streamlit>=1.0.0a0
+streamlit
diff --git a/examples/textcat_demo/scripts/convert.py b/examples/textcat_demo/scripts/convert.py
@@ -0,0 +1,22 @@
+"""Convert textcat annotation from JSONL to spaCy v3 .spacy format."""
+import srsly
+import typer
+import warnings
+from pathlib import Path
+
+import spacy
+from spacy.tokens import DocBin
+
+
+def convert(lang: str, input_path: Path, output_path: Path):
+    nlp = spacy.blank(lang)
+    db = DocBin()
+    for line in srsly.read_jsonl(input_path):
+        doc = nlp.make_doc(line["text"])
+        doc.cats = line["cats"]
+        db.add(doc)
+    db.to_disk(output_path)
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    typer.run(convert)  # typer builds the command-line interface from convert's signature
diff --git a/examples/textcat_demo/scripts/visualize_model.py b/examples/textcat_demo/scripts/visualize_model.py
@@ -0,0 +1,14 @@
+import spacy_streamlit
+import typer
+
+
+def main(models: str, default_text: str):
+    models = [name.strip() for name in models.split(",")]
+    spacy_streamlit.visualize(models, default_text, visualizers=["textcat"])
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+    try:
+        typer.run(main)  # typer builds the command-line interface from main's signature
+    except SystemExit:  # deliberately swallowed; NOTE(review): presumably so typer's exit is not reported as a failure under `streamlit run` — confirm
+        pass