Demo textcat project
* The provided data is the `textcat_docs_issues` data in a simpler JSONL format
* Both `textcat` and `textcat_multilabel` data and configs are provided;
  the default is `textcat`
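
Each line of the JSONL files is a standalone JSON object. As a rough illustration (the field names and labels below are assumptions based on spaCy's textcat conventions, not taken from the data itself), a record can be parsed like this:

```python
import json

# Hypothetical record; the actual field names and labels in the demo
# data are not shown in this commit view.
line = '{"text": "Fix broken link in the Matcher docs", "cats": {"DOCUMENTATION": 1.0, "OTHER": 0.0}}'
record = json.loads(line)
print(record["text"], record["cats"])
```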
adrianeboyd committed Feb 18, 2021
1 parent 5fc176d commit b3592d9
Showing 14 changed files with 2,803 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pipelines/textcat_demo/.gitignore
@@ -0,0 +1,3 @@
corpus
packages
training
49 changes: 49 additions & 0 deletions pipelines/textcat_demo/README.md
@@ -0,0 +1,49 @@
<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->

# 🪐 spaCy Project: Demo Textcat (Text Classification)

A minimal demo textcat project for spaCy v3. The demo data comes from the [tutorials/textcat_docs_issues](https://github.com/explosion/projects/tree/v3/tutorials/textcat_docs_issues) project.

## 📋 project.yml

The [`project.yml`](project.yml) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation](https://spacy.io/usage/projects).

### ⏯ Commands

The following commands are defined by the project. They
can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run).
Commands are only re-run if their inputs have changed.

| Command | Description |
| --- | --- |
| `convert` | Convert the data to spaCy's binary format |
| `train` | Train the textcat model |
| `evaluate` | Evaluate the model and export metrics |
| `package` | Package the trained model as a pip package |
| `visualize-model` | Visualize the model's output interactively using Streamlit |
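
For instance, the `convert` step typically parses each JSONL record into a `Doc`, assigns `doc.cats`, and serializes the result with `DocBin`. A minimal sketch of such a script, assuming a `{"text": ..., "cats": {...}}` record shape and the `corpus` output directory from the `.gitignore` (the actual conversion script is not shown in this diff):

```python
import json
from pathlib import Path

import spacy
from spacy.tokens import DocBin

def convert(input_path: Path, output_path: Path) -> None:
    nlp = spacy.blank("en")  # tokenizer only, no trained components needed
    db = DocBin()
    for line in input_path.open(encoding="utf8"):
        record = json.loads(line)
        doc = nlp.make_doc(record["text"])
        doc.cats = record["cats"]  # label -> 1.0 / 0.0 gold annotations
        db.add(doc)
    db.to_disk(output_path)

convert(Path("assets/docs_issues_training.jsonl"), Path("corpus/train.spacy"))
convert(Path("assets/docs_issues_eval.jsonl"), Path("corpus/dev.spacy"))
```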

### ⏭ Workflows

The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed.

| Workflow | Steps |
| --- | --- |
| `all` | `convert` &rarr; `train` &rarr; `evaluate` &rarr; `package` |
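
Running `spacy project run all` from the project directory therefore chains the four commands above, skipping any step whose inputs are unchanged. The same thing can be scripted from Python via spaCy's CLI helpers, though note that `project_run` is an internal helper rather than a stable public API; the supported entry point is the `spacy project run` CLI:

```python
from pathlib import Path

from spacy.cli.project.run import project_run

# Equivalent to `spacy project run all` in the project directory:
# runs convert -> train -> evaluate -> package in order.
project_run(Path("."), "all")
```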

### 🗂 Assets

The following assets are defined by the project. They can
be fetched by running [`spacy project assets`](https://spacy.io/api/cli#project-assets)
in the project directory.

| File | Source | Description |
| --- | --- | --- |
| [`assets/docs_issues_training_multilabel.jsonl`](assets/docs_issues_training_multilabel.jsonl) | Local | Demo training data |
| [`assets/docs_issues_eval_multilabel.jsonl`](assets/docs_issues_eval_multilabel.jsonl) | Local | Demo development data |
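
Since both assets are local, `spacy project assets` essentially verifies that the files on disk match the checksums recorded in `project.yml` (if any are recorded). A rough sketch of that check, assuming MD5 as the hash algorithm, which is what spaCy's checksum helper has used:

```python
import hashlib
from pathlib import Path

# Reproduce the file hash that spaCy compares against the checksum
# recorded in project.yml (MD5 is an assumption about the algorithm).
data = Path("assets/docs_issues_training_multilabel.jsonl").read_bytes()
print(hashlib.md5(data).hexdigest())
```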

<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
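
Once the `train` step has produced a pipeline, it can be loaded directly from the training output directory. A usage sketch, assuming spaCy's default `training/model-best` output layout (the input text and printed labels are illustrative):

```python
import spacy

# Path follows spaCy's default training output layout; adjust if the
# project.yml writes the model elsewhere.
nlp = spacy.load("training/model-best")

doc = nlp("The documentation page for Matcher patterns returns a 404.")
# doc.cats maps each label to a score; with the exclusive-class `textcat`
# component the scores across labels sum to 1.0.
best_label = max(doc.cats, key=doc.cats.get)
print(best_label, doc.cats)
```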
6 changes: 6 additions & 0 deletions pipelines/textcat_demo/assets/.gitattributes
@@ -0,0 +1,6 @@
# This is needed to ensure that text-based assets included with project
# templates and cloned via Git end up with consistent line endings and
# the same checksums. It will prevent Git from converting line endings.
# Otherwise, a user cloning assets on Windows may end up with a different
# checksum due to different line endings.
* -text
500 changes: 500 additions & 0 deletions pipelines/textcat_demo/assets/docs_issues_eval.jsonl

Large diffs are not rendered by default.

500 changes: 500 additions & 0 deletions pipelines/textcat_demo/assets/docs_issues_eval_multilabel.jsonl

Large diffs are not rendered by default.

661 changes: 661 additions & 0 deletions pipelines/textcat_demo/assets/docs_issues_training.jsonl

Large diffs are not rendered by default.

661 changes: 661 additions & 0 deletions pipelines/textcat_demo/assets/docs_issues_training_multilabel.jsonl

Large diffs are not rendered by default.

140 changes: 140 additions & 0 deletions pipelines/textcat_demo/configs/config.cfg
@@ -0,0 +1,140 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.textcat]
factory = "textcat"
threshold = 0.5

[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
nO = null

[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.textcat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000,2000,1000,1000,1000,1000]
attrs = ["ORTH","LOWER","PREFIX","SUFFIX","SHAPE","ID"]
include_static_vectors = false

[components.textcat.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 64
window_size = 1
maxout_pieces = 3
depth = 2

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1000
max_epochs = 0
max_steps = 1000
eval_frequency = 100
frozen_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null
cats_score = 1.0

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
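
This config fully specifies the pipeline, so it can be used both by the training CLI (e.g. `spacy train configs/config.cfg --output training/ --paths.train corpus/train.spacy --paths.dev corpus/dev.spacy`, assuming the `corpus` layout from the `.gitignore`) and to instantiate a blank-but-configured pipeline in Python. A small sketch using spaCy's config utilities:

```python
from spacy import util

# Build an (uninitialized) pipeline from the config; auto_fill completes
# any defaults the file leaves unspecified.
config = util.load_config("configs/config.cfg")
nlp = util.load_model_from_config(config, auto_fill=True)
print(nlp.pipe_names)  # ['textcat']
```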
140 changes: 140 additions & 0 deletions pipelines/textcat_demo/configs/config_multilabel.cfg
@@ -0,0 +1,140 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat_multilabel"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.textcat_multilabel]
factory = "textcat_multilabel"
threshold = 0.5

[components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
nO = null

[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.textcat_multilabel.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000,2000,1000,1000,1000,1000]
attrs = ["ORTH","LOWER","PREFIX","SUFFIX","SHAPE","ID"]
include_static_vectors = false

[components.textcat_multilabel.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 64
window_size = 1
maxout_pieces = 3
depth = 2

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1000
max_epochs = 0
max_steps = 1000
eval_frequency = 100
frozen_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null
cats_score = 1.0

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
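
The multilabel config differs from the default one in only two places: the component and factory name (`textcat_multilabel` instead of `textcat`) and `exclusive_classes = false` in the bag-of-words model. Per-label scores are therefore independent, so several labels can exceed the 0.5 threshold at once. An illustrative sketch (label names and scores are made up):

```python
# With textcat_multilabel, each label is scored independently, so the
# decision is a per-label threshold rather than an argmax.
threshold = 0.5  # matches [components.textcat_multilabel] threshold
doc_cats = {"DOCUMENTATION": 0.81, "BUG": 0.64, "OTHER": 0.09}  # illustrative
predicted = [label for label, score in doc_cats.items() if score >= threshold]
print(predicted)  # ['DOCUMENTATION', 'BUG']
```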