forked from explosion/projects
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Provided data is the `textcat_docs_issues` data in a simpler JSONL format
* Both `textcat` and `textcat_multilabel` data and configs are provided; the default is `textcat`
- Loading branch information
1 parent
5fc176d
commit b3592d9
Showing 14 changed files with 2,803 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@
corpus
packages
training
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@
<!-- SPACY PROJECT: AUTO-GENERATED DOCS START (do not remove) -->

# 🪐 spaCy Project: Demo Textcat (Text Classification)

A minimal demo textcat project for spaCy v3. The demo data comes from the [tutorials/textcat_docs_issues](https://github.com/explosion/projects/tree/v3/tutorials/textcat_docs_issues) project.

## 📋 project.yml

The [`project.yml`](project.yml) defines the data assets required by the
project, as well as the available commands and workflows. For details, see the
[spaCy projects documentation](https://spacy.io/usage/projects).

### ⏯ Commands

The following commands are defined by the project. They
can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run).
Commands are only re-run if their inputs have changed.

| Command | Description |
| --- | --- |
| `convert` | Convert the data to spaCy's binary format |
| `train` | Train the textcat model |
| `evaluate` | Evaluate the model and export metrics |
| `package` | Package the trained model as a pip package |
| `visualize-model` | Visualize the model's output interactively using Streamlit |

### ⏭ Workflows

The following workflows are defined by the project. They
can be executed using [`spacy project run [name]`](https://spacy.io/api/cli#project-run)
and will run the specified commands in order. Commands are only re-run if their
inputs have changed.

| Workflow | Steps |
| --- | --- |
| `all` | `convert` → `train` → `evaluate` → `package` |

### 🗂 Assets

The following assets are defined by the project. They can
be fetched by running [`spacy project assets`](https://spacy.io/api/cli#project-assets)
in the project directory.

| File | Source | Description |
| --- | --- | --- |
| [`assets/docs_issues_training_multilabel.jsonl`](assets/docs_issues_training_multilabel.jsonl) | Local | Demo training data |
| [`assets/docs_issues_eval_multilabel.jsonl`](assets/docs_issues_eval_multilabel.jsonl) | Local | Demo development data |

<!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@
# This is needed to ensure that text-based assets included with project
# templates and cloned via Git end up with consistent line endings and
# the same checksums. It will prevent Git from converting line endings.
# Otherwise, a user cloning assets on Windows may end up with a different
# checksum due to different line endings.
* -text
Large diffs are not rendered by default.
Oops, something went wrong.
500 changes: 500 additions & 0 deletions
500
pipelines/textcat_demo/assets/docs_issues_eval_multilabel.jsonl
Large diffs are not rendered by default.
Oops, something went wrong.
661 changes: 661 additions & 0 deletions
661
pipelines/textcat_demo/assets/docs_issues_training.jsonl
Large diffs are not rendered by default.
Oops, something went wrong.
661 changes: 661 additions & 0 deletions
661
pipelines/textcat_demo/assets/docs_issues_training_multilabel.jsonl
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.textcat]
factory = "textcat"
threshold = 0.5

[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = true
ngram_size = 1
no_output_layer = false
nO = null

[components.textcat.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.textcat.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000,2000,1000,1000,1000,1000]
attrs = ["ORTH","LOWER","PREFIX","SUFFIX","SHAPE","ID"]
include_static_vectors = false

[components.textcat.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 64
window_size = 1
maxout_pieces = 3
depth = 2

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1000
max_epochs = 0
max_steps = 1000
eval_frequency = 100
frozen_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null
cats_score = 1.0

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
seed = 0
gpu_allocator = null

[nlp]
lang = "en"
pipeline = ["textcat_multilabel"]
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 1000
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.textcat_multilabel]
factory = "textcat_multilabel"
threshold = 0.5

[components.textcat_multilabel.model]
@architectures = "spacy.TextCatEnsemble.v2"
nO = null

[components.textcat_multilabel.model.linear_model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
nO = null

[components.textcat_multilabel.model.tok2vec]
@architectures = "spacy.Tok2Vec.v2"

[components.textcat_multilabel.model.tok2vec.embed]
@architectures = "spacy.MultiHashEmbed.v1"
width = 64
rows = [2000,2000,1000,1000,1000,1000]
attrs = ["ORTH","LOWER","PREFIX","SUFFIX","SHAPE","ID"]
include_static_vectors = false

[components.textcat_multilabel.model.tok2vec.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 64
window_size = 1
maxout_pieces = 3
depth = 2

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
gold_preproc = false
max_length = 0
limit = 0
augmenter = null

[training]
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1000
max_epochs = 0
max_steps = 1000
eval_frequency = 100
frozen_components = []
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
cats_score_desc = null
cats_micro_p = null
cats_micro_r = null
cats_micro_f = null
cats_macro_p = null
cats_macro_r = null
cats_macro_f = null
cats_macro_auc = null
cats_f_per_type = null
cats_macro_auc_per_type = null
cats_score = 1.0

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
Oops, something went wrong.