Update benchmark projects (explosion#35)

* remove init_config and adjust init vectors command
* add variants file and spacy-lookups-data dependency
* have cnn_glove_small as default
* add gitignore
* automatically update readme
* remove links for local files
* similar edits to parsing benchmark project
rojaAchary · Feb 8, 2021 · 5fc176d · 5fc176d
1 parent 093e4ba
commit 5fc176d
Show file tree

Hide file tree

Showing 7 changed files with 36 additions and 16 deletions.
diff --git a/benchmarks/ner_conll03/.gitignore b/benchmarks/ner_conll03/.gitignore
@@ -0,0 +1,4 @@
+assets
+corpus
+metrics
+training
diff --git a/benchmarks/ner_conll03/README.md b/benchmarks/ner_conll03/README.md
@@ -16,7 +16,7 @@ Commands are only re-run if their inputs have changed.
 
 | Command | Description |
 | --- | --- |
-| `init-config` | Generate default config |
+| `install` | Install dependencies |
 | `corpus` | Convert the data to spaCy's format |
 | `vectors` | Convert, truncate and prune the vectors. |
 | `train` | Train the full pipeline |
@@ -32,7 +32,7 @@ inputs have changed.
 
 | Workflow | Steps |
 | --- | --- |
-| `all` | `vectors` &rarr; `corpus` &rarr; `train` &rarr; `evaluate` |
+| `all` | `install` &rarr; `vectors` &rarr; `corpus` &rarr; `train` &rarr; `evaluate` |
 
 ### 🗂 Assets
 
@@ -46,5 +46,6 @@ in the project directory.
 | `assets/conll2003/dev.iob` | Local | Development data (not available publicly so you have to add the file yourself) |
 | `assets/conll2003/test.iob` | Local | Test data (not available publicly so you have to add the file yourself) |
 | `assets/conll2003/train.iob` | Local | Training data (not available publicly so you have to add the file yourself) |
+| `assets/orth_variants.json` | URL | A file containing orth variants for data augmentation |
 
 <!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
diff --git a/benchmarks/ner_conll03/project.yml b/benchmarks/ner_conll03/project.yml
@@ -1,7 +1,7 @@
 title: "Named Entity Recognition (CoNLL-2003)"
 
 vars:
-  config: "cnn_glove_small"
+  config: "cnn_glove_small"  # either "transformer" or "cnn_glove_small"
   gpu: -1
 
 # These are the directories that the project needs. The project CLI will make
@@ -18,19 +18,25 @@ assets:
     description: "Test data (not available publicly so you have to add the file yourself)"
   - dest: "assets/conll2003/train.iob"
     description: "Training data (not available publicly so you have to add the file yourself)"
+  - url: "https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_orth_variants.json"
+    dest: "assets/orth_variants.json"
+    description: "A file containing orth variants for data augmentation"
 
 workflows:
   all:
+    - install
     - vectors
     - corpus
     - train
     - evaluate
 
 commands:
-  - name: init-config
-    help: "Generate default config"
+  - name: install
+    help: "Install dependencies"
     script:
-      - "python -m spacy init config --lang en --pipeline tagger,parser --optimize efficiency -C defaults.cfg"
+      - "python -m pip install -r requirements.txt"
+    deps:
+      - "requirements.txt"
 
   - name: corpus
     help: "Convert the data to spaCy's format"
@@ -50,7 +56,7 @@ commands:
   - name: vectors
     help: "Convert, truncate and prune the vectors."
     script:
-      - "python -m spacy init vocab -v assets/vectors.zip en corpus/en_vectors -vn en_glove840b_vectors_md"
+      - "python -m spacy init vectors en assets/vectors.zip corpus/en_vectors -n en_glove840b_vectors_md"
     deps:
       - "assets/vectors.zip"
     outputs:

diff --git a/benchmarks/ner_conll03/requirements.txt b/benchmarks/ner_conll03/requirements.txt
@@ -0,0 +1 @@
+spacy-lookups-data>=1.0.0,<1.1.0
diff --git a/benchmarks/parsing_penn_treebank/README.md b/benchmarks/parsing_penn_treebank/README.md
@@ -16,7 +16,7 @@ Commands are only re-run if their inputs have changed.
 
 | Command | Description |
 | --- | --- |
-| `init-config` | Generate default config |
+| `install` | Install dependencies |
 | `corpus` | Convert the data to spaCy's format |
 | `vectors` | Convert, truncate and prune the vectors. |
 | `train` | Train the full pipeline |
@@ -32,7 +32,7 @@ inputs have changed.
 
 | Workflow | Steps |
 | --- | --- |
-| `all` | `vectors` &rarr; `corpus` &rarr; `train` &rarr; `evaluate` |
+| `all` | `install` &rarr; `vectors` &rarr; `corpus` &rarr; `train` &rarr; `evaluate` |
 
 ### 🗂 Assets
 
@@ -46,5 +46,6 @@ in the project directory.
 | `assets/PTB_SD_3_3_0/dev.gold.conll` | Local | Development data (not available publicly so you have to add the file yourself) |
 | `assets/PTB_SD_3_3_0/test.gold.conll` | Local | Test data (not available publicly so you have to add the file yourself) |
 | `assets/vectors.zip` | URL | GloVe vectors |
+| `assets/orth_variants.json` | URL | A file containing orth variants for data augmentation |
 
 <!-- SPACY PROJECT: AUTO-GENERATED DOCS END (do not remove) -->
diff --git a/benchmarks/parsing_penn_treebank/project.yml b/benchmarks/parsing_penn_treebank/project.yml
@@ -1,7 +1,7 @@
 title: "Dependency Parsing (Penn Treebank)"
 
 vars:
-  name: "cnn_glove_small"
+  name: "cnn_glove_small"   # either "transformer" or "cnn_glove_small"
   gpu: 0
 
 # These are the directories that the project needs. The project CLI will make
@@ -18,19 +18,25 @@ assets:
   - url: "http://nlp.stanford.edu/data/glove.840B.300d.zip"
     dest: "assets/vectors.zip"
     description: "GloVe vectors"
+  - url: "https://raw.githubusercontent.com/explosion/spacy-lookups-data/master/spacy_lookups_data/data/en_orth_variants.json"
+    dest: "assets/orth_variants.json"
+    description: "A file containing orth variants for data augmentation"
 
 workflows:
   all:
+    - install
     - vectors
     - corpus
     - train
     - evaluate
 
 commands:
-  - name: init-config
-    help: "Generate default config"
+  - name: install
+    help: "Install dependencies"
     script:
-      - "python -m spacy init config --lang en --pipeline tagger,parser --optimize efficiency -C defaults.cfg"
+      - "python -m pip install -r requirements.txt"
+    deps:
+      - "requirements.txt"
 
   - name: corpus
     help: "Convert the data to spaCy's format"
@@ -54,7 +60,7 @@ commands:
   - name: vectors
     help: "Convert, truncate and prune the vectors."
     script:
-      - "python -m spacy init vocab -v assets/vectors.zip en corpus/en_vectors -vn en_glove840b_vectors_md"
+      - "python -m spacy init vectors en assets/vectors.zip corpus/en_vectors -n en_glove840b_vectors_md"
     deps:
       - "assets/vectors.zip"
     outputs:
@@ -75,10 +81,10 @@ commands:
   - name: evaluate
     help: "Evaluate on the test data and save the metrics"
     script:
-      - "python -m spacy evaluate ./training/${vars.name}/model-best ./corpus/test.fixed.spacy --output ./metrics/${vars.name}.json --gpu-id ${vars.gpu} --gold-preproc"
+      - "python -m spacy evaluate ./training/${vars.name}/model-best ./corpus/test.spacy --output ./metrics/${vars.name}.json --gpu-id ${vars.gpu} --gold-preproc"
     deps:
       - "training/${vars.name}/model-best"
-      - "corpus/test.fixed.spacy"
+      - "corpus/test.spacy"
     outputs:
       - "metrics/${vars.name}.json"
 

diff --git a/benchmarks/parsing_penn_treebank/requirements.txt b/benchmarks/parsing_penn_treebank/requirements.txt
@@ -0,0 +1 @@
+spacy-lookups-data>=1.0.0,<1.1.0