Merge pull request #198 from ku-nlp/dev

v2.2.0
ku-nlp · Oct 28, 2023 · fe79a59 · fe79a59
2 parents 53594c7 + d5819fc
commit fe79a59
Show file tree

Hide file tree

Showing 162 changed files with 4,833 additions and 5,799 deletions.
diff --git a/.flake8 b/.flake8
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -0,0 +1,15 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
+
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    # Workflow files stored in the
+    # default location of `.github/workflows`
+    directory: "/"
+    schedule:
+      interval: "monthly"
+      timezone: "Asia/Tokyo"
+    target-branch: "main"
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -44,13 +44,13 @@ jobs:
         run: |
           kwja --version
           kwja --help
-          kwja --tasks typo,senter,char,word --model-size tiny --text "自然言語処理"
-          kwja --tasks typo,senter,seq2seq,word --model-size tiny --text "自然言語処理"
+          kwja --tasks typo,char,word --model-size tiny --text "自然言語処理"
+          kwja --tasks typo,char,seq2seq,word --model-size tiny --text "自然言語処理"
       - name: Run KWJA (Windows)
         if: ${{ matrix.os == 'windows-latest' }}
         run: |
           $env:PYTHONUTF8 = "1"
           kwja --version
           kwja --help
-          kwja --tasks typo,senter,char,word --model-size tiny --text "自然言語処理"
-          kwja --tasks typo,senter,seq2seq,word --model-size tiny --text "自然言語処理"
+          kwja --tasks typo,char,word --model-size tiny --text "自然言語処理"
+          kwja --tasks typo,char,seq2seq,word --model-size tiny --text "自然言語処理"
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -7,32 +7,41 @@ repos:
       - id: end-of-file-fixer
       - id: trailing-whitespace
       - id: check-yaml
-  - repo: https://github.com/psf/black
-    rev: 23.3.0
+      - id: check-toml
+  - repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.9.0
     hooks:
       - id: black
   - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 6.1.0
     hooks:
       - id: flake8
+        additional_dependencies: [Flake8-pyproject]
   - repo: https://github.com/PyCQA/isort
     rev: 5.12.0
     hooks:
       - id: isort
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.3.0
+    rev: v1.5.1
     hooks:
       - id: mypy
         additional_dependencies:
-          - rhoknp==1.3.1
+          - rhoknp==1.5.0
           - hydra-core==1.3.2
           - torch==2.0.0
-          - torchmetrics==0.11.4
-          - transformers==4.29.2
+          - torchmetrics==1.1.0
+          - transformers==4.32.1
           - tokenizers==0.13.3
-          - wandb==0.15.4
+          - wandb==0.15.9
           - typer==0.9.0
-          - types-PyYAML==6.0.12.9
+          - types-PyYAML==6.0.12.11
+          - git+https://github.com/nobu-g/[email protected]
+  - repo: https://github.com/asottile/pyupgrade
+    rev: v3.10.1
+    hooks:
+      - id: pyupgrade
+        args:
+          - --py38-plus
   - repo: https://github.com/jumanjihouse/pre-commit-hooks
     rev: 3.0.0
     hooks:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [v2.2.0] - 2023-10-27
+### Added
+- Support `jumanpp` and `knp` input formats. This functionality allows you to partly use tokenization results of `jumanpp` as input.
+  ```shell
+  kwja --tasks word --text "$(echo "外国人参政権" | jumanpp)" --input-format jumanpp
+  kwja --tasks word --filename <(echo "外国人参政権" | jumanpp) --input-format jumanpp
+
+  kwja --tasks word --text "$(echo "外国人参政権" | jumanpp | knp -tab)" --input-format knp
+  kwja --tasks word --filename <(echo "外国人参政権" | jumanpp | knp -tab) --input-format knp
+  ```
+- Analyze `デ`, `ト`, and `時間` cases in addition to `ガ`, `ヲ`, `ニ`, and `ガ２` cases in predicate-argument structure analysis.
+
+### Changed
+- Merge the senter module into the char module.
+
 ## [v2.1.3] - 2023-08-28
 ### Fixed
 - Version specification of `rhoknp` in `pyproject.toml`.
@@ -175,7 +190,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Removed
 - Remove an unnecessary dependency, `fugashi`.
 
-[Unreleased]: https://github.com/ku-nlp/kwja/compare/v2.1.3...HEAD
+[Unreleased]: https://github.com/ku-nlp/kwja/compare/v2.2.0...HEAD
+[2.2.0]: https://github.com/ku-nlp/kwja/compare/v2.1.3...v2.2.0
 [2.1.3]: https://github.com/ku-nlp/kwja/compare/v2.1.2...v2.1.3
 [2.1.2]: https://github.com/ku-nlp/kwja/compare/v2.1.1...v2.1.2
 [2.1.1]: https://github.com/ku-nlp/kwja/compare/v2.1.0...v2.1.1

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -144,26 +144,36 @@ poetry run pytest
 
 ## Releasing a new version
 
-- Checkout `main` branch
+- Check out the `dev` branch
 - Make sure the new version is supported in `_get_model_version` function in `src/kwja/cli/utils.py`
 - Update `CHANGELOG.md`
 - Edit `pyproject.toml` to update `tool.poetry.version`
-- Update dependencies
+- Update dependencies (edit `pyproject.toml` if necessary)
 
     ```shell
     poetry update
     ```
-
+- Push changes to the `dev` branch and create a pull request to the `main` branch
+- If CI passes, merge the pull request
+- Check out the `main` branch and pull changes
 - Add a new tag and push changes
 
     ```shell
     git tag -a v0.1.0 -m "Release v0.1.0"
     git push --follow-tags
     ```
 
-- If CI is passed, publish to PyPI
+- Publish to PyPI
 
     ```shell
     poetry build
     poetry publish [--username $PYPI_USERNAME] [--password $PYPI_PASSWORD]
     ```
+
+- Rebase the `dev` branch onto the `main` branch
+
+    ```shell
+    git checkout dev
+    git rebase main
+    git push
+    ```
diff --git a/README.md b/README.md
@@ -178,7 +178,6 @@ word_batch_size: 1
 
 - typo, senter, character, and word modules
   - The performance on each task except typo correction and discourse relation analysis is the mean over all the corpora (KC, KWDLC, Fuman, and WAC) and over three runs with different random seeds.
-    - \* denotes results of a single run (TBU)
   - We set the learning rate of RoBERTa<sub>LARGE</sub> (word) to 2e-5 because we failed to fine-tune it with a higher learning rate.
     Other hyperparameters are the same as those described in the configs, which are tuned for DeBERTa<sub>BASE</sub>.
 - seq2seq module
@@ -229,66 +228,66 @@ word_batch_size: 1
     <tr>
       <th colspan="2">Typo Correction</th>
       <td>79.0</td>
-      <td>76.7*</td>
+      <td>76.7</td>
       <td>80.8</td>
       <td>83.1</td>
     </tr>
     <tr>
       <th colspan="2">Sentence Segmentation</th>
       <td>-</td>
-      <td>98.2</td>
+      <td>98.4</td>
       <td>-</td>
-      <td>98.2</td>
+      <td>98.6</td>
     </tr>
     <tr>
       <th colspan="2">Word Segmentation</th>
       <td>98.5</td>
-      <td>98.6 / 98.2*</td>
+      <td>98.1 / 98.2*</td>
       <td>98.7</td>
-      <td>98.9 / 98.4*</td>
+      <td>98.4 / 98.4*</td>
     </tr>
     <tr>
       <th colspan="2">Word Normalization</th>
       <td>44.0</td>
-      <td>39.2</td>
+      <td>15.3</td>
       <td>39.8</td>
-      <td>46.0</td>
+      <td>48.6</td>
     </tr>
     <tr>
       <th rowspan="7">Morphological Analysis</th>
       <th>POS</th>
       <td>99.3</td>
       <td>99.4</td>
       <td>99.3</td>
-      <td>99.5</td>
+      <td>99.4</td>
     </tr>
     <tr>
       <th>sub-POS</th>
       <td>98.1</td>
       <td>98.5</td>
       <td>98.2</td>
-      <td>98.6</td>
+      <td>98.5</td>
     </tr>
     <tr>
       <th>conjtype</th>
       <td>99.4</td>
-      <td>99.5</td>
+      <td>99.6</td>
       <td>99.2</td>
       <td>99.6</td>
     </tr>
     <tr>
       <th>conjform</th>
       <td>99.5</td>
-      <td>99.6</td>
+      <td>99.7</td>
       <td>99.4</td>
       <td>99.7</td>
     </tr>
     <tr>
       <th>reading</th>
       <td>95.5</td>
-      <td>95.2 / 96.2*</td>
+      <td>95.4 / 96.2*</td>
       <td>90.8</td>
-      <td>95.5 / 96.8*</td>
+      <td>95.6 / 96.8*</td>
     </tr>
     <tr>
       <th>lemma</th>
@@ -307,9 +306,9 @@ word_batch_size: 1
     <tr>
       <th colspan="2">Named Entity Recognition</th>
       <td>83.0</td>
-      <td>84.0</td>
+      <td>84.6</td>
       <td>82.1</td>
-      <td>83.9</td>
+      <td>85.9</td>
     </tr>
     <tr>
       <th rowspan="2">Linguistic Feature Tagging</th>
@@ -322,44 +321,44 @@ word_batch_size: 1
     <tr>
       <th>base phrase</th>
       <td>86.6</td>
-      <td>91.4</td>
+      <td>93.6</td>
       <td>86.4</td>
-      <td>92.6</td>
+      <td>93.4</td>
     </tr>
     <tr>
       <th colspan="2">Dependency Parsing</th>
       <td>92.9</td>
-      <td>93.6</td>
+      <td>93.5</td>
       <td>93.8</td>
-      <td>93.7</td>
+      <td>93.6</td>
     </tr>
     <tr>
       <th colspan="2">Pas Analysis</th>
       <td>74.2</td>
-      <td>77.9</td>
+      <td>76.9</td>
       <td>75.3</td>
-      <td>78.6</td>
+      <td>77.5</td>
     </tr>
     <tr>
       <th colspan="2">Bridging Reference Resolution</th>
       <td>66.5</td>
-      <td>68.7</td>
+      <td>67.3</td>
       <td>65.2</td>
-      <td>68.4</td>
+      <td>67.5</td>
     </tr>
     <tr>
       <th colspan="2">Coreference Resolution</th>
       <td>74.9</td>
-      <td>78.3</td>
+      <td>78.6</td>
       <td>75.9</td>
-      <td>79.6</td>
+      <td>79.2</td>
     </tr>
     <tr>
       <th colspan="2">Discourse Relation Analysis</th>
       <td>42.2</td>
-      <td>40.6</td>
+      <td>39.2</td>
       <td>41.3</td>
-      <td>45.5</td>
+      <td>44.3</td>
     </tr>
   </tbody>
 </table>

diff --git a/configs/base_template.yaml b/configs/base_template.yaml
@@ -34,8 +34,8 @@ hydra:
       TOKENIZERS_PARALLELISM: false
     config:
       override_dirname:
-        kv_sep: '-'
-        item_sep: ','
+        kv_sep: '='
+        item_sep: '-'
         exclude_keys:
           - seed
           - name

diff --git a/configs/callbacks/char_module_writer.yaml b/configs/callbacks/char_module_writer.yaml
@@ -1,3 +1,3 @@
 prediction_writer:
-  _target_: kwja.callbacks.char_module_writer.CharModuleWriter
+  _target_: kwja.callbacks.CharModuleWriter
   destination: ${run_dir}/char_prediction.juman
diff --git a/configs/callbacks/senter_module_writer.yaml b/configs/callbacks/senter_module_writer.yaml
diff --git a/configs/callbacks/seq2seq_module_writer.yaml b/configs/callbacks/seq2seq_module_writer.yaml
@@ -1,5 +1,5 @@
 prediction_writer:
-  _target_: kwja.callbacks.seq2seq_module_writer.Seq2SeqModuleWriter
+  _target_: kwja.callbacks.Seq2SeqModuleWriter
   destination: ${run_dir}/seq2seq_prediction.txt
   tokenizer:
     _target_: transformers.AutoTokenizer.from_pretrained

diff --git a/configs/callbacks/typo_module_writer.yaml b/configs/callbacks/typo_module_writer.yaml
@@ -1,5 +1,5 @@
 prediction_writer:
-  _target_: kwja.callbacks.typo_module_writer.TypoModuleWriter
+  _target_: kwja.callbacks.TypoModuleWriter
   confidence_threshold: ${confidence_threshold}
   destination: ${run_dir}/typo_prediction.txt
   tokenizer:

diff --git a/configs/callbacks/word_module_writer.yaml b/configs/callbacks/word_module_writer.yaml
@@ -1,5 +1,5 @@
 prediction_writer:
-  _target_: kwja.callbacks.word_module_writer.WordModuleWriter
+  _target_: kwja.callbacks.WordModuleWriter
   ambig_surf_specs:
     - conjtype: "イ形容詞アウオ段"
       conjform: "エ基本形"