move all corpora and resources to tests/files/
sir-kokabi committed Jul 7, 2023
1 parent a68c5ea commit 1a0262e
Showing 59 changed files with 221 additions and 2,583 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -32,9 +32,9 @@ jobs:
          poetry lock
          poetry install --with dev
-      - name: Download resources
+      - name: Download test files
        run: |
-          git clone https://github.com/sir-kokabi/resources.git resources
+          git clone https://github.com/sir-kokabi/resources.git tests/files/
      - name: Run tests
        run: poetry run poe test
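
With the test files now cloned into tests/files/ by CI, tests can resolve corpora and models relative to the test directory rather than the project root. A minimal sketch of such a lookup helper, assuming a pytest-style layout — the helper name, file layout, and error message are illustrative, not code from this repository:

# hypothetical tests/conftest.py helper — resolves files cloned into tests/files/ by the CI step above
from pathlib import Path

FILES_DIR = Path(__file__).parent / "files"

def resource_path(name: str) -> str:
    """Return the path of a corpus or model under tests/files/, failing loudly if it is missing."""
    path = FILES_DIR / name
    if not path.exists():
        raise FileNotFoundError(
            f"{path} not found; clone https://github.com/sir-kokabi/resources.git into tests/files/ first."
        )
    return str(path)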
6 changes: 5 additions & 1 deletion .gitignore
@@ -159,4 +159,8 @@ sample.py

# ipynb files

-ss.ipynb
+ss.ipynb
+
+.ruff_cache
+
+tests/files/
25 changes: 11 additions & 14 deletions README.md
@@ -42,19 +42,16 @@ To install the latest version of Hazm, run the following command in your terminal:

Alternatively, you can install the latest update from GitHub (this version may be unstable and buggy):

    pip install git+https://github.com/roshan-research/hazm.git

-then **download [resources.zip (~4 MB)](https://github.com/sir-kokabi/resources/releases/download/0.9.0/resources.zip)** and extract it to a to a folder named `resources` in the root of your project.
-
-Finally if you do not want to train and use your own model, you can download our pre-trained models:
-
-| **Module name** | **Size** |
-|:------------------------ |:-------- |
-| [**Download WordEmbedding**](https://mega.nz/file/GqZUlbpS#XRYP5FHbPK2LnLZ8IExrhrw3ZQ-jclNSVCz59uEhrxY) | ~ 5 GB |
-| [**Download SentEmbedding**](https://mega.nz/file/WzR0QChY#J1nG-HGq0UJP69VMY8I1YGl_MfEAFCo5iizpjofA4OY) | ~ 1 GB |
-| [**Download DependencyParser**](https://drive.google.com/file/d/1Ww3xsZC5BXY5eN8-2TWo40G-WvppkXYD/view?usp=drive_link) | ~ 60 MB |
-| [**Download POSTagger**](https://drive.google.com/file/d/1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3) | ~ 18 MB |
-| [**Download Chunker**](https://drive.google.com/file/d/16hlAb_h7xdlxF4Ukhqk_fOV3g7rItVtk) | ~ 4 MB |
+Finally if you do not want to train and use your own model, you can download our pre-trained models:
+
+| **Module name** | **Size** |
+| :--------------------------------------------------------------------------------------------------------------------- | :------- |
+| [**Download WordEmbedding**](https://mega.nz/file/GqZUlbpS#XRYP5FHbPK2LnLZ8IExrhrw3ZQ-jclNSVCz59uEhrxY) | ~ 5 GB |
+| [**Download SentEmbedding**](https://mega.nz/file/WzR0QChY#J1nG-HGq0UJP69VMY8I1YGl_MfEAFCo5iizpjofA4OY) | ~ 1 GB |
+| [**Download DependencyParser**](https://drive.google.com/file/d/1Ww3xsZC5BXY5eN8-2TWo40G-WvppkXYD/view?usp=drive_link) | ~ 60 MB |
+| [**Download POSTagger**](https://drive.google.com/file/d/1Q3JK4NVUC2t5QT63aDiVrCRBV225E_B3) | ~ 18 MB |
+| [**Download Chunker**](https://drive.google.com/file/d/16hlAb_h7xdlxF4Ukhqk_fOV3g7rItVtk) | ~ 4 MB |

## Usage

@@ -77,16 +74,16 @@ Finally if you do not want to train and use your own model, you can download our
>>> lemmatizer.lemmatize('می‌روم')
'رفت#رو'

->>> tagger = POSTagger(model='resources/pos_tagger.model')
+>>> tagger = POSTagger(model='pos_tagger.model')
>>> tagger.tag(word_tokenize('ما بسیار کتاب می‌خوانیم'))
[('ما', 'PRO'), ('بسیار', 'ADV'), ('کتاب', 'N'), ('می‌خوانیم', 'V')]

->>> chunker = Chunker(model='resources/chunker.model')
+>>> chunker = Chunker(model='chunker.model')
>>> tagged = tagger.tag(word_tokenize('کتاب خواندن را دوست داریم'))
>>> tree2brackets(chunker.parse(tagged))
'[کتاب خواندن NP] [را POSTP] [دوست داریم VP]'

->>> word_embedding = WordEmbedding(model_type = 'fasttext', model_path = 'resources/word2vec.bin')
+>>> word_embedding = WordEmbedding(model_type = 'fasttext', model_path = 'word2vec.bin')
>>> word_embedding.doesnt_match(['سلام' ,'درود' ,'خداحافظ' ,'پنجره'])
'پنجره'
>>> word_embedding.doesnt_match(['ساعت' ,'پلنگ' ,'شیر'])
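
Note that the updated usage examples pass bare file names (pos_tagger.model, chunker.model, word2vec.bin) instead of resources/-prefixed paths, so the caller decides where the downloaded models live. A minimal sketch under that assumption — the download location and the pathlib resolution are illustrative, not part of the README:

# sketch: resolve the downloaded model relative to this script so the bare
# file name works regardless of the current working directory (assumed layout)
from pathlib import Path
from hazm import POSTagger, word_tokenize

model_path = Path(__file__).parent / 'pos_tagger.model'  # wherever you extracted the download
tagger = POSTagger(model=str(model_path))
print(tagger.tag(word_tokenize('ما بسیار کتاب می‌خوانیم')))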
10 changes: 0 additions & 10 deletions corpora/bijankhan.txt

This file was deleted.

13 changes: 0 additions & 13 deletions corpora/dadegan.conll

This file was deleted.

26 changes: 0 additions & 26 deletions corpora/dadegan.conllu

This file was deleted.

90 changes: 0 additions & 90 deletions corpora/degarbayan/corpus_pair.xml

This file was deleted.

26 changes: 0 additions & 26 deletions corpora/hamshahri/1996/ham2_960623.xml

This file was deleted.
