Commit 7c7a770: migrate to python 3
sir-kokabi committed Mar 19, 2023 (1 parent: dd267fb)
Showing 35 changed files with 547 additions and 722 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-versions: [ '3.7', '3.8', '3.9', '3.10' ]
python-versions: [ '3.9', '3.10', '3.11.2' ]
#python-versions: [ '2.7', '3.7', '3.8', '3.9', '3.10', '3.11' ]

steps:
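The matrix above now starts at CPython 3.9, which lines up with the built-in generic annotations (for example `list[tuple[str, str]]`) added later in this commit; those require Python 3.9 (PEP 585). A hedged sketch of a runtime guard a package could add to make that floor explicit; this is hypothetical and not part of the commit:

```python
import sys

# Hypothetical guard, not part of this commit: fail fast on interpreters
# older than what the CI matrix now tests against.
if sys.version_info < (3, 9):
    raise RuntimeError("hazm requires Python 3.9 or newer")
```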
2 changes: 1 addition & 1 deletion LICENSE
@@ -14,7 +14,7 @@ copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
70 changes: 32 additions & 38 deletions data.py
@@ -1,8 +1,4 @@
# coding: utf-8

from __future__ import print_function, unicode_literals
import codecs, subprocess, random
import multiprocessing
import subprocess, random
from collections import Counter
from itertools import islice
from nltk.tag import untag
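A minimal sketch of what the dropped imports relied on: in Python 3, `print()` and unicode string literals are the default, so the `__future__` import is a no-op, and the built-in `open()` accepts `encoding=`, which is all `codecs.open()` was providing here. The path below is the default from `create_words_file` and assumes the repository's `resources/` directory is present:

```python
import codecs

# Both calls open the dictionary in text mode and yield str lines, so the
# built-in open() is a drop-in replacement for codecs.open() on Python 3.
path = "resources/persian.dic"  # default argument of create_words_file()
with codecs.open(path, encoding="utf-8") as legacy, open(path, encoding="utf-8") as modern:
    assert legacy.read() == modern.read()
```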
@@ -19,16 +15,14 @@ def create_words_file(dic_file="resources/persian.dic", output="hazm/data/words.

dic_words = [
line.strip().replace(", ", ",").split("\t")
for line in codecs.open(dic_file, encoding="utf-8")
for line in open(dic_file, encoding="utf-8")
if len(line.strip().split("\t")) == 3
]
dic_words = filter(
lambda item: not item[2].startswith("V") and "NEG" not in item[2], dic_words
)
dic_words = [item for item in dic_words if not item[2].startswith("V") and "NEG" not in item[2]]
dic_words = [
"\t".join(item) for item in sorted(dic_words, key=lambda item: item[0])
]
print(*dic_words, sep="\n", file=codecs.open(output, "w", "utf-8"))
print(*dic_words, sep="\n", file=open(output, "w", "utf-8"))
print(output, "created")
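On the `filter(...)` call that became a comprehension above: in Python 3, `filter()` returns a lazy, single-use iterator rather than a list. A single `sorted(...)` pass over it still works, but the comprehension materialises a plain, reusable list and reads more directly. A minimal sketch of the difference, using made-up dictionary rows:

```python
rows = [["آب", "1", "N"], ["رفت", "1", "V1"]]  # made-up rows with three tab fields

lazy = filter(lambda item: not item[2].startswith("V"), rows)
assert list(lazy) == [["آب", "1", "N"]]
assert list(lazy) == []  # a second pass sees nothing: the iterator is spent

kept = [item for item in rows if not item[2].startswith("V")]
assert kept == [["آب", "1", "N"]]  # a plain list that can be re-read freely
```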


@@ -38,7 +32,7 @@ def evaluate_lemmatizer(
lemmatizer = Lemmatizer()

errors = []
with codecs.open("resources/lemmatizer_errors.txt", "w", "utf8") as output:
with open("resources/lemmatizer_errors.txt", "w", "utf8") as output:
dadegan = DadeganReader(conll_file)
for tree in dadegan.trees():
for node in tree.nodelist[1:]:
@@ -47,11 +41,11 @@ def evaluate_lemmatizer(
errors.append((word, lemma, pos, lemmatizer.lemmatize(word, pos)))
print(len(errors), "errors", file=output)
counter = Counter(errors)
for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
for item, count in sorted(list(counter.items()), key=lambda t: t[1], reverse=True):
print(count, *item, file=output)

missed = []
with codecs.open("resources/lemmatizer_missed.txt", "w", "utf8") as output:
with open("resources/lemmatizer_missed.txt", "w", "utf8") as output:
peykare = PeykareReader(peykare_root)
for sentence in peykare.sents():
for word in sentence:
@@ -60,7 +54,7 @@ def evaluate_lemmatizer(
missed.append(word[0])
print(len(missed), "missed", file=output)
counter = Counter(missed)
for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
for item, count in sorted(list(counter.items()), key=lambda t: t[1], reverse=True):
print(count, item, file=output)
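On the `sorted(list(counter.items()), ...)` change above: in Python 3, `dict.items()` returns an iterable view that `sorted()` accepts directly, so the extra `list(...)` is harmless but not required; `Counter.most_common()` spells the same "highest count first" ordering. A small sketch with made-up words:

```python
from collections import Counter

counter = Counter(["رفت", "رفت", "گفت"])  # made-up lemmatizer misses
by_count = sorted(counter.items(), key=lambda t: t[1], reverse=True)
assert by_count == sorted(list(counter.items()), key=lambda t: t[1], reverse=True)
assert by_count == counter.most_common()  # identical ordering
```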


@@ -81,7 +75,7 @@ def evaluate_normalizer(tnews_root="corpora/tnews"):
affix_spacing=False,
)

with codecs.open("resources/normalized.txt", "w", "utf8") as output1, codecs.open(
with open("resources/normalized.txt", "w", "utf8") as output1, open(
"resources/normalized_token_based.txt", "w", "utf8"
) as output2:
random.seed(0)
@@ -99,7 +93,7 @@ def evaluate_informal_normalizer(sentipars_root="corpora/sentipers"):
normalizer = Normalizer()
informal_normalizer = InformalNormalizer()

output = codecs.open("resources/normalized.txt", "w", "utf8")
output = open("resources/normalized.txt", "w", "utf8")
for comments in sentipers.comments():
for comment in comments:
for sentence in comment:
@@ -120,7 +114,7 @@ def evaluate_chunker(treebank_root="corpora/treebank"):

print(chunker.evaluate(chunked_trees))

output = codecs.open("resources/chunker_errors.txt", "w", "utf8")
output = open("resources/chunker_errors.txt", "w", "utf8")
for sentence, gold in zip(treebank.sents(), chunked_trees):
chunked = chunker.parse(sentence)
if chunked != gold:
@@ -155,14 +149,14 @@ def train_postagger(
'*:s1=%m[0,0,".?$"]',
'*:s2=%m[0,0,".?.?$"]',
'*:s3=%m[0,0,".?.?.?$"]',
'*:p?l=%t[-1,0,"\p"]',
'*:p?=%t[0,0,"\p"]',
'*:p?r=%t[1,0,"\p"]',
'*:p?a=%t[0,0,"^\p*$"]',
'*:n?l=%t[-1,0,"\d"]',
'*:n?=%t[0,0,"\d"]',
'*:n?r=%t[1,0,"\d"]',
'*:n?a=%t[0,0,"^\d*$"]',
r'*:p?l=%t[-1,0,"\p"]',
r'*:p?=%t[0,0,"\p"]',
r'*:p?r=%t[1,0,"\p"]',
r'*:p?a=%t[0,0,"^\p*$"]',
r'*:n?l=%t[-1,0,"\d"]',
r'*:n?=%t[0,0,"\d"]',
r'*:n?r=%t[1,0,"\d"]',
r'*:n?a=%t[0,0,"^\d*$"]',
],
)
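On the `r'...'` prefixes added to the tagger feature templates above: `\p` and `\d` are not valid Python escape sequences, so since Python 3.6 a plain literal such as `'\p'` triggers a DeprecationWarning (a SyntaxWarning in newer releases) even though the backslash is preserved. A raw string passes the backslash through to the template engine untouched and warning-free. A minimal sketch:

```python
plain = "\\p"  # explicit escape: two characters, backslash + p
raw = r"\p"    # raw string: the same two characters, no escape processing
assert plain == raw and len(raw) == 2
```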

@@ -204,7 +198,7 @@ def train_chunker(
)

def retag_trees(trees, sents):
for tree, sentence in zip(trees, tagger.tag_sents(map(untag, sents))):
for tree, sentence in zip(trees, tagger.tag_sents(list(map(untag, sents)))):
for n, word in zip(tree.treepositions("leaves"), sentence):
tree[n] = word

@@ -234,9 +228,9 @@ def train_maltparser(

train, test = DadeganReader(train_file), DadeganReader(test_file)
train_data = train_file + ".data"
with codecs.open(train_data, "w", "utf8") as output:
with open(train_data, "w", "utf8") as output:
for tree, sentence in zip(
train.trees(), tagger.tag_sents(map(untag, train.sents()))
train.trees(), tagger.tag_sents(list(map(untag, train.sents())))
):
for i, (node, word) in enumerate(
zip(list(tree.nodes.values())[1:], sentence), start=1
@@ -283,16 +277,16 @@ def train_maltparser(

# evaluation
parser = MaltParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file)
parsed_trees = parser.parse_sents(map(untag, test.sents()))
parsed_trees = parser.parse_sents(list(map(untag, test.sents())))

test_data, test_results = test_file + ".data", test_file + ".results"
print(
"\n".join([tree.to_conll(10) for tree in test.trees()]).strip(),
file=codecs.open(test_data, "w", "utf8"),
file=open(test_data, "w", "utf8"),
)
print(
"\n".join([tree.to_conll(10) for tree in parsed_trees]).strip(),
file=codecs.open(test_results, "w", "utf8"),
file=open(test_results, "w", "utf8"),
)
subprocess.Popen(
["java", "-jar", "resources/MaltEval.jar", "-g", test_data, "-s", test_results]
@@ -309,9 +303,9 @@ def train_turboparser(

train, test = DadeganReader(train_file), DadeganReader(test_file)
train_data = train_file + ".data"
with codecs.open(train_data, "w", "utf8") as output:
with open(train_data, "w", "utf8") as output:
for tree, sentence in zip(
train.trees(), tagger.tag_sents(map(untag, train.sents()))
train.trees(), tagger.tag_sents(list(map(untag, train.sents())))
):
for i, (node, word) in enumerate(
zip(list(tree.nodes.values())[1:], sentence), start=1
@@ -346,16 +340,16 @@ def train_turboparser(

# evaluation
parser = TurboParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file)
parsed_trees = parser.parse_sents(map(untag, test.sents()))
parsed_trees = parser.parse_sents(list(map(untag, test.sents())))

test_data, test_results = test_file + ".data", test_file + ".results"
print(
"\n".join([tree.to_conll(10) for tree in test.trees()]).strip(),
file=codecs.open(test_data, "w", "utf8"),
file=open(test_data, "w", "utf8"),
)
print(
"\n".join([tree.to_conll(10) for tree in parsed_trees]).strip(),
file=codecs.open(test_results, "w", "utf8"),
file=open(test_results, "w", "utf8"),
)
subprocess.Popen(
[
@@ -390,9 +384,9 @@ def train_stanford_postagger(
list(peykare.sents()), test_size=test_size, random_state=0
)

output = codecs.open(train_file, "w", "utf8")
output = open(train_file, "w", "utf8")
for sentence in train:
print(*(map(lambda w: "/".join(w).replace(" ", "_"), sentence)), file=output)
print(*(["/".join(w).replace(" ", "_") for w in sentence]), file=output)
subprocess.Popen(
[
"java",
8 changes: 4 additions & 4 deletions format_docstrings.py
@@ -1,8 +1,8 @@
import re, textwrap, glob


def format_all_docstrings(pyFile):
text = open(pyFile, "r", encoding="utf-8").read()
def format_all_docstrings(py_file):
text = open(py_file, encoding="utf-8").read()
text = text.replace("\t", " ")

# Regex pattern that matches all docstrings
@@ -13,7 +13,7 @@ def format_all_docstrings(pyFile):
new_doc = format_docstring(old_doc)
text = text.replace(old_doc, new_doc)

open(pyFile, "w", encoding="utf-8").write(text)
open(py_file, "w", encoding="utf-8").write(text)


def format_section(section, new):
@@ -38,7 +38,7 @@ def wrap_text(text, width):
result = ""
lines = text.split("\n")
for line in lines:
wrapped_line = textwrap.fill(line, 79)
wrapped_line = textwrap.fill(line, width)
result += wrapped_line + "\n"

return result
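The `textwrap.fill(line, 79)` to `textwrap.fill(line, width)` change above is a small bug fix: the hard-coded 79 ignored the function's own `width` parameter. A quick standalone illustration of why that matters, using `textwrap` directly:

```python
import textwrap

text = "یک " * 60  # one long line of short words
assert all(len(line) <= 40 for line in textwrap.fill(text, 40).splitlines())
assert any(len(line) > 40 for line in textwrap.fill(text, 79).splitlines())
```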
25 changes: 12 additions & 13 deletions hazm/BijankhanReader.py
@@ -1,5 +1,3 @@
# coding: utf-8

"""این ماژول شامل کلاس‌ها و توابعی برای خواندن پیکرهٔ بی‌جن‌خان است.
[پیکرهٔ
@@ -11,8 +9,7 @@
"""

from __future__ import unicode_literals
import re, codecs

from .Normalizer import *
from .PeykareReader import join_verb_parts

@@ -64,27 +61,29 @@ class BijankhanReader:
"""این کلاس شامل توابعی برای خواندن پیکرهٔ بی‌جن‌خان است.
Args:
bijankhan_file (str): مسیر فایلِ پیکره.
joined_verb_parts (bool, optional): اگر `True‍` باشد افعال چندبخشی را با _ به‌هم می‌چسباند.
pos_map (str, optional): دیکشنری مبدل برچسب‌های ریز به درشت.
bijankhan_file: مسیر فایلِ پیکره.
joined_verb_parts: اگر `True‍` باشد افعال چندبخشی را با _ به‌هم می‌چسباند.
pos_map: دیکشنری مبدل برچسب‌های ریز به درشت.
"""

def __init__(self, bijankhan_file, joined_verb_parts=True, pos_map=default_pos_map):
def __init__(self, bijankhan_file: str, joined_verb_parts:bool=True, pos_map:str=None):
if pos_map is None:
pos_map = default_pos_map
self._bijankhan_file = bijankhan_file
self._joined_verb_parts = joined_verb_parts
self._pos_map = pos_map
self._normalizer = Normalizer(correct_spacing=False)
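The `pos_map=default_pos_map` default above becomes `pos_map=None` plus an explicit check, the usual way to keep a module-level object out of a function signature: the real mapping is substituted only when the caller passes nothing. A minimal sketch of the pattern, with illustrative values rather than the real `default_pos_map` contents:

```python
from typing import Optional

_DEFAULT_POS_MAP = {"N_SING": "N", "ADJ_SIM": "ADJ"}  # illustrative values only


def resolve_pos_map(pos_map: Optional[dict] = None) -> dict:
    # Fall back to the shared default only when nothing was supplied.
    if pos_map is None:
        pos_map = _DEFAULT_POS_MAP
    return pos_map


assert resolve_pos_map()["N_SING"] == "N"
assert resolve_pos_map({"FOO": "BAR"}) == {"FOO": "BAR"}
```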

def _sentences(self):
def _sentences(self) -> str:
"""جملات پیکره را به شکل متن خام برمی‌گرداند.
Yields:
(str): جملهٔ بعدی.
جملهٔ بعدی.
"""
sentence = []
for line in codecs.open(self._bijankhan_file, encoding="utf-8"):
for line in open(self._bijankhan_file, encoding="utf-8"):
parts = re.split(" +", line.strip())
if len(parts) == 2:
word, tag = parts
@@ -96,7 +95,7 @@ def _sentences(self):
yield sentence
sentence = []

def sents(self):
def sents(self) -> list[tuple[str,str]]:
"""جملات پیکره را به شکل لیستی از `(توکن،برچسب)`ها برمی‌گرداند..
Examples:
@@ -105,7 +104,7 @@ def sents(self):
[('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')]
Yields:
(List[Tuple[str,str]]): جملهٔ بعدی در قالب لیستی از `(توکن،برچسب)`ها.
جملهٔ بعدی در قالب لیستی از `(توکن،برچسب)`ها.
"""
map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1]))
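For completeness, a hedged usage sketch of the migrated reader on Python 3. The import path follows the `hazm/BijankhanReader.py` file shown above; the corpus path is an assumption for illustration, not something this commit specifies:

```python
from hazm.BijankhanReader import BijankhanReader

# Hypothetical local path to the Bijankhan corpus file.
reader = BijankhanReader(bijankhan_file="corpora/bijankhan.txt")
for sentence in reader.sents():
    print(sentence)  # e.g. [('اولین', 'ADJ'), ('سیاره', 'N'), ...]
    break
```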