Commit 7c7a770: migrate to python 3
sir-kokabi committed Mar 19, 2023 (1 parent: dd267fb)
Showing 35 changed files with 547 additions and 722 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -16,7 +16,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-versions: [ '3.7', '3.8', '3.9', '3.10' ]
python-versions: [ '3.9', '3.10', '3.11.2' ]
#python-versions: [ '2.7', '3.7', '3.8', '3.9', '3.10', '3.11' ]

steps:
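The matrix above now starts at CPython 3.9, which lines up with the built-in generic annotations (for example `list[tuple[str, str]]`) added later in this commit; those require Python 3.9 (PEP 585). A hedged sketch of a runtime guard a package could add to make that floor explicit; this is hypothetical and not part of the commit:

```python
import sys

# Hypothetical guard, not part of this commit: fail fast on interpreters
# older than what the CI matrix now tests against.
if sys.version_info < (3, 9):
    raise RuntimeError("hazm requires Python 3.9 or newer")
```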
2 changes: 1 addition & 1 deletion LICENSE
@@ -14,7 +14,7 @@ copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
70 changes: 32 additions & 38 deletions data.py
@@ -1,8 +1,4 @@
# coding: utf-8

from __future__ import print_function, unicode_literals
import codecs, subprocess, random
import multiprocessing
import subprocess, random
from collections import Counter
from itertools import islice
from nltk.tag import untag
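A minimal sketch of what the dropped imports relied on: in Python 3, `print()` and unicode string literals are the default, so the `__future__` import is a no-op, and the built-in `open()` accepts `encoding=`, which is all `codecs.open()` was providing here. The path below is the default from `create_words_file` and assumes the repository's `resources/` directory is present:

```python
import codecs

# Both calls open the dictionary in text mode and yield str lines, so the
# built-in open() is a drop-in replacement for codecs.open() on Python 3.
path = "resources/persian.dic"  # default argument of create_words_file()
with codecs.open(path, encoding="utf-8") as legacy, open(path, encoding="utf-8") as modern:
    assert legacy.read() == modern.read()
```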
@@ -19,16 +15,14 @@ def create_words_file(dic_file="resources/persian.dic", output="hazm/data/words.

dic_words = [
line.strip().replace(", ", ",").split("\t")
for line in codecs.open(dic_file, encoding="utf-8")
for line in open(dic_file, encoding="utf-8")
if len(line.strip().split("\t")) == 3
]
dic_words = filter(
lambda item: not item[2].startswith("V") and "NEG" not in item[2], dic_words
)
dic_words = [item for item in dic_words if not item[2].startswith("V") and "NEG" not in item[2]]
dic_words = [
"\t".join(item) for item in sorted(dic_words, key=lambda item: item[0])
]
print(*dic_words, sep="\n", file=codecs.open(output, "w", "utf-8"))
print(*dic_words, sep="\n", file=open(output, "w", "utf-8"))
print(output, "created")
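On the `filter(...)` call that became a comprehension above: in Python 3, `filter()` returns a lazy, single-use iterator rather than a list. A single `sorted(...)` pass over it still works, but the comprehension materialises a plain, reusable list and reads more directly. A minimal sketch of the difference, using made-up dictionary rows:

```python
rows = [["آب", "1", "N"], ["رفت", "1", "V1"]]  # made-up rows with three tab fields

lazy = filter(lambda item: not item[2].startswith("V"), rows)
assert list(lazy) == [["آب", "1", "N"]]
assert list(lazy) == []  # a second pass sees nothing: the iterator is spent

kept = [item for item in rows if not item[2].startswith("V")]
assert kept == [["آب", "1", "N"]]  # a plain list that can be re-read freely
```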


@@ -38,7 +32,7 @@ def evaluate_lemmatizer(
lemmatizer = Lemmatizer()

errors = []
with codecs.open("resources/lemmatizer_errors.txt", "w", "utf8") as output:
with open("resources/lemmatizer_errors.txt", "w", "utf8") as output:
dadegan = DadeganReader(conll_file)
for tree in dadegan.trees():
for node in tree.nodelist[1:]:
@@ -47,11 +41,11 @@ def evaluate_lemmatizer(
errors.append((word, lemma, pos, lemmatizer.lemmatize(word, pos)))
print(len(errors), "errors", file=output)
counter = Counter(errors)
for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
for item, count in sorted(list(counter.items()), key=lambda t: t[1], reverse=True):
print(count, *item, file=output)

missed = []
with codecs.open("resources/lemmatizer_missed.txt", "w", "utf8") as output:
with open("resources/lemmatizer_missed.txt", "w", "utf8") as output:
peykare = PeykareReader(peykare_root)
for sentence in peykare.sents():
for word in sentence:
@@ -60,7 +54,7 @@ def evaluate_lemmatizer(
missed.append(word[0])
print(len(missed), "missed", file=output)
counter = Counter(missed)
for item, count in sorted(counter.items(), key=lambda t: t[1], reverse=True):
for item, count in sorted(list(counter.items()), key=lambda t: t[1], reverse=True):
print(count, item, file=output)
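On the `sorted(list(counter.items()), ...)` change above: in Python 3, `dict.items()` returns an iterable view that `sorted()` accepts directly, so the extra `list(...)` is harmless but not required; `Counter.most_common()` spells the same "highest count first" ordering. A small sketch with made-up words:

```python
from collections import Counter

counter = Counter(["رفت", "رفت", "گفت"])  # made-up lemmatizer misses
by_count = sorted(counter.items(), key=lambda t: t[1], reverse=True)
assert by_count == sorted(list(counter.items()), key=lambda t: t[1], reverse=True)
assert by_count == counter.most_common()  # identical ordering
```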


@@ -81,7 +75,7 @@ def evaluate_normalizer(tnews_root="corpora/tnews"):
affix_spacing=False,
)

with codecs.open("resources/normalized.txt", "w", "utf8") as output1, codecs.open(
with open("resources/normalized.txt", "w", "utf8") as output1, open(
"resources/normalized_token_based.txt", "w", "utf8"
) as output2:
random.seed(0)
@@ -99,7 +93,7 @@ def evaluate_informal_normalizer(sentipars_root="corpora/sentipers"):
normalizer = Normalizer()
informal_normalizer = InformalNormalizer()

output = codecs.open("resources/normalized.txt", "w", "utf8")
output = open("resources/normalized.txt", "w", "utf8")
for comments in sentipers.comments():
for comment in comments:
for sentence in comment:
@@ -120,7 +114,7 @@ def evaluate_chunker(treebank_root="corpora/treebank"):

print(chunker.evaluate(chunked_trees))

output = codecs.open("resources/chunker_errors.txt", "w", "utf8")
output = open("resources/chunker_errors.txt", "w", "utf8")
for sentence, gold in zip(treebank.sents(), chunked_trees):
chunked = chunker.parse(sentence)
if chunked != gold:
@@ -155,14 +149,14 @@ def train_postagger(
'*:s1=%m[0,0,".?$"]',
'*:s2=%m[0,0,".?.?$"]',
'*:s3=%m[0,0,".?.?.?$"]',
'*:p?l=%t[-1,0,"\p"]',
'*:p?=%t[0,0,"\p"]',
'*:p?r=%t[1,0,"\p"]',
'*:p?a=%t[0,0,"^\p*$"]',
'*:n?l=%t[-1,0,"\d"]',
'*:n?=%t[0,0,"\d"]',
'*:n?r=%t[1,0,"\d"]',
'*:n?a=%t[0,0,"^\d*$"]',
r'*:p?l=%t[-1,0,"\p"]',
r'*:p?=%t[0,0,"\p"]',
r'*:p?r=%t[1,0,"\p"]',
r'*:p?a=%t[0,0,"^\p*$"]',
r'*:n?l=%t[-1,0,"\d"]',
r'*:n?=%t[0,0,"\d"]',
r'*:n?r=%t[1,0,"\d"]',
r'*:n?a=%t[0,0,"^\d*$"]',
],
)
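On the `r'...'` prefixes added to the tagger feature templates above: `\p` and `\d` are not valid Python escape sequences, so since Python 3.6 a plain literal such as `'\p'` triggers a DeprecationWarning (a SyntaxWarning in newer releases) even though the backslash is preserved. A raw string passes the backslash through to the template engine untouched and warning-free. A minimal sketch:

```python
plain = "\\p"  # explicit escape: two characters, backslash + p
raw = r"\p"    # raw string: the same two characters, no escape processing
assert plain == raw and len(raw) == 2
```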

@@ -204,7 +198,7 @@ def train_chunker(
)

def retag_trees(trees, sents):
for tree, sentence in zip(trees, tagger.tag_sents(map(untag, sents))):
for tree, sentence in zip(trees, tagger.tag_sents(list(map(untag, sents)))):
for n, word in zip(tree.treepositions("leaves"), sentence):
tree[n] = word

@@ -234,9 +228,9 @@ def train_maltparser(

train, test = DadeganReader(train_file), DadeganReader(test_file)
train_data = train_file + ".data"
with codecs.open(train_data, "w", "utf8") as output:
with open(train_data, "w", "utf8") as output:
for tree, sentence in zip(
train.trees(), tagger.tag_sents(map(untag, train.sents()))
train.trees(), tagger.tag_sents(list(map(untag, train.sents())))
):
for i, (node, word) in enumerate(
zip(list(tree.nodes.values())[1:], sentence), start=1
@@ -283,16 +277,16 @@ def train_maltparser(

# evaluation
parser = MaltParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file)
parsed_trees = parser.parse_sents(map(untag, test.sents()))
parsed_trees = parser.parse_sents(list(map(untag, test.sents())))

test_data, test_results = test_file + ".data", test_file + ".results"
print(
"\n".join([tree.to_conll(10) for tree in test.trees()]).strip(),
file=codecs.open(test_data, "w", "utf8"),
file=open(test_data, "w", "utf8"),
)
print(
"\n".join([tree.to_conll(10) for tree in parsed_trees]).strip(),
file=codecs.open(test_results, "w", "utf8"),
file=open(test_results, "w", "utf8"),
)
subprocess.Popen(
["java", "-jar", "resources/MaltEval.jar", "-g", test_data, "-s", test_results]
@@ -309,9 +303,9 @@ def train_turboparser(

train, test = DadeganReader(train_file), DadeganReader(test_file)
train_data = train_file + ".data"
with codecs.open(train_data, "w", "utf8") as output:
with open(train_data, "w", "utf8") as output:
for tree, sentence in zip(
train.trees(), tagger.tag_sents(map(untag, train.sents()))
train.trees(), tagger.tag_sents(list(map(untag, train.sents())))
):
for i, (node, word) in enumerate(
zip(list(tree.nodes.values())[1:], sentence), start=1
@@ -346,16 +340,16 @@ def train_turboparser(

# evaluation
parser = TurboParser(tagger=tagger, lemmatizer=lemmatizer, model_file=model_file)
parsed_trees = parser.parse_sents(map(untag, test.sents()))
parsed_trees = parser.parse_sents(list(map(untag, test.sents())))

test_data, test_results = test_file + ".data", test_file + ".results"
print(
"\n".join([tree.to_conll(10) for tree in test.trees()]).strip(),
file=codecs.open(test_data, "w", "utf8"),
file=open(test_data, "w", "utf8"),
)
print(
"\n".join([tree.to_conll(10) for tree in parsed_trees]).strip(),
file=codecs.open(test_results, "w", "utf8"),
file=open(test_results, "w", "utf8"),
)
subprocess.Popen(
[
@@ -390,9 +384,9 @@ def train_stanford_postagger(
list(peykare.sents()), test_size=test_size, random_state=0
)

output = codecs.open(train_file, "w", "utf8")
output = open(train_file, "w", "utf8")
for sentence in train:
print(*(map(lambda w: "/".join(w).replace(" ", "_"), sentence)), file=output)
print(*(["/".join(w).replace(" ", "_") for w in sentence]), file=output)
subprocess.Popen(
[
"java",
8 changes: 4 additions & 4 deletions format_docstrings.py
@@ -1,8 +1,8 @@
import re, textwrap, glob


def format_all_docstrings(pyFile):
text = open(pyFile, "r", encoding="utf-8").read()
def format_all_docstrings(py_file):
text = open(py_file, encoding="utf-8").read()
text = text.replace("\t", " ")

# Regex pattern that matches all docstrings
@@ -13,7 +13,7 @@ def format_all_docstrings(pyFile):
new_doc = format_docstring(old_doc)
text = text.replace(old_doc, new_doc)

open(pyFile, "w", encoding="utf-8").write(text)
open(py_file, "w", encoding="utf-8").write(text)


def format_section(section, new):
@@ -38,7 +38,7 @@ def wrap_text(text, width):
result = ""
lines = text.split("\n")
for line in lines:
wrapped_line = textwrap.fill(line, 79)
wrapped_line = textwrap.fill(line, width)
result += wrapped_line + "\n"

return result
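The `textwrap.fill(line, 79)` to `textwrap.fill(line, width)` change above is a small bug fix: the hard-coded 79 ignored the function's own `width` parameter. A quick standalone illustration of why that matters, using `textwrap` directly:

```python
import textwrap

text = "یک " * 60  # one long line of short words
assert all(len(line) <= 40 for line in textwrap.fill(text, 40).splitlines())
assert any(len(line) > 40 for line in textwrap.fill(text, 79).splitlines())
```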
25 changes: 12 additions & 13 deletions hazm/BijankhanReader.py
@@ -1,5 +1,3 @@
# coding: utf-8

"""این ماژول شامل کلاس‌ها و توابعی برای خواندن پیکرهٔ بی‌جن‌خان است.
[پیکرهٔ
@@ -11,8 +9,7 @@
"""

from __future__ import unicode_literals
import re, codecs

from .Normalizer import *
from .PeykareReader import join_verb_parts

@@ -64,27 +61,29 @@ class BijankhanReader:
"""این کلاس شامل توابعی برای خواندن پیکرهٔ بی‌جن‌خان است.
Args:
bijankhan_file (str): مسیر فایلِ پیکره.
joined_verb_parts (bool, optional): اگر `True‍` باشد افعال چندبخشی را با _ به‌هم می‌چسباند.
pos_map (str, optional): دیکشنری مبدل برچسب‌های ریز به درشت.
bijankhan_file: مسیر فایلِ پیکره.
joined_verb_parts: اگر `True‍` باشد افعال چندبخشی را با _ به‌هم می‌چسباند.
pos_map: دیکشنری مبدل برچسب‌های ریز به درشت.
"""

def __init__(self, bijankhan_file, joined_verb_parts=True, pos_map=default_pos_map):
def __init__(self, bijankhan_file: str, joined_verb_parts:bool=True, pos_map:str=None):
if pos_map is None:
pos_map = default_pos_map
self._bijankhan_file = bijankhan_file
self._joined_verb_parts = joined_verb_parts
self._pos_map = pos_map
self._normalizer = Normalizer(correct_spacing=False)
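The `pos_map=default_pos_map` default above becomes `pos_map=None` plus an explicit check, the usual way to keep a module-level object out of a function signature: the real mapping is substituted only when the caller passes nothing. A minimal sketch of the pattern, with illustrative values rather than the real `default_pos_map` contents:

```python
from typing import Optional

_DEFAULT_POS_MAP = {"N_SING": "N", "ADJ_SIM": "ADJ"}  # illustrative values only


def resolve_pos_map(pos_map: Optional[dict] = None) -> dict:
    # Fall back to the shared default only when nothing was supplied.
    if pos_map is None:
        pos_map = _DEFAULT_POS_MAP
    return pos_map


assert resolve_pos_map()["N_SING"] == "N"
assert resolve_pos_map({"FOO": "BAR"}) == {"FOO": "BAR"}
```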

def _sentences(self):
def _sentences(self) -> str:
"""جملات پیکره را به شکل متن خام برمی‌گرداند.
Yields:
(str): جملهٔ بعدی.
جملهٔ بعدی.
"""
sentence = []
for line in codecs.open(self._bijankhan_file, encoding="utf-8"):
for line in open(self._bijankhan_file, encoding="utf-8"):
parts = re.split(" +", line.strip())
if len(parts) == 2:
word, tag = parts
@@ -96,7 +95,7 @@ def _sentences(self):
yield sentence
sentence = []

def sents(self):
def sents(self) -> list[tuple[str,str]]:
"""جملات پیکره را به شکل لیستی از `(توکن،برچسب)`ها برمی‌گرداند..
Examples:
@@ -105,7 +104,7 @@ def sents(self):
[('اولین', 'ADJ'), ('سیاره', 'N'), ('خارج', 'ADJ'), ('از', 'PREP'), ('منظومه', 'N'), ('شمسی', 'ADJ'), ('دیده_شد', 'V'), ('.', 'PUNC')]
Yields:
(List[Tuple[str,str]]): جملهٔ بعدی در قالب لیستی از `(توکن،برچسب)`ها.
جملهٔ بعدی در قالب لیستی از `(توکن،برچسب)`ها.
"""
map_poses = lambda item: (item[0], self._pos_map.get(item[1], item[1]))
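For completeness, a hedged usage sketch of the migrated reader on Python 3. The import path follows the `hazm/BijankhanReader.py` file shown above; the corpus path is an assumption for illustration, not something this commit specifies:

```python
from hazm.BijankhanReader import BijankhanReader

# Hypothetical local path to the Bijankhan corpus file.
reader = BijankhanReader(bijankhan_file="corpora/bijankhan.txt")
for sentence in reader.sents():
    print(sentence)  # e.g. [('اولین', 'ADJ'), ('سیاره', 'N'), ...]
    break
```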