Skip to content

Commit

Permalink
remove stanford
Browse files Browse the repository at this point in the history
  • Loading branch information
sir-kokabi committed Oct 3, 2023
1 parent af46be6 commit 6aea285
Show file tree
Hide file tree
Showing 5 changed files with 0 additions and 94 deletions.
45 changes: 0 additions & 45 deletions data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from hazm.corpus_readers.peykare_reader import \
coarse_pos_e as peykare_coarse_pos_e
from hazm.dependency_parser import MaltParser, TurboParser
from hazm.pos_tagger import StanfordPOSTagger


def create_words_file(dic_file="tests/files/persian.dic", output="hazm/data/words.dat"):
Expand Down Expand Up @@ -381,48 +380,4 @@ def train_turboparser(
"LAS;UAS",
]
).wait()


def train_stanford_postagger(
    peykare_root="tests/files/peykare",
    path_to_model="tests/files/persian.tagger",
    path_to_jar="tests/files/stanford_postagger.jar",
    properties_file="tests/files/stanford-postagger.props",
    memory_min="-Xms1g",
    memory_max="-Xmx6g",
    test_size=0.1,
    pos_map=peykare_coarse_pos_e,
):
    """Train a Stanford MaxEnt POS tagger on the Peykare corpus and print its accuracy.

    Splits the Peykare sentences into train/test sets, writes the training
    sentences in Stanford's ``word/TAG`` format, shells out to the Stanford
    MaxentTagger trainer, then evaluates the resulting model on the held-out
    sentences.

    Args:
        peykare_root: Root directory of the Peykare corpus.
        path_to_model: Output path for the trained tagger model.
        path_to_jar: Path to the Stanford POS tagger jar.
        properties_file: Stanford tagger training properties file.
        memory_min: JVM minimum-heap flag passed to ``java``.
        memory_max: JVM maximum-heap flag passed to ``java``.
        test_size: Fraction of sentences held out for evaluation.
        pos_map: Mapping applied to Peykare POS tags (coarse tags by default).
    """
    peykare = PeykareReader(peykare_root, pos_map=pos_map)
    train_file = "tests/files/tagger_train_data.txt"
    train, test = train_test_split(
        list(peykare.sents()), test_size=test_size, random_state=0
    )

    # BUG FIX: built-in open()'s third positional parameter is `buffering`
    # (an int), so open(train_file, "w", "utf8") raised TypeError. Pass the
    # encoding by keyword, and use a context manager so the file is flushed
    # and closed before the java subprocess reads it.
    with open(train_file, "w", encoding="utf8") as output:
        for sentence in train:
            # Stanford's trainer expects space-separated word/TAG tokens, so
            # spaces inside multi-word tokens are replaced with underscores.
            print(*("/".join(w).replace(" ", "_") for w in sentence), file=output)

    subprocess.Popen(
        [
            "java",
            memory_min,
            memory_max,
            "-classpath",
            path_to_jar,
            "edu.stanford.nlp.tagger.maxent.MaxentTagger",
            "-prop",
            properties_file,
            "-model",
            path_to_model,
            "-trainFile",
            train_file,
            "-tagSeparator",
            "/",
            "-search",
            "owlqn2",
        ]
    ).wait()

    # Evaluate the freshly trained model on the held-out split.
    tagger = StanfordPOSTagger(path_to_jar=path_to_jar, path_to_model=path_to_model)
    print(tagger.evaluate(test))

1 change: 0 additions & 1 deletion hazm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from hazm.sequence_tagger import SequenceTagger

from hazm.pos_tagger import POSTagger
from hazm.pos_tagger import StanfordPOSTagger

from hazm.stemmer import Stemmer
from hazm.word_tokenizer import WordTokenizer
Expand Down
37 changes: 0 additions & 37 deletions hazm/pos_tagger.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
"""این ماژول شامل کلاس‌ها و توابعی برای برچسب‌گذاری توکن‌هاست."""

from nltk.tag import stanford

from hazm import SequenceTagger

punctuation_list = [
Expand Down Expand Up @@ -155,38 +153,3 @@ def tag_sents(self: "POSTagger", sentences):
if self.__is_universal
else tagged_sents
)


class StanfordPOSTagger(stanford.StanfordPOSTagger):
    """NLTK Stanford POS tagger adapted for Persian.

    Uses ``/`` as the word/tag separator and replaces spaces inside
    multi-word tokens with underscores before tagging.
    """

    def __init__(
        self: "StanfordPOSTagger",
        model_filename: "str",
        path_to_jar: str,
        *args,  # noqa: ANN002
        **kwargs,  # noqa: ANN003
    ) -> None:
        """Initialize the tagger.

        Args:
            model_filename: Path to the trained Stanford tagger model file.
            path_to_jar: Path to the Stanford POS tagger jar.
        """
        self._SEPARATOR = "/"
        # NOTE(review): super(stanford.StanfordPOSTagger, self) deliberately
        # skips stanford.StanfordPOSTagger.__init__ in the MRO and calls its
        # parent's __init__ directly — presumably to bypass the parent's own
        # separator/option handling; confirm against the installed NLTK version.
        super(stanford.StanfordPOSTagger, self).__init__(
            model_filename=model_filename,
            path_to_jar=path_to_jar,
            *args,  # noqa: B026
            **kwargs,
        )

    def tag(self: "StanfordPOSTagger", tokens):
        """Tag a single tokenized sentence, returning (word, tag) pairs.

        Examples:
            >>> tagger = StanfordPOSTagger(model_filename='persian.tagger', path_to_jar='stanford_postagger.jar')
            >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.'])
            [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')]
        """
        # Delegates to tag_sents so the space-to-underscore refinement applies.
        return self.tag_sents([tokens])[0]

    def tag_sents(self: "StanfordPOSTagger", sentences):
        """Tag multiple tokenized sentences.

        Spaces inside tokens are replaced with underscores so each token
        survives Stanford's space-delimited input format intact.
        """
        refined = ([w.replace(" ", "_") for w in s] for s in sentences)
        # NOTE(review): again skips the immediate parent's tag_sents in the
        # MRO, calling the grandparent implementation directly.
        return super(stanford.StanfordPOSTagger, self).tag_sents(refined)
5 changes: 0 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from hazm import RuleBasedChunker
from hazm import SentEmbedding
from hazm import SentenceTokenizer
from hazm import StanfordPOSTagger
from hazm import Stemmer
from hazm import TokenSplitter
from hazm import WordEmbedding
Expand Down Expand Up @@ -48,10 +47,6 @@ def pos_tagger():
def universal_pos_tagger():
return POSTagger(model="tests/files/pos_tagger.model",universal_tag=True)

@pytest.fixture(scope="session")
def stanford_pos_tagger():
    """Session-scoped StanfordPOSTagger backed by the bundled test model and jar."""
    return StanfordPOSTagger(model_filename="tests/files/persian.tagger", path_to_jar="tests/files/stanford_postagger.jar")

@pytest.fixture(scope="session")
def token_splitter():
    """Session-scoped TokenSplitter instance shared across the test suite."""
    return TokenSplitter()
Expand Down
6 changes: 0 additions & 6 deletions tests/test_pos_tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,3 @@ def test_tag_sents_universal(self:"TestPOSTagger", universal_pos_tagger):
actual = universal_pos_tagger.tag_sents([["من", "به", "مدرسه", "ایران", "رفته_بودم", "."]])
expected = [[("من", "PRON"), ("به", "ADP"), ("مدرسه", "NOUN"), ("ایران", "NOUN"), ("رفته_بودم", "VERB"), (".", "PUNCT")]]
assert actual == expected

class TestStanfordPOSTagger:
    """Tests for the StanfordPOSTagger wrapper."""

    # FIX: the `self` annotation was copy-pasted as "TestPOSTagger" from the
    # sibling test class; it now names this class. (The method name
    # `test_data_maker` also looks copy-pasted — it exercises tag(), not data
    # creation — but is kept so the test's identity is unchanged.)
    def test_data_maker(self: "TestStanfordPOSTagger", stanford_pos_tagger):
        """tag() returns (word, tag) pairs for a simple Persian sentence."""
        actual = stanford_pos_tagger.tag(["من", "به", "مدرسه", "رفته_بودم", "."])
        expected = [("من", "PRO"), ("به", "P"), ("مدرسه", "N"), ("رفته_بودم", "V"), (".", "PUNC")]
        assert actual == expected

0 comments on commit 6aea285

Please sign in to comment.