diff --git a/data.py b/data.py index ceafd183..b8781eb2 100755 --- a/data.py +++ b/data.py @@ -14,7 +14,6 @@ from hazm.corpus_readers.peykare_reader import \ coarse_pos_e as peykare_coarse_pos_e from hazm.dependency_parser import MaltParser, TurboParser -from hazm.pos_tagger import StanfordPOSTagger def create_words_file(dic_file="tests/files/persian.dic", output="hazm/data/words.dat"): @@ -381,48 +380,4 @@ def train_turboparser( "LAS;UAS", ] ).wait() - - -def train_stanford_postagger( - peykare_root="tests/files/peykare", - path_to_model="tests/files/persian.tagger", - path_to_jar="tests/files/stanford_postagger.jar", - properties_file="tests/files/stanford-postagger.props", - memory_min="-Xms1g", - memory_max="-Xmx6g", - test_size=0.1, - pos_map=peykare_coarse_pos_e, -): - peykare = PeykareReader(peykare_root, pos_map=pos_map) - train_file = "tests/files/tagger_train_data.txt" - train, test = train_test_split( - list(peykare.sents()), test_size=test_size, random_state=0 - ) - - output = open(train_file, "w", "utf8") - for sentence in train: - print(*(["/".join(w).replace(" ", "_") for w in sentence]), file=output) - subprocess.Popen( - [ - "java", - memory_min, - memory_max, - "-classpath", - path_to_jar, - "edu.stanford.nlp.tagger.maxent.MaxentTagger", - "-prop", - properties_file, - "-model", - path_to_model, - "-trainFile", - train_file, - "-tagSeparator", - "/", - "-search", - "owlqn2", - ] - ).wait() - - tagger = StanfordPOSTagger(path_to_jar=path_to_jar, path_to_model=path_to_model) - print(tagger.evaluate(test)) diff --git a/hazm/__init__.py b/hazm/__init__.py index 7e5befd7..bd2362cd 100644 --- a/hazm/__init__.py +++ b/hazm/__init__.py @@ -19,7 +19,6 @@ from hazm.sequence_tagger import SequenceTagger from hazm.pos_tagger import POSTagger -from hazm.pos_tagger import StanfordPOSTagger from hazm.stemmer import Stemmer from hazm.word_tokenizer import WordTokenizer diff --git a/hazm/pos_tagger.py b/hazm/pos_tagger.py index d999dce0..7b6d4f9d 100644 --- a/hazm/pos_tagger.py +++ b/hazm/pos_tagger.py @@ -1,7 +1,5 @@ """این ماژول شامل کلاس‌ها و توابعی برای برچسب‌گذاری توکن‌هاست.""" -from nltk.tag import stanford - from hazm import SequenceTagger punctuation_list = [ @@ -155,38 +153,3 @@ def tag_sents(self: "POSTagger", sentences): if self.__is_universal else tagged_sents ) - - -class StanfordPOSTagger(stanford.StanfordPOSTagger): - """StanfordPOSTagger.""" - - def __init__( - self: "StanfordPOSTagger", - model_filename: "str", - path_to_jar: str, - *args, # noqa: ANN002 - **kwargs, # noqa: ANN003 - ) -> None: - self._SEPARATOR = "/" - super(stanford.StanfordPOSTagger, self).__init__( - model_filename=model_filename, - path_to_jar=path_to_jar, - *args, # noqa: B026 - **kwargs, - ) - - def tag(self: "StanfordPOSTagger", tokens): - """tag. - - Examples: - >>> tagger = StanfordPOSTagger(model_filename='persian.tagger', path_to_jar='stanford_postagger.jar') - >>> tagger.tag(['من', 'به', 'مدرسه', 'رفته_بودم', '.']) - [('من', 'PRO'), ('به', 'P'), ('مدرسه', 'N'), ('رفته_بودم', 'V'), ('.', 'PUNC')] - - """ - return self.tag_sents([tokens])[0] - - def tag_sents(self: "StanfordPOSTagger", sentences): - """tag_sents.""" - refined = ([w.replace(" ", "_") for w in s] for s in sentences) - return super(stanford.StanfordPOSTagger, self).tag_sents(refined) diff --git a/tests/conftest.py b/tests/conftest.py index 3c0962be..98b44bba 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,7 +9,6 @@ from hazm import RuleBasedChunker from hazm import SentEmbedding from hazm import SentenceTokenizer -from hazm import StanfordPOSTagger from hazm import Stemmer from hazm import TokenSplitter from hazm import WordEmbedding @@ -48,10 +47,6 @@ def pos_tagger(): def universal_pos_tagger(): return POSTagger(model="tests/files/pos_tagger.model",universal_tag=True) -@pytest.fixture(scope="session") -def stanford_pos_tagger(): - return StanfordPOSTagger(model_filename="tests/files/persian.tagger", path_to_jar="tests/files/stanford_postagger.jar") - @pytest.fixture(scope="session") def token_splitter(): return TokenSplitter() diff --git a/tests/test_pos_tagger.py b/tests/test_pos_tagger.py index 33f7e2f2..eb8ea02a 100644 --- a/tests/test_pos_tagger.py +++ b/tests/test_pos_tagger.py @@ -19,9 +19,3 @@ def test_tag_sents_universal(self:"TestPOSTagger", universal_pos_tagger): actual = universal_pos_tagger.tag_sents([["من", "به", "مدرسه", "ایران", "رفته_بودم", "."]]) expected = [[("من", "PRON"), ("به", "ADP"), ("مدرسه", "NOUN"), ("ایران", "NOUN"), ("رفته_بودم", "VERB"), (".", "PUNCT")]] assert actual == expected - -class TestStanfordPOSTagger: - def test_data_maker(self:"TestPOSTagger", stanford_pos_tagger): - actual = stanford_pos_tagger.tag(["من", "به", "مدرسه", "رفته_بودم", "."]) - expected = [("من", "PRO"), ("به", "P"), ("مدرسه", "N"), ("رفته_بودم", "V"), (".", "PUNC")] - assert actual == expected