From f840599e1d1e0db4e23940a8848a4abe52aedaeb Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 17:49:56 -0300 Subject: [PATCH 01/14] split main in modules --- Makefile | 5 +- pymathics/natlang/__init__.py | 25 +- pymathics/natlang/linguistic_data.py | 350 ++++ pymathics/natlang/main.py | 1506 ----------------- pymathics/natlang/normalization.py | 301 ++++ pymathics/natlang/spacy.py | 249 +++ pymathics/natlang/textual_analysis.py | 427 +++++ pymathics/natlang/translation.py | 45 + pymathics/natlang/util.py | 328 ++++ setup.py | 7 +- .../test_summary_text.py | 3 +- 11 files changed, 1722 insertions(+), 1524 deletions(-) create mode 100644 pymathics/natlang/linguistic_data.py delete mode 100644 pymathics/natlang/main.py create mode 100644 pymathics/natlang/normalization.py create mode 100644 pymathics/natlang/spacy.py create mode 100644 pymathics/natlang/textual_analysis.py create mode 100644 pymathics/natlang/translation.py create mode 100644 pymathics/natlang/util.py diff --git a/Makefile b/Makefile index a4ac90b..6de236d 100644 --- a/Makefile +++ b/Makefile @@ -72,7 +72,7 @@ pytest: doctest: - MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline -l pymathics.natlang -c "Natural Language Processing" $o + MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline -l pymathics.natlang -c 'Natural Language Processing' $o # #: Make Mathics PDF manual @@ -89,5 +89,4 @@ ChangeLog: rmChangeLog #: Run pytest consistency and style checks check-consistency-and-style: - # MATHICS_LINT=t $(PYTHON) -m pytest test/consistency-and-style - echo "check-consistency-and-style deactivated. Activate me later. " + MATHICS_LINT=t $(PYTHON) -m pytest test/consistency-and-style diff --git a/pymathics/natlang/__init__.py b/pymathics/natlang/__init__.py index f3b883e..c0d76ae 100644 --- a/pymathics/natlang/__init__.py +++ b/pymathics/natlang/__init__.py @@ -38,28 +38,32 @@ = Old Man Apulia, conduct peculiar """ - -from pymathics.natlang.main import ( - DeleteStopwords, - DictionaryLookup, - DictionaryWordQ, - LanguageIdentify, +from pymathics.natlang.linguistic_data import ( Pluralize, RandomWord, - SpellingCorrectionList, + WordData, + WordDefinition, + WordList, +) +from pymathics.natlang.normalization import ( + DeleteStopwords, TextCases, TextPosition, TextSentences, TextStructure, TextWords, +) +from pymathics.natlang.textual_analysis import ( + Containing, + DictionaryLookup, + DictionaryWordQ, + SpellingCorrectionList, WordCount, - WordData, - WordDefinition, WordFrequency, - WordList, WordSimilarity, WordStem, ) +from pymathics.natlang.translation import LanguageIdentify from pymathics.natlang.version import __version__ pymathics_version_data = { @@ -70,6 +74,7 @@ } __all__ = [ + "Containing", "DeleteStopwords", "DictionaryLookup", "DictionaryWordQ", diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py new file mode 100644 index 0000000..942ff48 --- /dev/null +++ b/pymathics/natlang/linguistic_data.py @@ -0,0 +1,350 @@ +# -*- coding: utf-8 -*- +""" +Linguistic Data + +See :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. 
+ +""" + + +# TODO: Complete me + +# WordFrequencyData — data on typical current and historical word frequencies +# Synonyms — synonyms for a word +# Antonyms — antonyms for a word +# PartOfSpeech — possible parts of speech for a word + + +from typing import Optional + +from mathics.builtin.base import Builtin, MessageException + +# from mathics.builtin.codetables import iso639_3 +from mathics.builtin.numbers.randomnumbers import RandomEnv +from mathics.core.atoms import String +from mathics.core.convert.expression import Expression, to_expression +from mathics.core.evaluation import Evaluation +from mathics.core.list import ListExpression +from mathics.core.symbols import Symbol, SymbolList +from mathics.core.systemsymbols import SymbolMissing, SymbolRule, SymbolStringExpression +from pattern.en import pluralize + +from pymathics.natlang.textual_analysis import WordStem +from pymathics.natlang.util import ( + WordProperty, + _WordListBuiltin, + _wordnet_pos_to_type, + _WordNetBuiltin, + merge_dictionaries, +) + +SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup") +StringNotAvailable = String("NotAvailable") + + +class Pluralize(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/Pluralize.html + +
+
'Pluralize[$word$]' +
returns the plural form of $word$. +
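+
Pluralization is done with the English inflection rules of the 'pattern' library, which must be installed.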
+ + >> Pluralize["potato"] + = potatoes + """ + + requires = ("pattern",) + summary_text = "Retrieve the pluralized form of a word" + + def eval(self, word, evaluation): + "Pluralize[word_String]" + + return String(pluralize(word.value)) + + +class RandomWord(_WordListBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/RandomWord.html + +
+
'RandomWord[]' +
returns a random word. + +
'RandomWord[$type$]' +
returns a random word of the given $type$, e.g. "Noun" or "Adverb".
'RandomWord[$type$, $n$]' +
returns $n$ random words of the given $type$. + + >> RandomWord["Noun"] + = ... + >> RandomWord["Noun", 3] + = {..., ..., ...} + +
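+    The available word types are the WordNet parts of speech: "Noun", "Verb",
+    "Adjective" and "Adverb"; "All" draws from every part of speech.
+
+    >> RandomWord["Adverb", 2]
+    = {..., ...}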
+ """ + + summary_text = "generate a random word of a given kind" + + def _random_words(self, type, n, evaluation: Evaluation, options: dict): + words = self._words(self._language_name(evaluation, options), type, evaluation) + if words is not None: + with RandomEnv(evaluation) as rand: + return [ + String(words[rand.randint(0, len(words) - 1)].replace("_", " ")) + for _ in range(n) + ] + + def eval(self, evaluation: Evaluation, options: dict): + "RandomWord[OptionsPattern[RandomWord]]" + words = self._random_words("All", 1, evaluation, options) + if words: + return words[0] + + def eval_type(self, type, evaluation: Evaluation, options: dict): + "RandomWord[type_String, OptionsPattern[RandomWord]]" + words = self._random_words(type.value, 1, evaluation, options) + if words: + return words[0] + + def eval_type_n(self, type, n, evaluation: Evaluation, options: dict): + "RandomWord[type_String, n_Integer, OptionsPattern[RandomWord]]" + words = self._random_words(type.value, n.value, evaluation, options) + if words: + return ListExpression(*words) + + +class WordData(_WordListBuiltin): + """ + + :WMA: + https://reference.wolfram.com/language/ref/WordData.html + +
+
'WordData[$word$]' +
returns a list of the possible senses of $word$.
'WordData[$word$, $property$]' +
returns detailed information about $word$ regarding $property$, e.g. "Definitions" or "Examples".
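+
'WordData[$word$, $property$, $form$]'
returns the information about $property$ formatted as $form$, which may be "Rules", "ShortRules" or "List".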
+ + The following are valid properties: + + + >> WordData["riverside", "Definitions"] + = {{riverside, Noun, Bank} -> the bank of a river} + + >> WordData[{"fish", "Verb", "Angle"}, "Examples"] + = {{fish, Verb, Angle} -> {fish for compliments}} + """ + + messages = merge_dictionaries( + _WordNetBuiltin.messages, + { + "notprop": "WordData[] does not recognize `1` as a valid property.", + }, + ) + summary_text = "retrieve an association with properties of a word" + + def _parse_word(self, word): + if isinstance(word, String): + return word.value.lower() + elif word.get_head_name() == "System`List": + if len(word.elements) == 3 and all( + isinstance(s, String) for s in word.elements + ): + return tuple(s.value for s in word.elements) + + def _standard_property( + self, py_word, py_form, py_property, wordnet, language_code, evaluation + ): + senses = self._senses(py_word, wordnet, language_code) + if not senses: + return Expression(SymbolMissing, StringNotAvailable) + elif py_form == "List": + word_property = WordProperty(self._short_syn_form, wordnet, language_code) + property_getter = getattr( + word_property, "%s" % self._underscore(py_property), None + ) + if property_getter: + return ListExpression( + *[property_getter(syn, desc) for syn, desc in senses] + ) + elif py_form in ("Rules", "ShortRules"): + syn_form = (lambda s: s) if py_form == "Rules" else (lambda s: s[0]) + word_property = WordProperty(syn_form, wordnet, language_code) + property_getter = getattr( + word_property, self._underscore(py_property), None + ) + if property_getter: + list_expr_elements = [ + to_expression(SymbolRule, desc, property_getter(syn, desc)) + for syn, desc in senses + ] + return ListExpression(*list_expr_elements) + evaluation.message(self.get_name(), "notprop", property) + + def _parts_of_speech(self, py_word, wordnet, language_code): + parts = set( + syn.pos() for syn, _ in self._senses(py_word, wordnet, language_code) + ) + if not parts: + return Expression(SymbolMissing, StringNotAvailable) + else: + return ListExpression( + *[String(s) for s in sorted([_wordnet_pos_to_type[p] for p in parts])] + ) + + def _property( + self, word, py_property, py_form, evaluation: Evaluation, options: dict + ): + if py_property == "PorterStem": + if isinstance(word, String): + return String(WordStem.porter(word.value)) + else: + return + + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if not wordnet: + return + + py_word = self._parse_word(word) + if not py_word: + return + + if py_property == "PartsOfSpeech": + return self._parts_of_speech(py_word, wordnet, language_code) + + try: + return self._standard_property( + py_word, py_form, py_property, wordnet, language_code, evaluation + ) + except MessageException as e: + e.message(evaluation) + + def eval(self, word, evaluation: Evaluation, options: dict) -> Optional[Expression]: + "WordData[word_, OptionsPattern[WordData]]" + if word.get_head() is SymbolStringExpression: + return Expression(SymbolDictionaryLookup, word) + elif isinstance(word, String) or word.get_head() is SymbolList: + pass + else: + return + + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if not wordnet: + return + + py_word = self._parse_word(word) + if not py_word: + return + + senses = self._senses(py_word, wordnet, language_code) + if senses is not None: + return ListExpression(*[[String(s) for s in desc] for syn, desc in senses]) + + def eval_property(self, word, property, 
evaluation: Evaluation, options: dict):
+        "WordData[word_, property_String, OptionsPattern[WordData]]"
+        # get_head is a method; without the call the comparison below is
+        # always False and StringExpression lookups never dispatch.
+        if word.get_head() is SymbolStringExpression:
+            if property.get_string_value() == "Lookup":
+                return Expression(SymbolDictionaryLookup, word)
+        elif isinstance(word, String) or word.get_head() is SymbolList:
+            return self._property(
+                word, property.get_string_value(), "ShortRules", evaluation, options
+            )
+
+    def eval_property_form(
+        self, word, property, form, evaluation: Evaluation, options: dict
+    ):
+        "WordData[word_, property_String, form_String, OptionsPattern[WordData]]"
+        if isinstance(word, String) or word.get_head() is SymbolList:
+            return self._property(
+                word,
+                property.value,
+                form.value,
+                evaluation,
+                options,
+            )
+
+
+class WordDefinition(_WordNetBuiltin):
+    """
+    :WMA:
+    https://reference.wolfram.com/language/ref/WordDefinition.html
+
+
'WordDefinition[$word$]' +
returns a definition of $word$, or Missing["NotAvailable"] if $word$ is not known.
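+
If $word$ has more than one sense in WordNet, one definition is returned for each sense.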
+ + >> WordDefinition["gram"] + = {a metric unit of weight equal to one thousandth of a kilogram} + """ + + summary_text = "retrieve the definition of a word" + + def eval(self, word, evaluation: Evaluation, options: dict): + "WordDefinition[word_String, OptionsPattern[WordDefinition]]" + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if wordnet: + senses = self._senses(word.value.lower(), wordnet, language_code) + if senses: + return ListExpression(*[String(syn.definition()) for syn, _ in senses]) + else: + return Expression(SymbolMissing, StringNotAvailable) + + +class WordList(_WordListBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordList.html + +
+
'WordList[]' +
returns a list of common words. + +
'WordList[$type$]' +
returns a list of common words of type $type$. +
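+
$type$ accepts the same names as 'RandomWord': "Noun", "Verb", "Adjective" or "Adverb".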
+ + >> N[Mean[StringLength /@ WordList["Adjective"]], 2] + = 9.3 + """ + + summary_text = "retrieve a list of common words" + + def eval(self, evaluation: Evaluation, options: dict): + "WordList[OptionsPattern[]]" + words = self._words(self._language_name(evaluation, options), "All", evaluation) + if words is not None: + return ListExpression(*(String(word) for word in words)) + + def eval_type(self, wordtype, evaluation: Evaluation, options: dict): + "WordList[wordtype_String, OptionsPattern[]]" + words = self._words( + self._language_name(evaluation, options), + wordtype.value, + evaluation, + ) + if words is not None: + return ListExpression(*(String(word) for word in words)) diff --git a/pymathics/natlang/main.py b/pymathics/natlang/main.py deleted file mode 100644 index 4d6dbe6..0000000 --- a/pymathics/natlang/main.py +++ /dev/null @@ -1,1506 +0,0 @@ -# -*- coding: utf-8 -*- -# FIXME: split this up into smaller pieces - -""" -Natural Language Functions - -""" - -import heapq -import itertools -import math - -# import os -import re -from itertools import chain -from typing import Optional, Union - -import enchant -import langid # see https://github.com/saffsd/langid.py -import pycountry -import spacy -from mathics.builtin.atomic.strings import anchor_pattern, to_regex -from mathics.builtin.base import Builtin, MessageException -from mathics.builtin.codetables import iso639_3 -from mathics.builtin.numbers.randomnumbers import RandomEnv -from mathics.core.atoms import Integer, Real, String -from mathics.core.convert.expression import ( - ListExpression, - to_expression, - to_mathics_list, -) -from mathics.core.evaluation import Evaluation -from mathics.core.expression import Expression -from mathics.core.symbols import ( - Symbol, - SymbolFalse, - SymbolList, - SymbolTrue, - strip_context, -) -from mathics.core.systemsymbols import ( - SymbolFailed, - SymbolMissing, - SymbolRule, - SymbolStringExpression, -) -from mathics.eval.nevaluator import eval_N -from pattern.en import pluralize - -SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup") - -StringNotAvailable = String("NotAvailable") - - -def _parse_nltk_lookup_error(e): - m = re.search(r"Resource '([^']+)' not found\.", str(e)) - if m: - return m.group(1) - else: - return "unknown" - - -def _make_forms(): - forms = { - "Word": lambda doc: (token for token in doc), - "Sentence": lambda doc: (sent for sent in doc.sents), - "Paragraph": lambda doc: _fragments(doc, re.compile(r"^[\n][\n]+$")), - "Line": lambda doc: _fragments(doc, re.compile(r"^[\n]$")), - "URL": lambda doc: (token for token in doc if token.orth_.like_url()), - "EmailAddress": lambda doc: ( - token for token in doc if token.orth_.like_email() - ), - } - - def filter_named_entity(label): - def generator(doc): - for ent in doc.ents: - if ent.label == label: - yield ent - - return generator - - def filter_pos(pos): - def generator(doc): - for token in doc: - if token.pos == pos: - yield token - - return generator - - for name, symbol in _symbols.items(): - forms[name] = filter_named_entity(symbol) - - for tag, names in _pos_tags.items(): - name, phrase_name = names - forms[name] = filter_pos(tag) - - return forms - - -# the following two may only be accessed after_WordNetBuiltin._load_wordnet has -# been called. 
- -_wordnet_pos_to_type = {} -_wordnet_type_to_pos = {} - -import nltk - - -def _init_nltk_maps(): - _wordnet_pos_to_type.update( - { - nltk.corpus.wordnet.VERB: "Verb", - nltk.corpus.wordnet.NOUN: "Noun", - nltk.corpus.wordnet.ADJ: "Adjective", - nltk.corpus.wordnet.ADJ_SAT: "Adjective", - nltk.corpus.wordnet.ADV: "Adverb", - } - ) - _wordnet_type_to_pos.update( - { - "Verb": [nltk.corpus.wordnet.VERB], - "Noun": [nltk.corpus.wordnet.NOUN], - "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], - "Adverb": [nltk.corpus.wordnet.ADV], - } - ) - - -from spacy.tokens import Span - -# Part of speech tags and their public interface names in Mathics -# see http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf -_pos_tags = { - spacy.parts_of_speech.ADJ: ("Adjective", ""), - spacy.parts_of_speech.ADP: ("Preposition", "Prepositional Phrase"), - spacy.parts_of_speech.ADV: ("Adverb", ""), - spacy.parts_of_speech.CONJ: ("Conjunct", ""), - spacy.parts_of_speech.DET: ("Determiner", ""), - spacy.parts_of_speech.INTJ: ("Interjection", ""), - spacy.parts_of_speech.NOUN: ("Noun", "Noun Phrase"), - spacy.parts_of_speech.NUM: ("Number", ""), - spacy.parts_of_speech.PART: ("Particle", ""), - spacy.parts_of_speech.PRON: ("Pronoun", ""), - spacy.parts_of_speech.PROPN: ("Proposition", ""), - spacy.parts_of_speech.PUNCT: ("Punctuation", ""), - spacy.parts_of_speech.SCONJ: ("Sconj", ""), - spacy.parts_of_speech.SYM: ("Symbol", ""), - spacy.parts_of_speech.VERB: ("Verb", "Verb Phrase"), - spacy.parts_of_speech.X: ("X", ""), - spacy.parts_of_speech.EOL: ("EOL", ""), - spacy.parts_of_speech.SPACE: ("Space", ""), -} - -# Mathics3 named entitiy names and their corresponding constants in spacy. -_symbols = { - "Person": spacy.symbols.PERSON, - "Company": spacy.symbols.ORG, - "Quantity": spacy.symbols.QUANTITY, - "Number": spacy.symbols.CARDINAL, - "CurrencyAmount": spacy.symbols.MONEY, - "Country": spacy.symbols.GPE, # also includes cities and states - "City": spacy.symbols.GPE, # also includes countries and states -} - -# forms are everything one can use in TextCases[] or TextPosition[]. 
-_forms = _make_forms() - - -def _merge_dictionaries(a, b): - c = a.copy() - c.update(b) - return c - - -def _position(t): - if isinstance(t, Span): - i = t.doc[t.start] - r = t.doc[t.end - 1] - return 1 + i.idx, r.idx + len(r.text) - else: - return 1 + t.idx, t.idx + len(t.text) - - -def _fragments(doc, sep): - start = 0 - for i, token in enumerate(doc): - if sep.match(token.text): - yield Span(doc, start, i) - start = i + 1 - end = len(doc) - if start < end: - yield Span(doc, start, end) - - -class _SpacyBuiltin(Builtin): - requires = ("spacy",) - - options = { - "Language": '"English"', - } - - messages = { - "runtime": "Spacy gave the following error: ``", - "lang": 'Language "`1`" is currently not supported with `2`[].', - } - - _language_codes = { - "English": "en", - "German": "de", - } - - _spacy_instances = {} - - def _load_spacy(self, evaluation: Evaluation, options: dict): - language_code = None - language_name = self.get_option(options, "Language", evaluation) - if language_name is None: - language_name = String("Undefined") - if isinstance(language_name, String): - language_code = _SpacyBuiltin._language_codes.get(language_name.value) - if not language_code: - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None - - instance = _SpacyBuiltin._spacy_instances.get(language_code) - if instance: - return instance - - try: - instance = spacy.load(f"{language_code}_core_web_md") - - # "via" parameter no longer exists. This was used in MATHICS3_SPACY_DATA - # if "MATHICS3_SPACY_DATA" in os.environ: - # instance = spacy.load( - # language_code, via=os.environ["MATHICS3_SPACY_DATA"] - # ) - # else: - # instance = spacy.load(f"{language_code}_core_web_md") - - _SpacyBuiltin._spacy_instances[language_code] = instance - return instance - except RuntimeError as e: - evaluation.message(self.get_name(), "runtime", str(e)) - return None - - def _nlp(self, text, evaluation, options) -> Optional[spacy.tokens.doc.Doc]: - nlp = self._load_spacy(evaluation, options) - if not nlp: - return None - return nlp(text) - - def _is_stop_lambda(self, evaluation: Evaluation, options: dict): - nlp = self._load_spacy(evaluation, options) - if not nlp: - return None - - vocab = nlp.vocab - - def is_stop(word): - return vocab[word].is_stop - - return is_stop - - -class WordCount(_SpacyBuiltin): - """ -
-
'WordCount[$string$]' -
returns the number of words in $string$. -
- - >> WordCount["A long time ago"] - = 4 - """ - - def eval(self, text, evaluation: Evaluation, options: dict): - "WordCount[text_String, OptionsPattern[WordCount]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - punctuation = spacy.parts_of_speech.PUNCT - return Integer(sum(1 for word in doc if word.pos != punctuation)) - - -class TextWords(_SpacyBuiltin): - """ -
-
'TextWords[$string$]' -
returns the words in $string$. - -
'TextWords[$string$, $n$]' -
returns the first $n$ words in $string$ -
- - >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."] - = {Hickory, dickory, dock, The, mouse, ran, up, the, clock} - """ - - def eval( - self, text: String, evaluation: Evaluation, options: dict - ) -> Optional[ListExpression]: - "TextWords[text_String, OptionsPattern[WordCount]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - punctuation = spacy.parts_of_speech.PUNCT - return ListExpression( - *[String(word.text) for word in doc if word.pos != punctuation], - ) - - def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): - "TextWords[text_String, n_Integer, OptionsPattern[TextWords]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - punctuation = spacy.parts_of_speech.PUNCT - return ListExpression( - *itertools.islice( - (String(word.text) for word in doc if word.pos != punctuation), - n.value, - ), - ) - - -class TextSentences(_SpacyBuiltin): - """ -
-
'TextSentences[$string$]' -
returns the sentences in $string$. - -
'TextSentences[$string$, $n$]' -
returns the first $n$ sentences in $string$ -
- - >> TextSentences["Night and day. Day and night."] - = {Night and day., Day and night.} - - >> TextSentences["Night and day. Day and night.", 1] - = {Night and day.} - - >> TextSentences["Mr. Jones met Mrs. Jones."] - = {Mr. Jones met Mrs. Jones.} - """ - - def eval(self, text: String, evaluation: Evaluation, options: dict): - "TextSentences[text_String, OptionsPattern[TextSentences]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return ListExpression(*[String(sent.text) for sent in doc.sents]) - - def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): - "TextSentences[text_String, n_Integer, OptionsPattern[TextSentences]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return ListExpression( - *itertools.islice((String(sent.text) for sent in doc.sents), n.value), - ) - - -class DeleteStopwords(_SpacyBuiltin): - """ -
-
'DeleteStopwords[$list$]' -
returns the words in $list$ without stopwords. - -
'DeleteStopwords[$string$]' -
returns $string$ without stopwords. -
- - ## This has changed since old versions of natlang, and I am - ## not sure the old behavior was correct. - ## >> DeleteStopwords[{"Somewhere", "over", "the", "rainbow"}] - ## = {rainbow} - - >> DeleteStopwords["There was an Old Man of Apulia, whose conduct was very peculiar"] - = Old Man Apulia, conduct peculiar - """ - - def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression: - "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]" - is_stop = self._is_stop_lambda(evaluation, options) - - def filter_words(words): - for w in words: - s = w.get_string_value() - if s is not None: - yield String(s) - elif is_stop is not None and is_stop(s) is not None: - yield String(s) - - return ListExpression(*list(filter_words(li.elements))) - - def eval_string(self, s: String, evaluation: Evaluation, options: dict): - "DeleteStopwords[s_String, OptionsPattern[DeleteStopwords]]" - doc = self._nlp(s.value, evaluation, options) - if doc: - is_stop = self._is_stop_lambda(evaluation, options) - if is_stop: - - def tokens(): - for token in doc: - if not is_stop(token.text): - yield token.text_with_ws - else: - yield token.whitespace_.strip() - - return String("".join(tokens())) - - -class WordFrequency(_SpacyBuiltin): - """ -
-
'WordFrequency[$text$, $word$]' -
returns the relative frequency of $word$ in $text$. -
- - $word$ may also specify multiple words using $a$ | $b$ | ... - - ## Problem with import for certain characters in the text. - ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; - >> text = "I have a dairy cow, it's not just any cow. \ - She gives me milkshake, oh what a salty cow. She is the best\ - cow in the county."; - - >> WordFrequency[text, "a" | "the"] - = 0.114286 - - >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True] - = 0.5 - """ - - options = _SpacyBuiltin.options - options.update({"IgnoreCase": "False"}) - - def eval( - self, text: String, word, evaluation: Evaluation, options: dict - ) -> Optional[Expression]: - "WordFrequency[text_String, word_, OptionsPattern[WordFrequency]]" - doc = self._nlp(text.value, evaluation, options) - if not doc: - return - if isinstance(word, String): - words = set([word.value]) - elif word.get_head_name() == "System`Alternatives": - if not all(isinstance(a, String) for a in word.elements): - return # error - words = set(a.value for a in word.elements) - else: - return # error - - ignore_case = self.get_option(options, "IgnoreCase", evaluation) is SymbolTrue - if ignore_case: - words = [w.lower() for w in words] - n = 0 - for token in doc: - token_text = token.text - if ignore_case: - token_text = token_text.lower() - if token_text in words: - n += 1 - return eval_N(Integer(n) / Integer(len(doc)), evaluation) - - -class Containing(Builtin): - pass - - -def _cases(doc, form): - if isinstance(form, String): - generators = [_forms.get(form.value)] - elif form.get_head_name() == "System`Alternatives": - if not all(isinstance(f, String) for f in form.elements): - return # error - generators = [_forms.get(f.value) for f in form.elements] - elif form.get_head_name() == "PyMathics`Containing": - if len(form.elements) == 2: - for t in _containing(doc, *form.elements): - yield t - return - else: - return # error - else: - return # error - - def try_next(iterator): - try: - return next(iterator) - except StopIteration: - return None - - feeds = [] - for i, iterator in enumerate([iter(generator(doc)) for generator in generators]): - t = try_next(iterator) - if t: - feeds.append((_position(t), i, t, iterator)) - heapq.heapify(feeds) - while feeds: - pos, i, token, iterator = heapq.heappop(feeds) - yield token - t = try_next(iterator) - if t: - heapq.heappush(feeds, (_position(t), i, t, iterator)) - - -def _containing(doc, outer, inner): - if not isinstance(outer, String): - return # error - outer_generator = _forms.get(outer.value) - inner_iter = _cases(doc, inner) - inner_start = None - produce_t = False - try: - for t in outer_generator(doc): - start, end = _position(t) - if inner_start is not None and inner_start < end: - produce_t = True - if produce_t: - yield t - produce_t = False - while True: - inner_start, inner_end = _position(next(inner_iter)) - if inner_end > start: - break - if inner_start < end: - produce_t = True - except StopIteration: - pass - - -class TextCases(_SpacyBuiltin): - """ -
-
'TextCases[$text$, $form$]' -
returns all elements of type $form$ in $text$ in order of their appearance. -
- - >> TextCases["I was in London last year.", "Pronoun"] - = {I} - - >> TextCases["I was in London last year.", "City"] - = {London} - - ## >> TextCases[Import["ExampleData/EinsteinSzilLetter.txt"], "Person", 3][[2;;3]] - ## = {L. Szilard, Joliot} - - >> TextCases["Anne, Peter and Mr Johnes say hello.", "Person", 3][[2;;3]] - = {Peter, Johnes} - - """ - - def eval_string_form( - self, text: String, form, evaluation: Evaluation, options: dict - ): - "TextCases[text_String, form_, OptionsPattern[TextCases]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list(*[t.text for t in _cases(doc, form)]) - - def eval_string_form_n( - self, text: String, form, n: Integer, evaluation: Evaluation, options: dict - ): - "TextCases[text_String, form_, n_Integer, OptionsPattern[TextCases]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list( - *itertools.islice((t.text for t in _cases(doc, form)), n.value) - ) - - -class TextPosition(_SpacyBuiltin): - """ -
-
'TextPosition[$text$, $form$]' -
returns the positions of elements of type $form$ in $text$ in order of their appearance. -
- - >> TextPosition["Liverpool and London are two English cities.", "City"] - = {{1, 9}, {15, 20}} - """ - - def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): - "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list(*[_position(t) for t in _cases(doc, form)]) - - def eval_text_form_n( - self, text: String, form, n: Integer, evaluation: Evaluation, options: dict - ): - "TextPosition[text_String, form_, n_Integer, OptionsPattern[TextPosition]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list( - *itertools.islice((_position(t) for t in _cases(doc, form)), n.value) - ) - - -class TextStructure(_SpacyBuiltin): - """ -
-
'TextStructure[$text$, $form$]' -
returns the grammatical structure of $text$ as $form$. -
- - >> TextStructure["The cat sat on the mat.", "ConstituentString"] - = {(Sentence, ((Verb Phrase, (Noun Phrase, (Determiner, The), (Noun, cat)), (Verb, sat), (Prepositional Phrase, (Preposition, on), (Noun Phrase, (Determiner, the), (Noun, mat))), (Punctuation, .))))} - """ - - _root_pos = set(i for i, names in _pos_tags.items() if names[1]) - - def _to_constituent_string(self, node): - token, children = node - name, phrase_name = _pos_tags.get(token.pos, ("Unknown", "Unknown Phrase")) - if not children: - return "(%s, %s)" % (name, token.text) - else: - sub = ", ".join( - self._to_constituent_string(next_node) for next_node in children - ) - return "(%s, %s)" % (phrase_name, sub) - - def _to_tree(self, tokens, path=[]): - roots = [] - i = 0 - while i < len(tokens): - token = tokens[i] - - if token in path: - roots.append((token, None)) - i += 1 - else: - root = token - while root.head != root and root.head not in path: - root = root.head - - sub = list(root.subtree) - - if root.pos not in self._root_pos: - roots.extend(self._to_tree(sub, path + [root])) - else: - roots.append((root, self._to_tree(sub, path + [root]))) - - i += len(sub) - - return roots - - def eval(self, text, evaluation: Evaluation, options: dict): - 'TextStructure[text_String, "ConstituentString", OptionsPattern[TextStructure]]' - doc = self._nlp(text.value, evaluation, options) - if doc: - tree = self._to_tree(list(doc)) - sents = ["(Sentence, (%s))" % self._to_constituent_string(x) for x in tree] - return to_mathics_list(*sents, elements_conversion_fn=String) - - -class WordSimilarity(_SpacyBuiltin): - """ -
-
'WordSimilarity[$text1$, $text2$]' -
returns a real-valued measure of semantic similarity of two texts or words. - -
'WordSimilarity[{$text1$, $i1$}, {$text2$, $j1$}]' -
returns a measure of similarity of two words within two texts. - -
'WordSimilarity[{$text1$, {$i1$, $i2$, ...}}, {$text2$, {$j1$, $j2$, ...}}]' -
returns a measure of similarity of multiple words within two texts. -
- - >> NumberForm[WordSimilarity["car", "train"], 3] - = 0.439 - - >> NumberForm[WordSimilarity["car", "hedgehog"], 3] - = 0.195 - - >> NumberForm[WordSimilarity[{"An ocean full of water.", {2, 2}}, { "A desert full of sand.", {2, 5}}], 3] - = {0.505, 0.481} - """ - - messages = _merge_dictionaries( - _SpacyBuiltin.messages, - { - "txtidx": "Index `1` in position `2` must be between 1 and `3`.", - "idxfmt": "Indices must be integers or lists of integers of the same length.", - }, - ) - - def eval( - self, text1: String, text2: String, evaluation: Evaluation, options: dict - ) -> Optional[Real]: - "WordSimilarity[text1_String, text2_String, OptionsPattern[WordSimilarity]]" - doc1 = self._nlp(text1.value, evaluation, options) - if doc1: - doc2 = self._nlp(text2.value, evaluation, options) - if doc2: - return Real(doc1.similarity(doc2)) - - def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict): - "WordSimilarity[{text1_String, i1_}, {text2_String, i2_}, OptionsPattern[WordSimilarity]]" - doc1 = self._nlp(text1.value, evaluation, options) - if doc1: - if text2.value == text1.value: - doc2 = doc1 - else: - doc2 = self._nlp(text2.value, evaluation, options) - if doc2: - if i1.get_head() is SymbolList and i2.get_head() is SymbolList: - if len(i1.elements) != len(i2.elements): - evaluation.message("TextSimilarity", "idxfmt") - return - if any( - not all(isinstance(i, Integer) for i in li.elements) - for li in (i1, i2) - ): - evaluation.message("TextSimilarity", "idxfmt") - return - indices1 = [i.value for i in i1.elements] - indices2 = [i.value for i in i2.elements] - multiple = True - elif isinstance(i1, Integer) and isinstance(i2, Integer): - indices1 = [i1.value] - indices2 = [i2.value] - multiple = False - else: - evaluation.message("TextSimilarity", "idxfmt") - return - - for index1, index2 in zip(indices1, indices2): - for i, pos, doc in zip((index1, index2), (1, 2), (doc1, doc2)): - if i < 1 or i > len(doc): - evaluation.message( - "TextSimilarity", "txtidx", i, pos, len(doc) - ) - return - - result = [ - Real(doc1[j1 - 1].similarity(doc2[j2 - 1])) - for j1, j2 in zip(indices1, indices2) - ] - - if multiple: - return ListExpression(*result) - else: - return result[0] - - -class WordStem(Builtin): - """ -
-
'WordStem[$word$]' -
returns a stemmed form of $word$, thereby reducing an inflected form to its root. - -
'WordStem[{$word1$, $word2$, ...}]' -
returns a stemmed form for list of $word$, thereby reducing an inflected form to its root. -
- - >> WordStem["towers"] - = tower - - >> WordStem[{"heroes", "roses", "knights", "queens"}] - = {hero, rose, knight, queen} - """ - - requires = ("nltk",) - - _stemmer = None - - @staticmethod - def _get_porter_stemmer(): - if WordStem._stemmer is None: - WordStem._stemmer = nltk.stem.porter.PorterStemmer() - return WordStem._stemmer - - @staticmethod - def porter(w): - return WordStem._get_porter_stemmer().stem(w) - - def eval(self, word: String, evaluation: Evaluation) -> String: - "WordStem[word_String]" - stemmer = self._get_porter_stemmer() - return String(stemmer.stem(word.value)) - - def eval_list(self, words, evaluation: Evaluation) -> Optional[ListExpression]: - "WordStem[words_List]" - if all(isinstance(w, String) for w in words.elements): - stemmer = self._get_porter_stemmer() - return ListExpression( - *[String(stemmer.stem(w.value)) for w in words.elements] - ) - - -class _WordNetBuiltin(Builtin): - requires = ("nltk",) - - options = { - "Language": '"English"', - } - - messages = { - "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", - "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', - # 'load': 'Loading `1` word data. Please wait.', - "wordnet": "WordNet returned the following error: ``", - } - - _wordnet_instances = {} - - def _language_name(self, evaluation: Evaluation, options: dict): - return self.get_option(options, "Language", evaluation) - - def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): - try: - wordnet_resource = nltk.data.find("corpora/wordnet2022") - _init_nltk_maps() - except LookupError: - evaluation.message(self.get_name(), "package", "wordnet2022") - return None - - try: - omw = nltk.corpus.util.LazyCorpusLoader( - "omw", - nltk.corpus.reader.CorpusReader, - r".*/wn-data-.*\.tab", - encoding="utf8", - ) - except LookupError: - evaluation.message(self.get_name(), "package", "omw") - return None - - wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) - - if language_code not in wordnet.langs(): - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None - - return wordnet - - def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: - language_code = None - if isinstance(language_name, String): - language_code = iso639_3.get(language_name.value) - if not language_code: - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None, None - - wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) - if not wordnet: - try: - wordnet = self._init_wordnet(evaluation, language_name, language_code) - except LookupError as e: - evaluation.message( - self.get_name(), "package", _parse_nltk_lookup_error(e) - ) - return None, None - - _WordNetBuiltin._wordnet_instances[language_code] = wordnet - - return wordnet, language_code - - @staticmethod - def _decode_synset(syn): - what, pos, nr = (syn.name().split(".") + ["01"])[:3] - return what.replace("_", " "), pos, nr - - @staticmethod - def _capitalize(s) -> str: - return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) - - @staticmethod - def _underscore(s) -> str: - return re.sub( - r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s - ).lower() - - @staticmethod - def _list_syn_form(syn): - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - - def containers(): - for name in syn.lemma_names(): - if name != what: 
- yield name - - for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): - container, _, _ = _WordNetBuiltin._decode_synset(s) - yield container - - for lemma in WordProperty._synonymous_lemmas(syn): - yield lemma.name() - - return what, _wordnet_pos_to_type[pos], containers - - @staticmethod - def syn(syn, wordnet, language_code) -> tuple: - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): - if s == syn: - return form - return what, pos, "Unknown" - - @staticmethod - def _iterate_senses(word, wordnet, language_code): - if not word: - return - - used = set() - output_word = word.replace("_", " ") - - for syn in wordnet.synsets(word, None, language_code): - if syn.lexname() in ("noun.location", "noun.person"): - continue # ignore - - what, pos, containers = _WordNetBuiltin._list_syn_form(syn) - - for container in containers(): - container = container.replace("_", " ") - if container != word: - if container not in used: - used.add(container) - yield syn, ( - output_word, - pos, - _WordNetBuiltin._capitalize(container), - ) - break - - def _senses(self, word, wordnet, language_code): - if isinstance(word, tuple): # find forms like ["tree", "Noun", "WoodyPlant"] - for syn, form in _WordNetBuiltin._iterate_senses( - word[0], wordnet, language_code - ): - if form == word: - return [[syn, form]] - else: # find word given as strings, e.g. "tree" - word = wordnet.morphy(word) # base form, e.g. trees -> tree - return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) - - -class WordDefinition(_WordNetBuiltin): - """ -
-
'WordDefinition[$word$]' -
returns a definition of $word$ or Missing["Available"] if $word$ is not known. -
- - >> WordDefinition["gram"] - = {a metric unit of weight equal to one thousandth of a kilogram} - """ - - def eval(self, word, evaluation: Evaluation, options: dict): - "WordDefinition[word_String, OptionsPattern[WordDefinition]]" - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if wordnet: - senses = self._senses(word.value.lower(), wordnet, language_code) - if senses: - return ListExpression(*[String(syn.definition()) for syn, _ in senses]) - else: - return Expression(SymbolMissing, StringNotAvailable) - - -class WordProperty: - def __init__(self, syn_form, wordnet, language_code): - self.syn_form = syn_form - self.wordnet = wordnet - self.language_code = language_code - - def syn(self, syn): - return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) - - @staticmethod - def _synonymous_lemmas(syn): - first_lemma = syn.name().split(".")[0] - return (s for s in syn.lemmas() if s.name() != first_lemma) - - @staticmethod - def _antonymous_lemmas(syn): - return (s for lemma in syn.lemmas() for s in lemma.antonyms()) - - def definitions(self, syn, desc): - return syn.definition() - - def examples(self, syn, desc): - return syn.examples() - - def synonyms(self, syn, desc): - _, pos, container = desc - return [ - self.syn_form((s.name().replace("_", " "), pos, container)) - for s in WordProperty._synonymous_lemmas(syn) - ] - - def antonyms(self, syn, desc): - return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] - - def broader_terms(self, syn, desc): - return [self.syn(s) for s in syn.hypernyms()] - - def narrower_terms(self, syn, desc): - return [self.syn(s) for s in syn.hyponyms()] - - def usage_field(self, syn, desc): - return syn.usage_domains() - - def whole_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_holonyms()] - - def part_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_meronyms()] - - def material_terms(self, syn, desc): - return [self.syn(s) for s in syn.substance_meronyms()] - - def word_net_id(self, syn, desc): - return syn.offset() - - def entailed_terms(self, syn, desc): # e.g. fall to condense - return [self.syn(s) for s in syn.entailments()] - - def causes_terms(self, syn, desc): # e.g. 
ignite to burn - return [self.syn(s) for s in syn.causes()] - - def inflected_forms(self, syn, desc): - try: - word, pos, _ = desc - if pos == "Verb": - from pattern.en import lexeme - - return [w for w in reversed(lexeme(word)) if w != word] - elif pos == "Noun": - from pattern.en import pluralize - - return [pluralize(word)] - elif pos == "Adjective": - from pattern.en import comparative, superlative - - return [comparative(word), superlative(word)] - else: - return [] - except ImportError: - raise MessageException( - "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" - ) - - -class _WordListBuiltin(_WordNetBuiltin): - _dictionary = {} - - def _words(self, language_name, ilk, evaluation): - wordnet, language_code = self._load_wordnet(evaluation, language_name) - - if not wordnet: - return - - key = "%s.%s" % (language_code, ilk) - words = self._dictionary.get(key) - if not words: - try: - if ilk == "All": - filtered_pos = [None] - else: - try: - filtered_pos = _wordnet_type_to_pos[ilk] - except KeyError: - evaluation.message( - self.get_name(), - "wordnet", - "type: %s is should be in %s" - % (ilk._wordnet_type_to_pos.keys()), - ) - return - - words = [] - for pos in filtered_pos: - words.extend(list(wordnet.all_lemma_names(pos, language_code))) - words.sort() - self._dictionary[key] = words - except nltk.corpus.reader.wordnet.WordNetError as err: - evaluation.message(self.get_name(), "wordnet", str(err)) - return - - return words - - -class WordData(_WordListBuiltin): - """ -
-
'WordData[$word$]' -
returns a list of possible senses of a word. - -
'WordData[$word$, $property$]' -
returns detailed information about a word regarding $property$, e.g. "Definitions" or "Examples". -
- - The following are valid properties: - - - >> WordData["riverside", "Definitions"] - = {{riverside, Noun, Bank} -> the bank of a river} - - >> WordData[{"fish", "Verb", "Angle"}, "Examples"] - = {{fish, Verb, Angle} -> {fish for compliments}} - """ - - messages = _merge_dictionaries( - _WordNetBuiltin.messages, - { - "notprop": "WordData[] does not recognize `1` as a valid property.", - }, - ) - - def _parse_word(self, word): - if isinstance(word, String): - return word.value.lower() - elif word.get_head_name() == "System`List": - if len(word.elements) == 3 and all( - isinstance(s, String) for s in word.elements - ): - return tuple(s.value for s in word.elements) - - def _standard_property( - self, py_word, py_form, py_property, wordnet, language_code, evaluation - ): - senses = self._senses(py_word, wordnet, language_code) - if not senses: - return Expression(SymbolMissing, StringNotAvailable) - elif py_form == "List": - word_property = WordProperty(self._short_syn_form, wordnet, language_code) - property_getter = getattr( - word_property, "%s" % self._underscore(py_property), None - ) - if property_getter: - return to_mathics_list( - *[property_getter(syn, desc) for syn, desc in senses] - ) - elif py_form in ("Rules", "ShortRules"): - syn_form = (lambda s: s) if py_form == "Rules" else (lambda s: s[0]) - word_property = WordProperty(syn_form, wordnet, language_code) - property_getter = getattr( - word_property, self._underscore(py_property), None - ) - if property_getter: - list_expr_elements = [ - to_expression(SymbolRule, desc, property_getter(syn, desc)) - for syn, desc in senses - ] - return to_mathics_list(*list_expr_elements) - evaluation.message(self.get_name(), "notprop", property) - - def _parts_of_speech(self, py_word, wordnet, language_code): - parts = set( - syn.pos() for syn, _ in self._senses(py_word, wordnet, language_code) - ) - if not parts: - return Expression(SymbolMissing, StringNotAvailable) - else: - return ListExpression( - *[String(s) for s in sorted([_wordnet_pos_to_type[p] for p in parts])] - ) - - def _property( - self, word, py_property, py_form, evaluation: Evaluation, options: dict - ): - if py_property == "PorterStem": - if isinstance(word, String): - return String(WordStem.porter(word.value)) - else: - return - - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if not wordnet: - return - - py_word = self._parse_word(word) - if not py_word: - return - - if py_property == "PartsOfSpeech": - return self._parts_of_speech(py_word, wordnet, language_code) - - try: - return self._standard_property( - py_word, py_form, py_property, wordnet, language_code, evaluation - ) - except MessageException as e: - e.message(evaluation) - - def eval(self, word, evaluation: Evaluation, options: dict) -> Optional[Expression]: - "WordData[word_, OptionsPattern[WordData]]" - if word.get_head() is SymbolStringExpression: - return Expression(SymbolDictionaryLookup, word) - elif isinstance(word, String) or word.get_head() is SymbolList: - pass - else: - return - - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if not wordnet: - return - - py_word = self._parse_word(word) - if not py_word: - return - - senses = self._senses(py_word, wordnet, language_code) - if senses is not None: - return ListExpression(*[[String(s) for s in desc] for syn, desc in senses]) - - def eval_property(self, word, property, evaluation: Evaluation, options: dict): - "WordData[word_, 
property_String, OptionsPattern[WordData]]" - if word.get_head is SymbolStringExpression: - if property.get_string_value() == "Lookup": - return Expression(SymbolDictionaryLookup, word) - elif isinstance(word, String) or word.get_head() is SymbolList: - return self._property( - word, property.get_string_value(), "ShortRules", evaluation, options - ) - - def eval_property_form( - self, word, property, form, evaluation: Evaluation, options: dict - ): - "WordData[word_, property_String, form_String, OptionsPattern[WordData]]" - if isinstance(word, String) or word.get_head() is SymbolList: - return self._property( - word, - property.value, - form.value, - evaluation, - options, - ) - - -class DictionaryWordQ(_WordNetBuiltin): - """ -
-
'DictionaryWordQ[$word$]' -
returns True if $word$ is a word usually found in dictionaries, and False otherwise. -
- - >> DictionaryWordQ["couch"] - = True - - >> DictionaryWordQ["meep-meep"] - = False - """ - - def eval(self, word, evaluation: Evaluation, options: dict): - "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" - if not isinstance(word, String): - return False - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if wordnet: - if list(wordnet.synsets(word.value.lower(), None, language_code)): - return SymbolTrue - else: - return SymbolFalse - - -class DictionaryLookup(_WordListBuiltin): - """ -
-
'DictionaryLookup[$word$]' -
lookup words that match the given $word$ or pattern. - -
'DictionaryLookup[$word$, $n$]' -
lookup first $n$ words that match the given $word$ or pattern. -
- - >> DictionaryLookup["bake" ~~ ___, 3] - = {bake, bakeapple, baked} - """ - - def compile(self, pattern, evaluation): - re_patt = to_regex(pattern, evaluation) - if re_patt is None: - evaluation.message( - "StringExpression", - "invld", - pattern, - Expression(SymbolStringExpression, pattern), - ) - return - re_patt = anchor_pattern(re_patt) - - return re.compile(re_patt, flags=re.IGNORECASE) - - def search(self, dictionary_words, pattern): - for dictionary_word in dictionary_words: - if pattern.match(dictionary_word): - yield dictionary_word.replace("_", " ") - - def lookup(self, language_name, word, n, evaluation): - pattern = self.compile(word, evaluation) - if pattern: - dictionary_words = self._words(language_name, "All", evaluation) - if dictionary_words is not None: - matches = self.search(dictionary_words, pattern) - if n is not None: - matches = itertools.islice(matches, 0, n) - return ListExpression(*(String(match) for match in sorted(matches))) - - def eval_english(self, word, evaluation): - "DictionaryLookup[word_]" - return self.lookup(String("English"), word, None, evaluation) - - def eval_language(self, language, word, evaluation): - "DictionaryLookup[{language_String, word_}]" - return self.lookup(language, word, None, evaluation) - - def eval_english_n(self, word, n, evaluation): - "DictionaryLookup[word_, n_Integer]" - return self.lookup(String("English"), word, n.value, evaluation) - - def eval_language_n(self, language, word, n, evaluation): - "DictionaryLookup[{language_String, word_}, n_Integer]" - return self.lookup(language, word, n.value, evaluation) - - -class WordList(_WordListBuiltin): - """ -
-
'WordList[]' -
returns a list of common words. - -
'WordList[$type$]' -
returns a list of common words of type $type$. -
- - >> N[Mean[StringLength /@ WordList["Adjective"]], 2] - = 9.3 - """ - - def eval(self, evaluation: Evaluation, options: dict): - "WordList[OptionsPattern[WordList]]" - words = self._words(self._language_name(evaluation, options), "All", evaluation) - if words is not None: - return to_mathics_list(*words, elements_conversion_fn=String) - - def eval_type(self, wordtype, evaluation: Evaluation, options: dict): - "WordList[wordtype_String, OptionsPattern[WordList]]" - words = self._words( - self._language_name(evaluation, options), - wordtype.value, - evaluation, - ) - if words is not None: - return to_mathics_list(*words, elements_conversion_fn=String) - - -class RandomWord(_WordListBuiltin): - """ -
-
'RandomWord[]' -
returns a random word. - -
'RandomWord[$type$]' -
returns a random word of the given $type$, e.g. of type "Noun" or "Adverb". - -
'RandomWord[$type$, $n$]' -
returns $n$ random words of the given $type$. -
- """ - - def _random_words(self, type, n, evaluation: Evaluation, options: dict): - words = self._words(self._language_name(evaluation, options), type, evaluation) - if words is not None: - with RandomEnv(evaluation) as rand: - return [ - String(words[rand.randint(0, len(words) - 1)].replace("_", " ")) - for _ in range(n) - ] - - def eval(self, evaluation: Evaluation, options: dict): - "RandomWord[OptionsPattern[RandomWord]]" - words = self._random_words("All", 1, evaluation, options) - if words: - return words[0] - - def eval_type(self, type, evaluation: Evaluation, options: dict): - "RandomWord[type_String, OptionsPattern[RandomWord]]" - words = self._random_words(type.value, 1, evaluation, options) - if words: - return words[0] - - def eval_type_n(self, type, n, evaluation: Evaluation, options: dict): - "RandomWord[type_String, n_Integer, OptionsPattern[RandomWord]]" - words = self._random_words(type.value, n.value, evaluation, options) - if words: - return ListExpression(*words) - - -class LanguageIdentify(Builtin): - """ -
-
'LanguageIdentify[$text$]' -
returns the name of the language used in $text$. -
- - >> LanguageIdentify["eins zwei drei"] - = German - """ - - def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: - "LanguageIdentify[text_String]" - - # an alternative: https://github.com/Mimino666/langdetect - - code, _ = langid.classify(text.value) - language = pycountry.languages.get(alpha_2=code) - if language is None: - return SymbolFailed - return String(language.name) - - -class Pluralize(Builtin): - """ -
-
'Pluralize[$word$]' -
returns the plural form of $word$. -
- - >> Pluralize["potato"] - = potatoes - """ - - requires = ("pattern",) - - def eval(self, word, evaluation): - "Pluralize[word_String]" - - return String(pluralize(word.value)) - - -class SpellingCorrectionList(Builtin): - """ -
-
'SpellingCorrectionList[$word$]' -
returns a list of suggestions for spelling corrected versions of $word$. -
- - Results may differ depending on which dictionaries can be found by enchant. - - >> SpellingCorrectionList["hipopotamus"] - = {hippopotamus...} - """ - - options = { - "Language": '"English"', - } - - messages = { - "lang": "SpellingCorrectionList does not support `1` as a language.", - } - - _languages = { - "English": "en_US", # en_GB, en_AU - "German": "de_DE", - "French": "fr_FR", - } - - _dictionaries = {} - - def eval( - self, word: String, evaluation: Evaluation, options: dict - ) -> Optional[ListExpression]: - "SpellingCorrectionList[word_String, OptionsPattern[SpellingCorrectionList]]" - - language_name = self.get_option(options, "Language", evaluation) - if not isinstance(language_name, String): - return - language_code = SpellingCorrectionList._languages.get(language_name.value, None) - if not language_code: - evaluation.message("SpellingCorrectionList", "lang", language_name) - return - - d = SpellingCorrectionList._dictionaries.get(language_code, None) - if not d: - d = enchant.Dict(language_code) - SpellingCorrectionList._dictionaries[language_code] = d - - py_word = word.value - - if d.check(py_word): - return ListExpression(word) - else: - return to_mathics_list(*d.suggest(py_word), elements_conversion_fn=String) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py new file mode 100644 index 0000000..17d7e31 --- /dev/null +++ b/pymathics/natlang/normalization.py @@ -0,0 +1,301 @@ +""" + +Text normalization + +""" +import itertools +from itertools import islice +from typing import Optional + +import spacy +from mathics.core.atoms import Integer, String +from mathics.core.convert.python import from_python +from mathics.core.evaluation import Evaluation +from mathics.core.list import ListExpression + +from pymathics.natlang.spacy import _cases, _pos_tags, _position, _SpacyBuiltin + + +class DeleteStopwords(_SpacyBuiltin): + """ + Delete :stop words:https://en.wikipedia.org/wiki/Stop_word(\ + :WMA: + https://reference.wolfram.com/language/ref/DeleteStopwords.html\ + ) + +
+
'DeleteStopwords[$list$]' +
returns the words in $list$ without stopwords. + +
'DeleteStopwords[$string$]' +
returns $string$ without stopwords. +
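+
The stopword list comes from the loaded spaCy language model, so results depend on the model and on the 'Language' option.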
+ + ## This has changed since old versions of natlang, and I am + ## not sure the old behavior was correct. + >> DeleteStopwords[{"Somewhere", "over", "the", "rainbow"}] + = ... + ## = {rainbow} + + >> DeleteStopwords["There was an Old Man of Apulia, whose conduct was very peculiar"] + = Old Man Apulia, conduct peculiar + """ + + summary_text = "Remove stopwords from a text" + + def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression: + "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]" + is_stop = self._is_stop_lambda(evaluation, options) + + def filter_words(words): + for w in words: + s = w.get_string_value() + if s is not None: + yield String(s) + elif is_stop is not None and is_stop(s) is not None: + yield String(s) + + return ListExpression(*list(filter_words(li.elements))) + + def eval_string(self, s: String, evaluation: Evaluation, options: dict): + "DeleteStopwords[s_String, OptionsPattern[DeleteStopwords]]" + doc = self._nlp(s.value, evaluation, options) + if doc: + is_stop = self._is_stop_lambda(evaluation, options) + if is_stop: + + def tokens(): + for token in doc: + if not is_stop(token.text): + yield token.text_with_ws + else: + yield token.whitespace_.strip() + + return String("".join(tokens())) + + +class TextCases(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextCases.html + +
+
'TextCases[$text$, $form$]' +
returns all elements of type $form$ in $text$ in order of their appearance. +
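+
'TextCases[$text$, $form$, $n$]'
returns the first $n$ elements of type $form$ in $text$.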
+ + >> TextCases["I was in London last year.", "Pronoun"] + = {I} + + >> TextCases["I was in London last year.", "City"] + = {London} + + ## >> TextCases[Import["ExampleData/EinsteinSzilLetter.txt"], "Person", 3][[2;;3]] + ## = {L. Szilard, Joliot} + + >> TextCases["Anne, Peter and Mr Johnes say hello.", "Person", 3][[2;;3]] + = {Peter, Johnes} + + """ + + summary_text = "List the cases of words of a certain form in a text" + + def eval_string_form( + self, text: String, form, evaluation: Evaluation, options: dict + ): + "TextCases[text_String, form_, OptionsPattern[TextCases]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression(*[String(t.text) for t in _cases(doc, form)]) + + def eval_string_form_n( + self, text: String, form, n: Integer, evaluation: Evaluation, options: dict + ): + "TextCases[text_String, form_, n_Integer, OptionsPattern[TextCases]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + items = islice((t.text for t in _cases(doc, form)), n.value) + return ListExpression(*(from_python(item) for item in items)) + + +class TextPosition(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextPosition.html + +
+    <dl>
+      <dt>'TextPosition[$text$, $form$]'
+      <dd>returns the positions of elements of type $form$ in $text$ in order of their appearance.
+    </dl>
+ + >> TextPosition["Liverpool and London are two English cities.", "City"] + = {{1, 9}, {15, 20}} + """ + + summary_text = "List the position of words of a given form in a text" + + def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): + "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression( + *[from_python(_position(t)) for t in _cases(doc, form)] + ) + + def eval_text_form_n( + self, text: String, form, n: Integer, evaluation: Evaluation, options: dict + ): + "TextPosition[text_String, form_, n_Integer, OptionsPattern[TextPosition]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + items = islice((_position(t) for t in _cases(doc, form)), n.value) + return ListExpression(*(from_python(item) for item in items)) + + +class TextSentences(_SpacyBuiltin): + """ + :Sentences:https://en.wikipedia.org/wiki/Sentence_(linguistics)\ + in a text (\ + :WMA: + https://reference.wolfram.com/language/ref/TextSentences.html\ + ) + + +
+    <dl>
+      <dt>'TextSentences[$string$]'
+      <dd>returns the sentences in $string$.
+
+      <dt>'TextSentences[$string$, $n$]'
+      <dd>returns the first $n$ sentences in $string$.
+    </dl>
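Sentence splitting is delegated to spaCy's doc.sents iterator; a sketch, again assuming the en_core_web_md model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("Night and day. Day and night.")
    print([sent.text for sent in doc.sents])
    # ['Night and day.', 'Day and night.']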
+ + >> TextSentences["Night and day. Day and night."] + = {Night and day., Day and night.} + + >> TextSentences["Night and day. Day and night.", 1] + = {Night and day.} + + >> TextSentences["Mr. Jones met Mrs. Jones."] + = {Mr. Jones met Mrs. Jones.} + """ + + summary_text = "list the sentences in a text" + + def eval(self, text: String, evaluation: Evaluation, options: dict): + "TextSentences[text_String, OptionsPattern[TextSentences]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression(*[String(sent.text) for sent in doc.sents]) + + def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): + "TextSentences[text_String, n_Integer, OptionsPattern[TextSentences]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression( + *itertools.islice((String(sent.text) for sent in doc.sents), n.value), + ) + + +class TextStructure(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextStructure.html + +
+    <dl>
+      <dt>'TextStructure[$text$, $form$]'
+      <dd>returns the grammatical structure of $text$ as $form$.
+    </dl>
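The constituent string is assembled by climbing spaCy's dependency heads (see _to_tree below). The raw material it works from looks like this, assuming the usual English model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    for token in nlp("The cat sat on the mat."):
        # _to_tree groups each token under the head it hangs off.
        print(token.text, token.pos_, "<- head:", token.head.text)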
+ + >> TextStructure["The cat sat on the mat.", "ConstituentString"] + = {(Sentence, ((Verb Phrase, (Noun Phrase, (Determiner, The), (Noun, cat)), (Verb, sat), (Prepositional Phrase, (Preposition, on), (Noun Phrase, (Determiner, the), (Noun, mat))), (Punctuation, .))))} + """ + + _root_pos = set(i for i, names in _pos_tags.items() if names[1]) + summary_text = "Retrieve the grammatical structure of a text" + + def _to_constituent_string(self, node): + token, children = node + name, phrase_name = _pos_tags.get(token.pos, ("Unknown", "Unknown Phrase")) + if not children: + return "(%s, %s)" % (name, token.text) + else: + sub = ", ".join( + self._to_constituent_string(next_node) for next_node in children + ) + return "(%s, %s)" % (phrase_name, sub) + + def _to_tree(self, tokens, path=[]): + roots = [] + i = 0 + while i < len(tokens): + token = tokens[i] + + if token in path: + roots.append((token, None)) + i += 1 + else: + root = token + while root.head != root and root.head not in path: + root = root.head + + sub = list(root.subtree) + + if root.pos not in self._root_pos: + roots.extend(self._to_tree(sub, path + [root])) + else: + roots.append((root, self._to_tree(sub, path + [root]))) + + i += len(sub) + + return roots + + def eval(self, text, evaluation: Evaluation, options: dict): + 'TextStructure[text_String, "ConstituentString", OptionsPattern[TextStructure]]' + doc = self._nlp(text.value, evaluation, options) + if doc: + tree = self._to_tree(list(doc)) + sents = ["(Sentence, (%s))" % self._to_constituent_string(x) for x in tree] + return ListExpression(*(String(sent) for sent in sents)) + + +class TextWords(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextWords.html + +
+    <dl>
+      <dt>'TextWords[$string$]'
+      <dd>returns the words in $string$.
+
+      <dt>'TextWords[$string$, $n$]'
+      <dd>returns the first $n$ words in $string$.
+    </dl>
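Word extraction is token iteration with punctuation filtered out, using the same spacy.parts_of_speech.PUNCT constant as the eval methods below; 'WordCount' further down is simply the length of this list. A sketch, assuming the en_core_web_md model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("Hickory, dickory, dock!")
    words = [t.text for t in doc if t.pos != spacy.parts_of_speech.PUNCT]
    print(words, len(words))  # ['Hickory', 'dickory', 'dock'] 3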
+ + >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."] + = {Hickory, dickory, dock, The, mouse, ran, up, the, clock} + """ + + summary_text = "list the words in a string" + + def eval( + self, text: String, evaluation: Evaluation, options: dict + ) -> Optional[ListExpression]: + "TextWords[text_String, OptionsPattern[WordCount]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + punctuation = spacy.parts_of_speech.PUNCT + return ListExpression( + *[String(word.text) for word in doc if word.pos != punctuation], + ) + + def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): + "TextWords[text_String, n_Integer, OptionsPattern[TextWords]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + punctuation = spacy.parts_of_speech.PUNCT + return ListExpression( + *itertools.islice( + (String(word.text) for word in doc if word.pos != punctuation), + n.value, + ), + ) diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py new file mode 100644 index 0000000..851d13f --- /dev/null +++ b/pymathics/natlang/spacy.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- +# FIXME: split this up into smaller pieces + +""" +Spacy tools + +""" +import heapq +import re +from typing import Optional + +import spacy +from mathics.builtin.base import Builtin +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import strip_context +from spacy.tokens import Span + +no_doc = True + +# Mathics3 named entitiy names and their corresponding constants in spacy. +symbols = { + "Person": spacy.symbols.PERSON, + "Company": spacy.symbols.ORG, + "Quantity": spacy.symbols.QUANTITY, + "Number": spacy.symbols.CARDINAL, + "CurrencyAmount": spacy.symbols.MONEY, + "Country": spacy.symbols.GPE, # also includes cities and states + "City": spacy.symbols.GPE, # also includes countries and states +} + +# Part of speech tags and their public interface names in Mathics +# see http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf +_pos_tags = { + spacy.parts_of_speech.ADJ: ("Adjective", ""), + spacy.parts_of_speech.ADP: ("Preposition", "Prepositional Phrase"), + spacy.parts_of_speech.ADV: ("Adverb", ""), + spacy.parts_of_speech.CONJ: ("Conjunct", ""), + spacy.parts_of_speech.DET: ("Determiner", ""), + spacy.parts_of_speech.INTJ: ("Interjection", ""), + spacy.parts_of_speech.NOUN: ("Noun", "Noun Phrase"), + spacy.parts_of_speech.NUM: ("Number", ""), + spacy.parts_of_speech.PART: ("Particle", ""), + spacy.parts_of_speech.PRON: ("Pronoun", ""), + spacy.parts_of_speech.PROPN: ("Proposition", ""), + spacy.parts_of_speech.PUNCT: ("Punctuation", ""), + spacy.parts_of_speech.SCONJ: ("Sconj", ""), + spacy.parts_of_speech.SYM: ("Symbol", ""), + spacy.parts_of_speech.VERB: ("Verb", "Verb Phrase"), + spacy.parts_of_speech.X: ("X", ""), + spacy.parts_of_speech.EOL: ("EOL", ""), + spacy.parts_of_speech.SPACE: ("Space", ""), +} + + +def _cases(doc, form): + if isinstance(form, String): + generators = [_forms.get(form.value)] + elif form.get_head_name() == "System`Alternatives": + if not all(isinstance(f, String) for f in form.elements): + return # error + generators = [_forms.get(f.value) for f in form.elements] + elif form.get_head_name() == "PyMathics`Containing": + if len(form.elements) == 2: + for t in _containing(doc, *form.elements): + yield t + return + else: + return # error + else: + return # error + + def try_next(iterator): + try: + return next(iterator) + except StopIteration: + return None + + feeds = 
[] + for i, iterator in enumerate([iter(generator(doc)) for generator in generators]): + t = try_next(iterator) + if t: + feeds.append((_position(t), i, t, iterator)) + heapq.heapify(feeds) + while feeds: + pos, i, token, iterator = heapq.heappop(feeds) + yield token + t = try_next(iterator) + if t: + heapq.heappush(feeds, (_position(t), i, t, iterator)) + + +def _containing(doc, outer, inner): + if not isinstance(outer, String): + return # error + outer_generator = _forms.get(outer.value) + inner_iter = _cases(doc, inner) + inner_start = None + produce_t = False + try: + for t in outer_generator(doc): + start, end = _position(t) + if inner_start is not None and inner_start < end: + produce_t = True + if produce_t: + yield t + produce_t = False + while True: + inner_start, inner_end = _position(next(inner_iter)) + if inner_end > start: + break + if inner_start < end: + produce_t = True + except StopIteration: + pass + + +def _fragments(doc, sep): + start = 0 + for i, token in enumerate(doc): + if sep.match(token.text): + yield Span(doc, start, i) + start = i + 1 + end = len(doc) + if start < end: + yield Span(doc, start, end) + + +def _make_forms(): + forms = { + "Word": lambda doc: (token for token in doc), + "Sentence": lambda doc: (sent for sent in doc.sents), + "Paragraph": lambda doc: _fragments(doc, re.compile(r"^[\n][\n]+$")), + "Line": lambda doc: _fragments(doc, re.compile(r"^[\n]$")), + "URL": lambda doc: (token for token in doc if token.orth_.like_url()), + "EmailAddress": lambda doc: ( + token for token in doc if token.orth_.like_email() + ), + } + + def filter_named_entity(label): + def generator(doc): + for ent in doc.ents: + if ent.label == label: + yield ent + + return generator + + def filter_pos(pos): + def generator(doc): + for token in doc: + if token.pos == pos: + yield token + + return generator + + for name, symbol in symbols.items(): + forms[name] = filter_named_entity(symbol) + + for tag, names in _pos_tags.items(): + name, phrase_name = names + forms[name] = filter_pos(tag) + + return forms + + +# forms are everything one can use in TextCases[] or TextPosition[]. +_forms = _make_forms() + + +def _position(t): + if isinstance(t, Span): + i = t.doc[t.start] + r = t.doc[t.end - 1] + return 1 + i.idx, r.idx + len(r.text) + else: + return 1 + t.idx, t.idx + len(t.text) + + +class _SpacyBuiltin(Builtin): + requires = ("spacy",) + + options = { + "Language": '"English"', + } + + messages = { + "runtime": "Spacy gave the following error: ``", + "lang": 'Language "`1`" is currently not supported with `2`[].', + } + + _language_codes = { + "English": "en", + "German": "de", + } + + _spacy_instances = {} + + def _load_spacy(self, evaluation: Evaluation, options: dict): + language_code = None + language_name = self.get_option(options, "Language", evaluation) + if language_name is None: + language_name = String("Undefined") + if isinstance(language_name, String): + language_code = _SpacyBuiltin._language_codes.get(language_name.value) + if not language_code: + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None + + instance = _SpacyBuiltin._spacy_instances.get(language_code) + if instance: + return instance + + try: + instance = spacy.load(f"{language_code}_core_web_md") + + # "via" parameter no longer exists. 
This was used in MATHICS3_SPACY_DATA + # if "MATHICS3_SPACY_DATA" in os.environ: + # instance = spacy.load( + # language_code, via=os.environ["MATHICS3_SPACY_DATA"] + # ) + # else: + # instance = spacy.load(f"{language_code}_core_web_md") + + _SpacyBuiltin._spacy_instances[language_code] = instance + return instance + except RuntimeError as e: + evaluation.message(self.get_name(), "runtime", str(e)) + return None + + def _nlp(self, text, evaluation, options) -> Optional[spacy.tokens.doc.Doc]: + nlp = self._load_spacy(evaluation, options) + if not nlp: + return None + return nlp(text) + + def _is_stop_lambda(self, evaluation: Evaluation, options: dict): + nlp = self._load_spacy(evaluation, options) + if not nlp: + return None + + vocab = nlp.vocab + + def is_stop(word): + return vocab[word].is_stop + + return is_stop diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py new file mode 100644 index 0000000..87c7f57 --- /dev/null +++ b/pymathics/natlang/textual_analysis.py @@ -0,0 +1,427 @@ +# -*- coding: utf-8 -*- +""" +Text analysis functions + +:See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html + +""" + +import re +from itertools import islice +from typing import Optional + +import enchant +import nltk +import spacy +from mathics.builtin.atomic.strings import anchor_pattern, to_regex +from mathics.builtin.base import Builtin +from mathics.core.atoms import Integer, Real, String +from mathics.core.evaluation import Evaluation +from mathics.core.expression import Expression +from mathics.core.list import ListExpression +from mathics.core.symbols import SymbolFalse, SymbolList, SymbolTrue +from mathics.core.systemsymbols import SymbolStringExpression +from mathics.eval.nevaluator import eval_N + +from pymathics.natlang.spacy import _SpacyBuiltin +from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries + + +class Containing(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/Containing.html + +
+    <dl>
+      <dt>'Containing[$outer$, $inner$]'
+      <dd>represents an object of the type $outer$ containing objects\
+      of type $inner$.
+    </dl>
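_containing in spacy.py implements this by interleaving outer and inner matches on their character positions; a simplified equivalent of that walk, assuming the usual English model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("This is a pencil. This is another pencil from England.")
    # Keep each outer span (sentence) that overlaps an inner match (GPE entity):
    inner = [(e.start_char, e.end_char) for e in doc.ents if e.label_ == "GPE"]
    print([s.text for s in doc.sents
           if any(s.start_char <= a < s.end_char for a, _ in inner)])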
+ + """ + + summary_text = "Specify a container for matching" + + +class DictionaryLookup(_WordListBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/DictionaryLookup.html + +
+    <dl>
+      <dt>'DictionaryLookup[$word$]'
+      <dd>looks up words that match the given $word$ or pattern.
+
+      <dt>'DictionaryLookup[$word$, $n$]'
+      <dd>looks up the first $n$ words that match the given $word$ or pattern.
+    </dl>
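The lookup compiles the string pattern to an anchored, case-insensitive regex and scans WordNet's lemma names, where underscores stand in for spaces. Approximately, assuming the NLTK wordnet corpus has been downloaded via nltk.download():

    import re
    from nltk.corpus import wordnet

    pattern = re.compile(r"bake.*\Z", re.IGNORECASE)
    matches = sorted(name.replace("_", " ")
                     for name in wordnet.all_lemma_names()
                     if pattern.match(name))
    print(matches[:3])  # e.g. ['bake', 'bakeapple', 'baked']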
+ + >> DictionaryLookup["bake" ~~ ___, 3] + = {bake, bakeapple, baked} + """ + + summary_text = "Lookup words matching a pattern in a dictionary" + + def compile(self, pattern, evaluation): + re_patt = to_regex(pattern, evaluation) + if re_patt is None: + evaluation.message( + "StringExpression", + "invld", + pattern, + Expression(SymbolStringExpression, pattern), + ) + return + re_patt = anchor_pattern(re_patt) + + return re.compile(re_patt, flags=re.IGNORECASE) + + def search(self, dictionary_words, pattern): + for dictionary_word in dictionary_words: + if pattern.match(dictionary_word): + yield dictionary_word.replace("_", " ") + + def lookup(self, language_name, word, n, evaluation): + pattern = self.compile(word, evaluation) + if pattern: + dictionary_words = self._words(language_name, "All", evaluation) + if dictionary_words is not None: + matches = self.search(dictionary_words, pattern) + if n is not None: + matches = islice(matches, 0, n) + return ListExpression(*(String(match) for match in sorted(matches))) + + def eval_english(self, word, evaluation): + "DictionaryLookup[word_]" + return self.lookup(String("English"), word, None, evaluation) + + def eval_language(self, language, word, evaluation): + "DictionaryLookup[{language_String, word_}]" + return self.lookup(language, word, None, evaluation) + + def eval_english_n(self, word, n, evaluation): + "DictionaryLookup[word_, n_Integer]" + return self.lookup(String("English"), word, n.value, evaluation) + + def eval_language_n(self, language, word, n, evaluation): + "DictionaryLookup[{language_String, word_}, n_Integer]" + return self.lookup(language, word, n.value, evaluation) + + +class DictionaryWordQ(_WordNetBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/DictionaryWordQ.html + +
+    <dl>
+      <dt>'DictionaryWordQ[$word$]'
+      <dd>returns 'True' if $word$ is a word usually found in dictionaries, and 'False' otherwise.
+    </dl>
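Stripped of the Language option handling done by _WordNetBuiltin, the test is just "does WordNet know any synset for this word?"; dictionary_word_q is an illustrative helper, not part of the module:

    from nltk.corpus import wordnet  # assumes the wordnet corpus is installed

    def dictionary_word_q(word: str) -> bool:
        return bool(wordnet.synsets(word.lower()))

    print(dictionary_word_q("couch"), dictionary_word_q("meep-meep"))  # True False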
+ + >> DictionaryWordQ["couch"] + = True + + >> DictionaryWordQ["meep-meep"] + = False + """ + + summary_text = "Check if a word is in the dictionary" + + def eval(self, word, evaluation: Evaluation, options: dict): + "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" + if not isinstance(word, String): + return False + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if wordnet: + if list(wordnet.synsets(word.value.lower(), None, language_code)): + return SymbolTrue + else: + return SymbolFalse + + +class SpellingCorrectionList(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/SpellingCorrectionList.html + +
+    <dl>
+      <dt>'SpellingCorrectionList[$word$]'
+      <dd>returns a list of suggestions for spelling-corrected versions of $word$.
+    </dl>
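In plain pyenchant, the eval method below roughly amounts to:

    import enchant

    d = enchant.Dict("en_US")  # language codes as in the _languages table below
    word = "hipopotamus"
    print([word] if d.check(word) else d.suggest(word))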
+ + Results may differ depending on which dictionaries can be found by enchant. + + >> SpellingCorrectionList["hipopotamus"] + = {hippopotamus...} + """ + + options = { + "Language": '"English"', + } + + messages = { + "lang": "SpellingCorrectionList does not support `1` as a language.", + } + + _languages = { + "English": "en_US", # en_GB, en_AU + "German": "de_DE", + "French": "fr_FR", + } + + _dictionaries = {} + + summary_text = "Look for spelling correction candidates of a word" + + def eval( + self, word: String, evaluation: Evaluation, options: dict + ) -> Optional[ListExpression]: + "SpellingCorrectionList[word_String, OptionsPattern[SpellingCorrectionList]]" + + language_name = self.get_option(options, "Language", evaluation) + if not isinstance(language_name, String): + return + language_code = SpellingCorrectionList._languages.get(language_name.value, None) + if not language_code: + evaluation.message("SpellingCorrectionList", "lang", language_name) + return + + d = SpellingCorrectionList._dictionaries.get(language_code, None) + if not d: + d = enchant.Dict(language_code) + SpellingCorrectionList._dictionaries[language_code] = d + + py_word = word.value + + if d.check(py_word): + return ListExpression(word) + else: + return ListExpression(*(String(word) for word in d.suggest(py_word))) + + +class WordCount(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordCount.html + +
+    <dl>
+      <dt>'WordCount[$string$]'
+      <dd>returns the number of words in $string$.
+    </dl>
+ + >> WordCount["A long time ago"] + = 4 + """ + + summary_text = "Count the words in a text" + + def eval(self, text, evaluation: Evaluation, options: dict): + "WordCount[text_String, OptionsPattern[WordCount]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + punctuation = spacy.parts_of_speech.PUNCT + return Integer(sum(1 for word in doc if word.pos != punctuation)) + + +class WordFrequency(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordFrequency.html + +
+    <dl>
+      <dt>'WordFrequency[$text$, $word$]'
+      <dd>returns the relative frequency of $word$ in $text$.
+    </dl>
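The frequency is a case-normalized token count divided by the total token count; a sketch under the same model assumption as the other spaCy examples:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("Apple tree. Apple.")
    hits = sum(1 for t in doc if t.text.lower() == "apple")
    print(hits / len(doc))  # matches divided by all tokens, punctuation included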
+ + $word$ may also specify multiple words using $a$ | $b$ | ... + + ## Problem with import for certain characters in the text. + ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; + >> text = "I have a dairy cow, it's not just any cow. \ + She gives me milkshake, oh what a salty cow. She is the best\ + cow in the county."; + + >> WordFrequency[text, "a" | "the"] + = 0.114286 + + >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True] + = 0.5 + """ + + options = _SpacyBuiltin.options + options.update({"IgnoreCase": "False"}) + summary_text = "Retrieve the frequency of a word in a text" + + def eval( + self, text: String, word, evaluation: Evaluation, options: dict + ) -> Optional[Expression]: + "WordFrequency[text_String, word_, OptionsPattern[WordFrequency]]" + doc = self._nlp(text.value, evaluation, options) + if not doc: + return + if isinstance(word, String): + words = set([word.value]) + elif word.get_head_name() == "System`Alternatives": + if not all(isinstance(a, String) for a in word.elements): + return # error + words = set(a.value for a in word.elements) + else: + return # error + + ignore_case = self.get_option(options, "IgnoreCase", evaluation) is SymbolTrue + if ignore_case: + words = [w.lower() for w in words] + n = 0 + for token in doc: + token_text = token.text + if ignore_case: + token_text = token_text.lower() + if token_text in words: + n += 1 + return eval_N(Integer(n) / Integer(len(doc)), evaluation) + + +class WordSimilarity(_SpacyBuiltin): + """ + + :WMA: + https://reference.wolfram.com/language/ref/WordSimilarity.html + +
+    <dl>
+      <dt>'WordSimilarity[$text1$, $text2$]'
+      <dd>returns a real-valued measure of semantic similarity of two texts or words.
+
+      <dt>'WordSimilarity[{$text1$, $i1$}, {$text2$, $j1$}]'
+      <dd>returns a measure of similarity of two words within two texts.
+
+      <dt>'WordSimilarity[{$text1$, {$i1$, $i2$, ...}}, {$text2$, {$j1$, $j2$, ...}}]'
+      <dd>returns a measure of similarity of multiple words within two texts.
+    </dl>
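The measure is spaCy's document-vector cosine, which needs a model shipped with word vectors (the _md models qualify); in plain spaCy:

    import spacy

    nlp = spacy.load("en_core_web_md")
    print(nlp("car").similarity(nlp("train")))  # roughly 0.44 with this model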
+ + >> NumberForm[WordSimilarity["car", "train"], 3] + = 0.439 + + >> NumberForm[WordSimilarity["car", "hedgehog"], 3] + = 0.195 + + >> NumberForm[WordSimilarity[{"An ocean full of water.", {2, 2}}, { "A desert full of sand.", {2, 5}}], 3] + = {0.505, 0.481} + """ + + messages = merge_dictionaries( + _SpacyBuiltin.messages, + { + "txtidx": "Index `1` in position `2` must be between 1 and `3`.", + "idxfmt": "Indices must be integers or lists of integers of the same length.", + }, + ) + summary_text = "Measure the similarity of two texts" + + def eval( + self, text1: String, text2: String, evaluation: Evaluation, options: dict + ) -> Optional[Real]: + "WordSimilarity[text1_String, text2_String, OptionsPattern[WordSimilarity]]" + doc1 = self._nlp(text1.value, evaluation, options) + if doc1: + doc2 = self._nlp(text2.value, evaluation, options) + if doc2: + return Real(doc1.similarity(doc2)) + + def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict): + "WordSimilarity[{text1_String, i1_}, {text2_String, i2_}, OptionsPattern[WordSimilarity]]" + doc1 = self._nlp(text1.value, evaluation, options) + if doc1: + if text2.value == text1.value: + doc2 = doc1 + else: + doc2 = self._nlp(text2.value, evaluation, options) + if doc2: + if i1.get_head() is SymbolList and i2.get_head() is SymbolList: + if len(i1.elements) != len(i2.elements): + evaluation.message("TextSimilarity", "idxfmt") + return + if any( + not all(isinstance(i, Integer) for i in li.elements) + for li in (i1, i2) + ): + evaluation.message("TextSimilarity", "idxfmt") + return + indices1 = [i.value for i in i1.elements] + indices2 = [i.value for i in i2.elements] + multiple = True + elif isinstance(i1, Integer) and isinstance(i2, Integer): + indices1 = [i1.value] + indices2 = [i2.value] + multiple = False + else: + evaluation.message("TextSimilarity", "idxfmt") + return + + for index1, index2 in zip(indices1, indices2): + for i, pos, doc in zip((index1, index2), (1, 2), (doc1, doc2)): + if i < 1 or i > len(doc): + evaluation.message( + "TextSimilarity", "txtidx", i, pos, len(doc) + ) + return + + result = [ + Real(doc1[j1 - 1].similarity(doc2[j2 - 1])) + for j1, j2 in zip(indices1, indices2) + ] + + if multiple: + return ListExpression(*result) + else: + return result[0] + + +class WordStem(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordStem.html + +
+    <dl>
+      <dt>'WordStem[$word$]'
+      <dd>returns a stemmed form of $word$, thereby reducing an inflected form to its root.
+
+      <dt>'WordStem[{$word1$, $word2$, ...}]'
+      <dd>returns the stemmed form of each word in the list, thereby reducing inflected forms to their roots.
+    </dl>
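'WordStem' wraps NLTK's Porter stemmer directly; the equivalent plain call is:

    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    print([stemmer.stem(w) for w in ["heroes", "roses", "knights", "queens"]])
    # ['hero', 'rose', 'knight', 'queen']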
+ + >> WordStem["towers"] + = tower + + >> WordStem[{"heroes", "roses", "knights", "queens"}] + = {hero, rose, knight, queen} + """ + + _stemmer = None + + requires = ("nltk",) + summary_text = "Retrieve the stem of a word" + + @staticmethod + def _get_porter_stemmer(): + if WordStem._stemmer is None: + WordStem._stemmer = nltk.stem.porter.PorterStemmer() + return WordStem._stemmer + + @staticmethod + def porter(w): + return WordStem._get_porter_stemmer().stem(w) + + def eval(self, word: String, evaluation: Evaluation) -> String: + "WordStem[word_String]" + stemmer = self._get_porter_stemmer() + return String(stemmer.stem(word.value)) + + def eval_list(self, words, evaluation: Evaluation) -> Optional[ListExpression]: + "WordStem[words_List]" + if all(isinstance(w, String) for w in words.elements): + stemmer = self._get_porter_stemmer() + return ListExpression( + *[String(stemmer.stem(w.value)) for w in words.elements] + ) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py new file mode 100644 index 0000000..4bb0d73 --- /dev/null +++ b/pymathics/natlang/translation.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + + +""" +Language translation + +""" + +from typing import Union + +import langid # see https://github.com/saffsd/langid.py +import pycountry +from mathics.builtin.base import Builtin +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import Symbol +from mathics.core.systemsymbols import SymbolFailed + + +class LanguageIdentify(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/LanguageIdentify.html + +
+    <dl>
+      <dt>'LanguageIdentify[$text$]'
+      <dd>returns the name of the language used in $text$.
+    </dl>
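The implementation pairs langid's classifier with pycountry's ISO-code lookup; essentially:

    import langid
    import pycountry

    code, _score = langid.classify("eins zwei drei")  # e.g. ("de", ...)
    language = pycountry.languages.get(alpha_2=code)
    print(language.name if language else "unknown")   # German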
+ + >> LanguageIdentify["eins zwei drei"] + = German + """ + + summary_text = "determine the predominant human language in a string" + + def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: + "LanguageIdentify[text_String]" + + # an alternative: https://github.com/Mimino666/langdetect + + code, _ = langid.classify(text.value) + language = pycountry.languages.get(alpha_2=code) + if language is None: + return SymbolFailed + return String(language.name) diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py new file mode 100644 index 0000000..2c3b33c --- /dev/null +++ b/pymathics/natlang/util.py @@ -0,0 +1,328 @@ +# -*- coding: utf-8 -*- + +""" +utils +""" +import re +from itertools import chain + +import nltk +from mathics.builtin.base import Builtin, MessageException +from mathics.builtin.codetables import iso639_3 +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import strip_context + +no_doc = True + + +_wordnet_pos_to_type = {} +_wordnet_type_to_pos = {} + + +def _init_nltk_maps(): + _wordnet_pos_to_type.update( + { + nltk.corpus.wordnet.VERB: "Verb", + nltk.corpus.wordnet.NOUN: "Noun", + nltk.corpus.wordnet.ADJ: "Adjective", + nltk.corpus.wordnet.ADJ_SAT: "Adjective", + nltk.corpus.wordnet.ADV: "Adverb", + } + ) + _wordnet_type_to_pos.update( + { + "Verb": [nltk.corpus.wordnet.VERB], + "Noun": [nltk.corpus.wordnet.NOUN], + "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], + "Adverb": [nltk.corpus.wordnet.ADV], + } + ) + + +def _parse_nltk_lookup_error(e): + m = re.search(r"Resource '([^']+)' not found\.", str(e)) + if m: + return m.group(1) + else: + return "unknown" + + +def merge_dictionaries(a, b): + c = a.copy() + c.update(b) + return c + + +class _WordNetBuiltin(Builtin): + requires = ("nltk",) + + options = { + "Language": '"English"', + } + + messages = { + "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", + "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', + # 'load': 'Loading `1` word data. 
Please wait.', + "wordnet": "WordNet returned the following error: ``", + } + + _wordnet_instances = {} + + def _language_name(self, evaluation: Evaluation, options: dict): + return self.get_option(options, "Language", evaluation) + + def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): + try: + wordnet_resource = nltk.data.find("corpora/wordnet2022") + _init_nltk_maps() + except LookupError: + evaluation.message(self.get_name(), "package", "wordnet2022") + return None + + try: + omw = nltk.corpus.util.LazyCorpusLoader( + "omw", + nltk.corpus.reader.CorpusReader, + r".*/wn-data-.*\.tab", + encoding="utf8", + ) + except LookupError: + evaluation.message(self.get_name(), "package", "omw") + return None + + wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) + + if language_code not in wordnet.langs(): + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None + + return wordnet + + def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: + language_code = None + if isinstance(language_name, String): + language_code = iso639_3.get(language_name.value) + if not language_code: + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None, None + + wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) + if not wordnet: + try: + wordnet = self._init_wordnet(evaluation, language_name, language_code) + except LookupError as e: + evaluation.message( + self.get_name(), "package", _parse_nltk_lookup_error(e) + ) + return None, None + + _WordNetBuiltin._wordnet_instances[language_code] = wordnet + + return wordnet, language_code + + @staticmethod + def _decode_synset(syn): + what, pos, nr = (syn.name().split(".") + ["01"])[:3] + return what.replace("_", " "), pos, nr + + @staticmethod + def _capitalize(s) -> str: + return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) + + @staticmethod + def _underscore(s) -> str: + return re.sub( + r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s + ).lower() + + @staticmethod + def _list_syn_form(syn): + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + + def containers(): + for name in syn.lemma_names(): + if name != what: + yield name + + for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): + container, _, _ = _WordNetBuiltin._decode_synset(s) + yield container + + for lemma in WordProperty._synonymous_lemmas(syn): + yield lemma.name() + + return what, _wordnet_pos_to_type[pos], containers + + @staticmethod + def syn(syn, wordnet, language_code) -> tuple: + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): + if s == syn: + return form + return what, pos, "Unknown" + + @staticmethod + def _iterate_senses(word, wordnet, language_code): + if not word: + return + + used = set() + output_word = word.replace("_", " ") + + for syn in wordnet.synsets(word, None, language_code): + if syn.lexname() in ("noun.location", "noun.person"): + continue # ignore + + what, pos, containers = _WordNetBuiltin._list_syn_form(syn) + + for container in containers(): + container = container.replace("_", " ") + if container != word: + if container not in used: + used.add(container) + yield syn, ( + output_word, + pos, + _WordNetBuiltin._capitalize(container), + ) + break + + def _senses(self, word, wordnet, language_code): + if isinstance(word, tuple): # find forms 
like ["tree", "Noun", "WoodyPlant"] + for syn, form in _WordNetBuiltin._iterate_senses( + word[0], wordnet, language_code + ): + if form == word: + return [[syn, form]] + else: # find word given as strings, e.g. "tree" + word = wordnet.morphy(word) # base form, e.g. trees -> tree + return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) + + +class _WordListBuiltin(_WordNetBuiltin): + _dictionary = {} + + def _words(self, language_name, ilk, evaluation): + wordnet, language_code = self._load_wordnet(evaluation, language_name) + + if not wordnet: + return + + key = "%s.%s" % (language_code, ilk) + words = self._dictionary.get(key) + if not words: + try: + if ilk == "All": + filtered_pos = [None] + else: + try: + filtered_pos = _wordnet_type_to_pos[ilk] + except KeyError: + evaluation.message( + self.get_name(), + "wordnet", + "type: %s is should be in %s" + % (ilk._wordnet_type_to_pos.keys()), + ) + return + + words = [] + for pos in filtered_pos: + words.extend(list(wordnet.all_lemma_names(pos, language_code))) + words.sort() + self._dictionary[key] = words + except nltk.corpus.reader.wordnet.WordNetError as err: + evaluation.message(self.get_name(), "wordnet", str(err)) + return + + return words + + +class WordProperty: + def __init__(self, syn_form, wordnet, language_code): + self.syn_form = syn_form + self.wordnet = wordnet + self.language_code = language_code + + def syn(self, syn): + return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) + + @staticmethod + def _synonymous_lemmas(syn): + first_lemma = syn.name().split(".")[0] + return (s for s in syn.lemmas() if s.name() != first_lemma) + + @staticmethod + def _antonymous_lemmas(syn): + return (s for lemma in syn.lemmas() for s in lemma.antonyms()) + + def definitions(self, syn, desc): + return syn.definition() + + def examples(self, syn, desc): + return syn.examples() + + def synonyms(self, syn, desc): + _, pos, container = desc + return [ + self.syn_form((s.name().replace("_", " "), pos, container)) + for s in WordProperty._synonymous_lemmas(syn) + ] + + def antonyms(self, syn, desc): + return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] + + def broader_terms(self, syn, desc): + return [self.syn(s) for s in syn.hypernyms()] + + def narrower_terms(self, syn, desc): + return [self.syn(s) for s in syn.hyponyms()] + + def usage_field(self, syn, desc): + return syn.usage_domains() + + def whole_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_holonyms()] + + def part_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_meronyms()] + + def material_terms(self, syn, desc): + return [self.syn(s) for s in syn.substance_meronyms()] + + def word_net_id(self, syn, desc): + return syn.offset() + + def entailed_terms(self, syn, desc): # e.g. fall to condense + return [self.syn(s) for s in syn.entailments()] + + def causes_terms(self, syn, desc): # e.g. 
ignite to burn + return [self.syn(s) for s in syn.causes()] + + def inflected_forms(self, syn, desc): + try: + word, pos, _ = desc + if pos == "Verb": + from pattern.en import lexeme + + return [w for w in reversed(lexeme(word)) if w != word] + elif pos == "Noun": + from pattern.en import pluralize + + return [pluralize(word)] + elif pos == "Adjective": + from pattern.en import comparative, superlative + + return [comparative(word), superlative(word)] + else: + return [] + except ImportError: + raise MessageException( + "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" + ) diff --git a/setup.py b/setup.py index 57d7097..ef44e46 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import sys -import platform import os import os.path as osp -from setuptools import setup, find_namespace_packages +import platform +import sys + +from setuptools import find_namespace_packages, setup # Ensure user has the correct Python version if sys.version_info < (3, 6): diff --git a/test/consistency-and-style/test_summary_text.py b/test/consistency-and-style/test_summary_text.py index dd99c95..05deaa4 100644 --- a/test/consistency-and-style/test_summary_text.py +++ b/test/consistency-and-style/test_summary_text.py @@ -5,12 +5,11 @@ import pkgutil import pytest - -from pymathics.natlang import __file__ as module_initfile_path from mathics.builtin import name_is_builtin_symbol from mathics.builtin.base import Builtin from mathics.doc.common_doc import skip_doc +from pymathics.natlang import __file__ as module_initfile_path # Get file system path name for mathics.builtin module_path = osp.dirname(module_initfile_path) From f5a4282dab89caca0ee188aaec2ca5a3d13ab488 Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 18:09:04 -0300 Subject: [PATCH 02/14] adding comments --- pymathics/natlang/linguistic_data.py | 1 + pymathics/natlang/normalization.py | 4 ++++ pymathics/natlang/textual_analysis.py | 3 ++- pymathics/natlang/translation.py | 7 +++++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 942ff48..e7b7d40 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -5,6 +5,7 @@ See :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ +# This module uses both nltk and spacy. Maybe it makes sense to split this further. # TODO: Complete me diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 17d7e31..aaadd50 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -2,6 +2,10 @@ Text normalization +See :WMA: https://reference.wolfram.com/language/guide/TextNormalization.html guide. + + +This module uses spacy as a backend. """ import itertools from itertools import islice diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 87c7f57..98657e4 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -3,9 +3,10 @@ Text analysis functions :See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html - """ +# This module uses both enchant, nltk and spacy. Maybe we want to split this further. 
+ import re from itertools import islice from typing import Optional diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 4bb0d73..96b73d8 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -4,8 +4,15 @@ """ Language translation + """ +# This is under Text Normalization in WR. But also in Natural Language Processing, +# and Linguistic Data. I put here because is the only module tuat uses langid and pycountry +# modules. +# +# TODO: WordTranslation, TextTranslation + from typing import Union import langid # see https://github.com/saffsd/langid.py From 63e72dd7e0a035b8ed89ad58406a8a263a50dc39 Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 19:18:16 -0300 Subject: [PATCH 03/14] black --- pymathics/natlang/translation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 96b73d8..1bca23e 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -7,7 +7,7 @@ """ -# This is under Text Normalization in WR. But also in Natural Language Processing, +# This is under Text Normalization in WR. But also in Natural Language Processing, # and Linguistic Data. I put here because is the only module tuat uses langid and pycountry # modules. # From 29add1a19aa5be2bc1ac7afed90a6bd596182e76 Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 20:02:21 -0300 Subject: [PATCH 04/14] fix summaries --- pymathics/natlang/linguistic_data.py | 4 ++-- pymathics/natlang/normalization.py | 6 +++--- pymathics/natlang/spacy.py | 4 +++- pymathics/natlang/textual_analysis.py | 4 ++-- pymathics/natlang/translation.py | 2 +- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index e7b7d40..e2d27ee 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -58,7 +58,7 @@ class Pluralize(Builtin): """ requires = ("pattern",) - summary_text = "Retrieve the pluralized form of a word" + summary_text = "retrieve the pluralized form of a word" def eval(self, word, evaluation): "Pluralize[word_String]" @@ -332,7 +332,7 @@ class WordList(_WordListBuiltin): = 9.3 """ - summary_text = "retrieve a list of common words" + summary_text = "list of common words" def eval(self, evaluation: Evaluation, options: dict): "WordList[OptionsPattern[]]" diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index aaadd50..aeca510 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -102,7 +102,7 @@ class TextCases(_SpacyBuiltin): """ - summary_text = "List the cases of words of a certain form in a text" + summary_text = "list the cases of words of a certain form in a text" def eval_string_form( self, text: String, form, evaluation: Evaluation, options: dict @@ -136,7 +136,7 @@ class TextPosition(_SpacyBuiltin): = {{1, 9}, {15, 20}} """ - summary_text = "List the position of words of a given form in a text" + summary_text = "list the position of words of a given form in a text" def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" @@ -215,7 +215,7 @@ class TextStructure(_SpacyBuiltin): """ _root_pos = set(i for i, names in _pos_tags.items() if names[1]) - summary_text = "Retrieve the grammatical structure of a text" + summary_text = "retrieve the grammatical 
structure of a text" def _to_constituent_string(self, node): token, children = node diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py index 851d13f..78c38b6 100644 --- a/pymathics/natlang/spacy.py +++ b/pymathics/natlang/spacy.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- -# FIXME: split this up into smaller pieces """ Spacy tools """ + +# TODO: move here low-level implementation depending on spacy + import heapq import re from typing import Optional diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 98657e4..c1f74d2 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -253,7 +253,7 @@ class WordFrequency(_SpacyBuiltin): options = _SpacyBuiltin.options options.update({"IgnoreCase": "False"}) - summary_text = "Retrieve the frequency of a word in a text" + summary_text = "retrieve the frequency of a word in a text" def eval( self, text: String, word, evaluation: Evaluation, options: dict @@ -402,7 +402,7 @@ class WordStem(Builtin): _stemmer = None requires = ("nltk",) - summary_text = "Retrieve the stem of a word" + summary_text = "retrieve the stem of a word" @staticmethod def _get_porter_stemmer(): diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 1bca23e..0da39ad 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -8,7 +8,7 @@ """ # This is under Text Normalization in WR. But also in Natural Language Processing, -# and Linguistic Data. I put here because is the only module tuat uses langid and pycountry +# and Linguistic Data. I put here because is the only module that uses langid and pycountry # modules. # # TODO: WordTranslation, TextTranslation From 2b10a3cd6ef8282d01be1da7efdbc4d86e8ac152 Mon Sep 17 00:00:00 2001 From: mmatera Date: Tue, 21 Feb 2023 23:30:19 -0300 Subject: [PATCH 05/14] rocky's comments fixed --- pymathics/natlang/linguistic_data.py | 21 +++++++++++++-------- pymathics/natlang/normalization.py | 7 ++++--- pymathics/natlang/textual_analysis.py | 20 +++++++++++--------- pymathics/natlang/translation.py | 3 ++- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index e2d27ee..73d444c 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -2,7 +2,7 @@ """ Linguistic Data -See :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. +See :WMA link:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ # This module uses both nltk and spacy. Maybe it makes sense to split this further. @@ -23,6 +23,7 @@ # from mathics.builtin.codetables import iso639_3 from mathics.builtin.numbers.randomnumbers import RandomEnv from mathics.core.atoms import String +from mathics.core.element import ElementsProperties from mathics.core.convert.expression import Expression, to_expression from mathics.core.evaluation import Evaluation from mathics.core.list import ListExpression @@ -39,13 +40,15 @@ merge_dictionaries, ) +sort_order = "Linguistic Data" + SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup") StringNotAvailable = String("NotAvailable") class Pluralize(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/Pluralize.html
@@ -68,7 +71,7 @@ def eval(self, word, evaluation): class RandomWord(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/RandomWord.html
@@ -122,7 +125,7 @@ def eval_type_n(self, type, n, evaluation: Evaluation, options: dict): class WordData(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordData.html
@@ -288,7 +291,7 @@ def eval_property_form( class WordDefinition(_WordNetBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordDefinition.html
@@ -317,7 +320,7 @@ def eval(self, word, evaluation: Evaluation, options: dict): class WordList(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordList.html
@@ -338,7 +341,9 @@ def eval(self, evaluation: Evaluation, options: dict): "WordList[OptionsPattern[]]" words = self._words(self._language_name(evaluation, options), "All", evaluation) if words is not None: - return ListExpression(*(String(word) for word in words)) + words_mathics = (String(word) for word in words) + result = ListExpression(*words_mathics, elements_properties=ElementsProperties(False, False, True)) + return result def eval_type(self, wordtype, evaluation: Evaluation, options: dict): "WordList[wordtype_String, OptionsPattern[]]" @@ -348,4 +353,4 @@ def eval_type(self, wordtype, evaluation: Evaluation, options: dict): evaluation, ) if words is not None: - return ListExpression(*(String(word) for word in words)) + return ListExpression(*(String(word) for word in words), elements_properties=ElementsProperties(False, False, True)) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index aeca510..6066cf0 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -1,6 +1,6 @@ """ -Text normalization +Text Normalization See :WMA: https://reference.wolfram.com/language/guide/TextNormalization.html guide. @@ -19,6 +19,7 @@ from pymathics.natlang.spacy import _cases, _pos_tags, _position, _SpacyBuiltin +sort_order = "Text Normalization" class DeleteStopwords(_SpacyBuiltin): """ @@ -284,7 +285,7 @@ class TextWords(_SpacyBuiltin): def eval( self, text: String, evaluation: Evaluation, options: dict ) -> Optional[ListExpression]: - "TextWords[text_String, OptionsPattern[WordCount]]" + "TextWords[text_String, OptionsPattern[]]" doc = self._nlp(text.value, evaluation, options) if doc: punctuation = spacy.parts_of_speech.PUNCT @@ -293,7 +294,7 @@ def eval( ) def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): - "TextWords[text_String, n_Integer, OptionsPattern[TextWords]]" + "TextWords[text_String, n_Integer, OptionsPattern[]]" doc = self._nlp(text.value, evaluation, options) if doc: punctuation = spacy.parts_of_speech.PUNCT diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index c1f74d2..622efe6 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Text analysis functions +Text Analysis :See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html """ @@ -28,9 +28,11 @@ from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries +sort_order = "Text Analysis" + class Containing(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/Containing.html
@@ -46,7 +48,7 @@ class Containing(Builtin): class DictionaryLookup(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/DictionaryLookup.html
@@ -111,7 +113,7 @@ def eval_language_n(self, language, word, n, evaluation): class DictionaryWordQ(_WordNetBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/DictionaryWordQ.html
@@ -144,7 +146,7 @@ def eval(self, word, evaluation: Evaluation, options: dict): class SpellingCorrectionList(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/SpellingCorrectionList.html
@@ -204,7 +206,7 @@ def eval( class WordCount(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordCount.html
@@ -228,7 +230,7 @@ def eval(self, text, evaluation: Evaluation, options: dict): class WordFrequency(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordFrequency.html
@@ -287,7 +289,7 @@ def eval( class WordSimilarity(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordSimilarity.html
@@ -381,7 +383,7 @@ def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict) class WordStem(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordStem.html
diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 0da39ad..a08fd03 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -2,7 +2,7 @@ """ -Language translation +Language Translation """ @@ -23,6 +23,7 @@ from mathics.core.symbols import Symbol from mathics.core.systemsymbols import SymbolFailed +sort_order = "Language Translation" class LanguageIdentify(Builtin): """ From 042dfe37bddc9d2b278896663a21bcbe746f948d Mon Sep 17 00:00:00 2001 From: mmatera Date: Tue, 21 Feb 2023 23:31:13 -0300 Subject: [PATCH 06/14] black --- pymathics/natlang/linguistic_data.py | 12 +++++++++--- pymathics/natlang/normalization.py | 1 + pymathics/natlang/textual_analysis.py | 2 +- pymathics/natlang/translation.py | 1 + 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 73d444c..f8210be 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -23,8 +23,8 @@ # from mathics.builtin.codetables import iso639_3 from mathics.builtin.numbers.randomnumbers import RandomEnv from mathics.core.atoms import String -from mathics.core.element import ElementsProperties from mathics.core.convert.expression import Expression, to_expression +from mathics.core.element import ElementsProperties from mathics.core.evaluation import Evaluation from mathics.core.list import ListExpression from mathics.core.symbols import Symbol, SymbolList @@ -342,7 +342,10 @@ def eval(self, evaluation: Evaluation, options: dict): words = self._words(self._language_name(evaluation, options), "All", evaluation) if words is not None: words_mathics = (String(word) for word in words) - result = ListExpression(*words_mathics, elements_properties=ElementsProperties(False, False, True)) + result = ListExpression( + *words_mathics, + elements_properties=ElementsProperties(False, False, True) + ) return result def eval_type(self, wordtype, evaluation: Evaluation, options: dict): @@ -353,4 +356,7 @@ def eval_type(self, wordtype, evaluation: Evaluation, options: dict): evaluation, ) if words is not None: - return ListExpression(*(String(word) for word in words), elements_properties=ElementsProperties(False, False, True)) + return ListExpression( + *(String(word) for word in words), + elements_properties=ElementsProperties(False, False, True) + ) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 6066cf0..d9eea94 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -21,6 +21,7 @@ sort_order = "Text Normalization" + class DeleteStopwords(_SpacyBuiltin): """ Delete :stop words:https://en.wikipedia.org/wiki/Stop_word(\ diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 622efe6..6bcd2f3 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -27,9 +27,9 @@ from pymathics.natlang.spacy import _SpacyBuiltin from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries - sort_order = "Text Analysis" + class Containing(Builtin): """ :WMA link: diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index a08fd03..f5624ef 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -25,6 +25,7 @@ sort_order = "Language Translation" + class LanguageIdentify(Builtin): """ :WMA: From 899d0bfa73e2cd81703b2b5f052f8153602a65c4 Mon Sep 17 
00:00:00 2001 From: mmatera Date: Tue, 21 Feb 2023 23:46:49 -0300 Subject: [PATCH 07/14] test for wordlist --- pymathics/natlang/linguistic_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index f8210be..b6d7fb7 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -331,6 +331,8 @@ class WordList(_WordListBuiltin):
returns a list of common words of type $type$.
+ >> Length[WordList[]] > 10000 + = True >> N[Mean[StringLength /@ WordList["Adjective"]], 2] = 9.3 """ From da506b0d51092c04cdf38abfc51cc5ef99f702e6 Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 07:33:21 -0300 Subject: [PATCH 08/14] easy fixes --- pymathics/natlang/linguistic_data.py | 8 +++++--- pymathics/natlang/normalization.py | 10 +++++----- pymathics/natlang/textual_analysis.py | 8 ++++---- pymathics/natlang/translation.py | 2 +- pymathics/natlang/util.py | 4 ++-- test/test_natlang.py | 5 +++++ 6 files changed, 22 insertions(+), 15 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index b6d7fb7..6332c1a 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -331,13 +331,15 @@ class WordList(_WordListBuiltin):
returns a list of common words of type $type$.
- >> Length[WordList[]] > 10000 - = True + Evaluate the average length over all the words in the dictionary: + >> N[Mean[StringLength /@ WordList[]], 3] + = 11.6 + Now, restricted to adjetives: >> N[Mean[StringLength /@ WordList["Adjective"]], 2] = 9.3 """ - summary_text = "list of common words" + summary_text = "list common words" def eval(self, evaluation: Evaluation, options: dict): "WordList[OptionsPattern[]]" diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index d9eea94..985809b 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -2,7 +2,7 @@ Text Normalization -See :WMA: https://reference.wolfram.com/language/guide/TextNormalization.html guide. +See :WMA link: https://reference.wolfram.com/language/guide/TextNormalization.html guide. This module uses spacy as a backend. @@ -82,7 +82,7 @@ def tokens(): class TextCases(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextCases.html
@@ -126,7 +126,7 @@ def eval_string_form_n( class TextPosition(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextPosition.html
@@ -204,7 +204,7 @@ def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict class TextStructure(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextStructure.html
@@ -266,7 +266,7 @@ def eval(self, text, evaluation: Evaluation, options: dict): class TextWords(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextWords.html
diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 6bcd2f3..dc4df24 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -43,7 +43,7 @@ class Containing(Builtin): """ - summary_text = "Specify a container for matching" + summary_text = "specify a container for matching" class DictionaryLookup(_WordListBuiltin): @@ -243,11 +243,11 @@ class WordFrequency(_SpacyBuiltin): ## Problem with import for certain characters in the text. ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; >> text = "I have a dairy cow, it's not just any cow. \ - She gives me milkshake, oh what a salty cow. She is the best\ - cow in the county."; +She gives me milkshake, oh what a salty cow. She is the best \ +cow in the county."; >> WordFrequency[text, "a" | "the"] - = 0.114286 + = 0.121212 >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True] = 0.5 diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index f5624ef..a3aecd1 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -28,7 +28,7 @@ class LanguageIdentify(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/LanguageIdentify.html
diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py index 2c3b33c..c5a223c 100644 --- a/pymathics/natlang/util.py +++ b/pymathics/natlang/util.py @@ -227,8 +227,8 @@ def _words(self, language_name, ilk, evaluation): evaluation.message( self.get_name(), "wordnet", - "type: %s is should be in %s" - % (ilk._wordnet_type_to_pos.keys()), + "type: %s should be in %s" + % (ilk, _wordnet_type_to_pos.keys()), ) return diff --git a/test/test_natlang.py b/test/test_natlang.py index a64b31e..4951f30 100644 --- a/test/test_natlang.py +++ b/test/test_natlang.py @@ -16,6 +16,11 @@ def test_natlang(): "4", "WordCount", ), + ( + 'Length[WordList[]]>10000', + "True", + "WordList", + ), ( 'TextWords["Hickory, dickory, dock! The mouse ran up the clock."]', '{"Hickory", "dickory", "dock", "The", "mouse", "ran", "up", "the", "clock"}', From ef95fa51f4d04f6df1f16cd3c715c7b5b4eee1be Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:04:16 -0300 Subject: [PATCH 09/14] complete doctests --- pymathics/natlang/normalization.py | 4 ++++ pymathics/natlang/textual_analysis.py | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 985809b..51738c5 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -279,6 +279,10 @@ class TextWords(_SpacyBuiltin): >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."] = {Hickory, dickory, dock, The, mouse, ran, up, the, clock} + + >> TextWords["Bruder Jakob, Schläfst du noch?", 2] + = {Bruder, Jakob} + """ summary_text = "list the words in a string" diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index dc4df24..b9e5d5d 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -59,8 +59,11 @@ class DictionaryLookup(_WordListBuiltin):
lookup first $n$ words that match the given $word$ or pattern.
- >> DictionaryLookup["bake" ~~ ___, 3] - = {bake, bakeapple, baked} + >> DictionaryLookup["baker" ~~ ___] + = {baker, baker's dozen, baker's eczema, baker's yeast, bakersfield, bakery} + + >> DictionaryLookup["baker" ~~ ___, 3] + = {baker, baker's dozen, baker's eczema} """ summary_text = "Lookup words matching a pattern in a dictionary" From 3ed1adac6e8bb0d953e680bacc460d4214c31691 Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:38:26 -0300 Subject: [PATCH 10/14] fix Containing. Adding test cases --- pymathics/natlang/spacy.py | 17 +++++++---------- pymathics/natlang/textual_analysis.py | 15 ++++++++++++++- test/test_natlang.py | 2 +- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py index 78c38b6..1a8e301 100644 --- a/pymathics/natlang/spacy.py +++ b/pymathics/natlang/spacy.py @@ -16,6 +16,7 @@ from mathics.core.atoms import String from mathics.core.evaluation import Evaluation from mathics.core.symbols import strip_context +from mathics.core.systemsymbols import SymbolAlternatives from spacy.tokens import Span no_doc = True @@ -58,17 +59,14 @@ def _cases(doc, form): if isinstance(form, String): generators = [_forms.get(form.value)] - elif form.get_head_name() == "System`Alternatives": + elif form.get_head() is SymbolAlternatives: if not all(isinstance(f, String) for f in form.elements): return # error generators = [_forms.get(f.value) for f in form.elements] - elif form.get_head_name() == "PyMathics`Containing": - if len(form.elements) == 2: - for t in _containing(doc, *form.elements): - yield t - return - else: - return # error + elif form.has_form("Pymathics`Containing", 2): + for t in _containing(doc, *form.elements): + yield t + return else: return # error @@ -79,7 +77,7 @@ def try_next(iterator): return None feeds = [] - for i, iterator in enumerate([iter(generator(doc)) for generator in generators]): + for i, iterator in enumerate([iter(generator(doc)) for generator in generators if generator]): t = try_next(iterator) if t: feeds.append((_position(t), i, t, iterator)) @@ -169,7 +167,6 @@ def generator(doc): # forms are everything one can use in TextCases[] or TextPosition[]. _forms = _make_forms() - def _position(t): if isinstance(t, Span): i = t.doc[t.start] diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index b9e5d5d..6da5bd2 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -40,9 +40,22 @@ class Containing(Builtin):
represents an object of the type outer containing objects\ of type inner.
+ 'Containing' can be used as the second parameter in 'TextCases' and 'TextPosition'. + + Supported $outer$ strings are in {"Word", "Sentence", "Paragraph", "Line", "URL", "EmailAddress"}. - """ + Supported $inner$ strings are in {"Person", "Company", "Quantity", "Number", "CurrencyAmount", + "Country", "City"}. + + The implementation of this symbol is based on `spacy`. + + >> TextCases["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]] + = {This is another pencil from England.} + >> TextPosition["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]] + = {{19, 54}} + """ + # This is implemented in ``pymathics.natlang.spacy._containing`` summary_text = "specify a container for matching" diff --git a/test/test_natlang.py b/test/test_natlang.py index 4951f30..e2adc86 100644 --- a/test/test_natlang.py +++ b/test/test_natlang.py @@ -17,7 +17,7 @@ def test_natlang(): "WordCount", ), ( - 'Length[WordList[]]>10000', + "Length[WordList[]]>10000", "True", "WordList", ), From c01cb22daca4cf7b32343335a19dc2b2a97d6189 Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:39:32 -0300 Subject: [PATCH 11/14] black --- pymathics/natlang/spacy.py | 5 ++++- pymathics/natlang/textual_analysis.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py index 1a8e301..b87b814 100644 --- a/pymathics/natlang/spacy.py +++ b/pymathics/natlang/spacy.py @@ -77,7 +77,9 @@ def try_next(iterator): return None feeds = [] - for i, iterator in enumerate([iter(generator(doc)) for generator in generators if generator]): + for i, iterator in enumerate( + [iter(generator(doc)) for generator in generators if generator] + ): t = try_next(iterator) if t: feeds.append((_position(t), i, t, iterator)) @@ -167,6 +169,7 @@ def generator(doc): # forms are everything one can use in TextCases[] or TextPosition[]. 
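# As an illustrative sketch (assuming some spacy model such as
# "en_core_web_md" is installed; the model name is only an example), the
# "Country"-style generators above reduce to spacy's standard entity API:
#
#     import spacy
#     nlp = spacy.load("en_core_web_md")
#     doc = nlp("This is another pencil from England.")
#     [ent.text for ent in doc.ents if ent.label_ == "GPE"]  # -> ["England"]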
_forms = _make_forms() + def _position(t): if isinstance(t, Span): i = t.doc[t.start] diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 6da5bd2..0b437ad 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -55,6 +55,7 @@ class Containing(Builtin): = {{19, 54}} """ + # This is implemented in ``pymathics.natlang.spacy._containing`` summary_text = "specify a container for matching" From 37f1b91a02c748a54858829315a10972ab6288dd Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:45:22 -0300 Subject: [PATCH 12/14] trailing typos --- pymathics/natlang/normalization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 51738c5..f4cf07d 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -47,7 +47,7 @@ class DeleteStopwords(_SpacyBuiltin): = Old Man Apulia, conduct peculiar """ - summary_text = "Remove stopwords from a text" + summary_text = "remove stopwords from a text" def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression: "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]" @@ -104,7 +104,7 @@ class TextCases(_SpacyBuiltin): """ - summary_text = "list the cases of words of a certain form in a text" + summary_text = "list cases of words of a certain form in a text" def eval_string_form( self, text: String, form, evaluation: Evaluation, options: dict @@ -138,7 +138,7 @@ class TextPosition(_SpacyBuiltin): = {{1, 9}, {15, 20}} """ - summary_text = "list the position of words of a given form in a text" + summary_text = "list the positions of words of a given form in a text" def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" From b249fe1bb4aa9ede63ad48e028ad849548bbcabb Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 09:24:58 -0300 Subject: [PATCH 13/14] split nltk from utils --- pymathics/natlang/__init__.py | 6 +- pymathics/natlang/linguistic_data.py | 118 ++++++++-- pymathics/natlang/manipulate.py | 36 +++ pymathics/natlang/nltk.py | 322 ++++++++++++++++++++++++++ pymathics/natlang/textual_analysis.py | 113 +-------- pymathics/natlang/util.py | 317 ------------------------- 6 files changed, 463 insertions(+), 449 deletions(-) create mode 100644 pymathics/natlang/manipulate.py create mode 100644 pymathics/natlang/nltk.py diff --git a/pymathics/natlang/__init__.py b/pymathics/natlang/__init__.py index c0d76ae..7b0498c 100644 --- a/pymathics/natlang/__init__.py +++ b/pymathics/natlang/__init__.py @@ -39,12 +39,14 @@ """ from pymathics.natlang.linguistic_data import ( - Pluralize, + DictionaryLookup, + DictionaryWordQ, RandomWord, WordData, WordDefinition, WordList, ) +from pymathics.natlang.manipulate import Pluralize from pymathics.natlang.normalization import ( DeleteStopwords, TextCases, @@ -55,8 +57,6 @@ ) from pymathics.natlang.textual_analysis import ( Containing, - DictionaryLookup, - DictionaryWordQ, SpellingCorrectionList, WordCount, WordFrequency, diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 6332c1a..033c179 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -5,7 +5,7 @@ See :WMA link:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ -# This module uses both nltk and spacy. 
Maybe it makes sense to split this further. +# This module uses nltk. # TODO: Complete me @@ -16,29 +16,29 @@ # PartOfSpeech — possible parts of speech for a word +import re +from itertools import islice from typing import Optional -from mathics.builtin.base import Builtin, MessageException - -# from mathics.builtin.codetables import iso639_3 +from mathics.builtin.atomic.strings import anchor_pattern, to_regex +from mathics.builtin.base import MessageException from mathics.builtin.numbers.randomnumbers import RandomEnv from mathics.core.atoms import String from mathics.core.convert.expression import Expression, to_expression from mathics.core.element import ElementsProperties from mathics.core.evaluation import Evaluation from mathics.core.list import ListExpression -from mathics.core.symbols import Symbol, SymbolList +from mathics.core.symbols import Symbol, SymbolFalse, SymbolList, SymbolTrue from mathics.core.systemsymbols import SymbolMissing, SymbolRule, SymbolStringExpression -from pattern.en import pluralize -from pymathics.natlang.textual_analysis import WordStem -from pymathics.natlang.util import ( +from pymathics.natlang.nltk import ( WordProperty, _WordListBuiltin, _wordnet_pos_to_type, _WordNetBuiltin, - merge_dictionaries, ) +from pymathics.natlang.textual_analysis import WordStem +from pymathics.natlang.util import merge_dictionaries sort_order = "Linguistic Data" @@ -46,27 +46,105 @@ StringNotAvailable = String("NotAvailable") -class Pluralize(Builtin): +class DictionaryLookup(_WordListBuiltin): """ :WMA link: - https://reference.wolfram.com/language/ref/Pluralize.html + https://reference.wolfram.com/language/ref/DictionaryLookup.html
-
'Pluralize[$word$]' -
returns the plural form of $word$. +
'DictionaryLookup[$word$]' +
looks up words that match the given $word$ or pattern.
+
+
'DictionaryLookup[$word$, $n$]' +
looks up the first $n$ words that match the given $word$ or pattern. 
- >> Pluralize["potato"] - = potatoes + >> DictionaryLookup["baker" ~~ ___] + = {baker, baker's dozen, baker's eczema, baker's yeast, bakersfield, bakery} + + >> DictionaryLookup["baker" ~~ ___, 3] + = {baker, baker's dozen, baker's eczema} """ - requires = ("pattern",) - summary_text = "retrieve the pluralized form of a word" + summary_text = "Lookup words matching a pattern in a dictionary" + + def compile(self, pattern, evaluation): + re_patt = to_regex(pattern, evaluation) + if re_patt is None: + evaluation.message( + "StringExpression", + "invld", + pattern, + Expression(SymbolStringExpression, pattern), + ) + return + re_patt = anchor_pattern(re_patt) + + return re.compile(re_patt, flags=re.IGNORECASE) + + def search(self, dictionary_words, pattern): + for dictionary_word in dictionary_words: + if pattern.match(dictionary_word): + yield dictionary_word.replace("_", " ") + + def lookup(self, language_name, word, n, evaluation): + pattern = self.compile(word, evaluation) + if pattern: + dictionary_words = self._words(language_name, "All", evaluation) + if dictionary_words is not None: + matches = self.search(dictionary_words, pattern) + if n is not None: + matches = islice(matches, 0, n) + return ListExpression(*(String(match) for match in sorted(matches))) + + def eval_english(self, word, evaluation): + "DictionaryLookup[word_]" + return self.lookup(String("English"), word, None, evaluation) - def eval(self, word, evaluation): - "Pluralize[word_String]" + def eval_language(self, language, word, evaluation): + "DictionaryLookup[{language_String, word_}]" + return self.lookup(language, word, None, evaluation) - return String(pluralize(word.value)) + def eval_english_n(self, word, n, evaluation): + "DictionaryLookup[word_, n_Integer]" + return self.lookup(String("English"), word, n.value, evaluation) + + def eval_language_n(self, language, word, n, evaluation): + "DictionaryLookup[{language_String, word_}, n_Integer]" + return self.lookup(language, word, n.value, evaluation) + + +class DictionaryWordQ(_WordNetBuiltin): + """ + :WMA link: + https://reference.wolfram.com/language/ref/DictionaryWordQ.html + +
+
'DictionaryWordQ[$word$]' +
returns 'True' if $word$ is a word usually found in dictionaries, and 'False' otherwise.
+
+
+    >> DictionaryWordQ["couch"]
+    = True
+
+    >> DictionaryWordQ["meep-meep"]
+    = False
+    """
+
+    summary_text = "Check if a word is in the dictionary"
+
+    def eval(self, word, evaluation: Evaluation, options: dict):
+        "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]"
+        if not isinstance(word, String):
+            return None  # not a String: leave the expression unevaluated
+        wordnet, language_code = self._load_wordnet(
+            evaluation, self._language_name(evaluation, options)
+        )
+        if wordnet:
+            if list(wordnet.synsets(word.value.lower(), None, language_code)):
+                return SymbolTrue
+            else:
+                return SymbolFalse


 class RandomWord(_WordListBuiltin):
diff --git a/pymathics/natlang/manipulate.py b/pymathics/natlang/manipulate.py
new file mode 100644
index 0000000..7d1f7e9
--- /dev/null
+++ b/pymathics/natlang/manipulate.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+Word manipulation
+
+This module uses pattern.en to change the form of a word.
+
+"""
+from mathics.builtin.base import Builtin
+from mathics.core.atoms import String
+from mathics.core.evaluation import Evaluation
+from pattern.en import pluralize
+
+sort_order = "Word manipulation"
+
+
+class Pluralize(Builtin):
+    """
+    :WMA link:
+    https://reference.wolfram.com/language/ref/Pluralize.html
+
+
+
'Pluralize[$word$]' +
returns the plural form of $word$. +
+ + >> Pluralize["potato"] + = potatoes + """ + + requires = ("pattern",) + summary_text = "retrieve the pluralized form of a word" + + def eval(self, word: String, evaluation: Evaluation) -> String: + "Pluralize[word_String]" + + return String(pluralize(word.value)) diff --git a/pymathics/natlang/nltk.py b/pymathics/natlang/nltk.py new file mode 100644 index 0000000..ff04fc2 --- /dev/null +++ b/pymathics/natlang/nltk.py @@ -0,0 +1,322 @@ +# -*- coding: utf-8 -*- + +""" +nltk backend +""" +import re +from itertools import chain + +import nltk +from mathics.builtin.base import Builtin, MessageException +from mathics.builtin.codetables import iso639_3 +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import strip_context + +no_doc = True + + +_wordnet_pos_to_type = {} +_wordnet_type_to_pos = {} + + +def _init_nltk_maps(): + _wordnet_pos_to_type.update( + { + nltk.corpus.wordnet.VERB: "Verb", + nltk.corpus.wordnet.NOUN: "Noun", + nltk.corpus.wordnet.ADJ: "Adjective", + nltk.corpus.wordnet.ADJ_SAT: "Adjective", + nltk.corpus.wordnet.ADV: "Adverb", + } + ) + _wordnet_type_to_pos.update( + { + "Verb": [nltk.corpus.wordnet.VERB], + "Noun": [nltk.corpus.wordnet.NOUN], + "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], + "Adverb": [nltk.corpus.wordnet.ADV], + } + ) + + +def _parse_nltk_lookup_error(e): + m = re.search(r"Resource '([^']+)' not found\.", str(e)) + if m: + return m.group(1) + else: + return "unknown" + + +class _WordNetBuiltin(Builtin): + requires = ("nltk",) + + options = { + "Language": '"English"', + } + + messages = { + "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", + "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', + # 'load': 'Loading `1` word data. 
Please wait.', + "wordnet": "WordNet returned the following error: ``", + } + + _wordnet_instances = {} + + def _language_name(self, evaluation: Evaluation, options: dict): + return self.get_option(options, "Language", evaluation) + + def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): + try: + wordnet_resource = nltk.data.find("corpora/wordnet2022") + _init_nltk_maps() + except LookupError: + evaluation.message(self.get_name(), "package", "wordnet2022") + return None + + try: + omw = nltk.corpus.util.LazyCorpusLoader( + "omw", + nltk.corpus.reader.CorpusReader, + r".*/wn-data-.*\.tab", + encoding="utf8", + ) + except LookupError: + evaluation.message(self.get_name(), "package", "omw") + return None + + wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) + + if language_code not in wordnet.langs(): + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None + + return wordnet + + def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: + language_code = None + if isinstance(language_name, String): + language_code = iso639_3.get(language_name.value) + if not language_code: + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None, None + + wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) + if not wordnet: + try: + wordnet = self._init_wordnet(evaluation, language_name, language_code) + except LookupError as e: + evaluation.message( + self.get_name(), "package", _parse_nltk_lookup_error(e) + ) + return None, None + + _WordNetBuiltin._wordnet_instances[language_code] = wordnet + + return wordnet, language_code + + @staticmethod + def _decode_synset(syn): + what, pos, nr = (syn.name().split(".") + ["01"])[:3] + return what.replace("_", " "), pos, nr + + @staticmethod + def _capitalize(s) -> str: + return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) + + @staticmethod + def _underscore(s) -> str: + return re.sub( + r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s + ).lower() + + @staticmethod + def _list_syn_form(syn): + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + + def containers(): + for name in syn.lemma_names(): + if name != what: + yield name + + for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): + container, _, _ = _WordNetBuiltin._decode_synset(s) + yield container + + for lemma in WordProperty._synonymous_lemmas(syn): + yield lemma.name() + + return what, _wordnet_pos_to_type[pos], containers + + @staticmethod + def syn(syn, wordnet, language_code) -> tuple: + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): + if s == syn: + return form + return what, pos, "Unknown" + + @staticmethod + def _iterate_senses(word, wordnet, language_code): + if not word: + return + + used = set() + output_word = word.replace("_", " ") + + for syn in wordnet.synsets(word, None, language_code): + if syn.lexname() in ("noun.location", "noun.person"): + continue # ignore + + what, pos, containers = _WordNetBuiltin._list_syn_form(syn) + + for container in containers(): + container = container.replace("_", " ") + if container != word: + if container not in used: + used.add(container) + yield syn, ( + output_word, + pos, + _WordNetBuiltin._capitalize(container), + ) + break + + def _senses(self, word, wordnet, language_code): + if isinstance(word, tuple): # find forms 
like ["tree", "Noun", "WoodyPlant"] + for syn, form in _WordNetBuiltin._iterate_senses( + word[0], wordnet, language_code + ): + if form == word: + return [[syn, form]] + else: # find word given as strings, e.g. "tree" + word = wordnet.morphy(word) # base form, e.g. trees -> tree + return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) + + +class _WordListBuiltin(_WordNetBuiltin): + _dictionary = {} + + def _words(self, language_name, ilk, evaluation): + wordnet, language_code = self._load_wordnet(evaluation, language_name) + + if not wordnet: + return + + key = "%s.%s" % (language_code, ilk) + words = self._dictionary.get(key) + if not words: + try: + if ilk == "All": + filtered_pos = [None] + else: + try: + filtered_pos = _wordnet_type_to_pos[ilk] + except KeyError: + evaluation.message( + self.get_name(), + "wordnet", + "type: %s should be in %s" + % (ilk, _wordnet_type_to_pos.keys()), + ) + return + + words = [] + for pos in filtered_pos: + words.extend(list(wordnet.all_lemma_names(pos, language_code))) + words.sort() + self._dictionary[key] = words + except nltk.corpus.reader.wordnet.WordNetError as err: + evaluation.message(self.get_name(), "wordnet", str(err)) + return + + return words + + +class WordProperty: + def __init__(self, syn_form, wordnet, language_code): + self.syn_form = syn_form + self.wordnet = wordnet + self.language_code = language_code + + def syn(self, syn): + return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) + + @staticmethod + def _synonymous_lemmas(syn): + first_lemma = syn.name().split(".")[0] + return (s for s in syn.lemmas() if s.name() != first_lemma) + + @staticmethod + def _antonymous_lemmas(syn): + return (s for lemma in syn.lemmas() for s in lemma.antonyms()) + + def definitions(self, syn, desc): + return syn.definition() + + def examples(self, syn, desc): + return syn.examples() + + def synonyms(self, syn, desc): + _, pos, container = desc + return [ + self.syn_form((s.name().replace("_", " "), pos, container)) + for s in WordProperty._synonymous_lemmas(syn) + ] + + def antonyms(self, syn, desc): + return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] + + def broader_terms(self, syn, desc): + return [self.syn(s) for s in syn.hypernyms()] + + def narrower_terms(self, syn, desc): + return [self.syn(s) for s in syn.hyponyms()] + + def usage_field(self, syn, desc): + return syn.usage_domains() + + def whole_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_holonyms()] + + def part_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_meronyms()] + + def material_terms(self, syn, desc): + return [self.syn(s) for s in syn.substance_meronyms()] + + def word_net_id(self, syn, desc): + return syn.offset() + + def entailed_terms(self, syn, desc): # e.g. fall to condense + return [self.syn(s) for s in syn.entailments()] + + def causes_terms(self, syn, desc): # e.g. 
ignite to burn + return [self.syn(s) for s in syn.causes()] + + def inflected_forms(self, syn, desc): + try: + word, pos, _ = desc + if pos == "Verb": + from pattern.en import lexeme + + return [w for w in reversed(lexeme(word)) if w != word] + elif pos == "Noun": + from pattern.en import pluralize + + return [pluralize(word)] + elif pos == "Adjective": + from pattern.en import comparative, superlative + + return [comparative(word), superlative(word)] + else: + return [] + except ImportError: + raise MessageException( + "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" + ) diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 0b437ad..de927e0 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -7,25 +7,21 @@ # This module uses both enchant, nltk and spacy. Maybe we want to split this further. -import re -from itertools import islice from typing import Optional import enchant import nltk import spacy -from mathics.builtin.atomic.strings import anchor_pattern, to_regex from mathics.builtin.base import Builtin from mathics.core.atoms import Integer, Real, String from mathics.core.evaluation import Evaluation from mathics.core.expression import Expression from mathics.core.list import ListExpression -from mathics.core.symbols import SymbolFalse, SymbolList, SymbolTrue -from mathics.core.systemsymbols import SymbolStringExpression +from mathics.core.symbols import SymbolList, SymbolTrue from mathics.eval.nevaluator import eval_N from pymathics.natlang.spacy import _SpacyBuiltin -from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries +from pymathics.natlang.util import merge_dictionaries sort_order = "Text Analysis" @@ -41,12 +37,12 @@ class Containing(Builtin): of type inner.
'Containing' can be used as the second parameter in 'TextCases' and 'TextPosition'. - + Supported $outer$ strings are in {"Word", "Sentence", "Paragraph", "Line", "URL", "EmailAddress"}. Supported $inner$ strings are in {"Person", "Company", "Quantity", "Number", "CurrencyAmount", "Country", "City"}. - + The implementation of this symbol is based on `spacy`. >> TextCases["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]] @@ -60,107 +56,6 @@ class Containing(Builtin): summary_text = "specify a container for matching" -class DictionaryLookup(_WordListBuiltin): - """ - :WMA link: - https://reference.wolfram.com/language/ref/DictionaryLookup.html - -
-
'DictionaryLookup[$word$]' -
lookup words that match the given $word$ or pattern. - -
'DictionaryLookup[$word$, $n$]' -
lookup first $n$ words that match the given $word$ or pattern. -
- - >> DictionaryLookup["baker" ~~ ___] - = {baker, baker's dozen, baker's eczema, baker's yeast, bakersfield, bakery} - - >> DictionaryLookup["baker" ~~ ___, 3] - = {baker, baker's dozen, baker's eczema} - """ - - summary_text = "Lookup words matching a pattern in a dictionary" - - def compile(self, pattern, evaluation): - re_patt = to_regex(pattern, evaluation) - if re_patt is None: - evaluation.message( - "StringExpression", - "invld", - pattern, - Expression(SymbolStringExpression, pattern), - ) - return - re_patt = anchor_pattern(re_patt) - - return re.compile(re_patt, flags=re.IGNORECASE) - - def search(self, dictionary_words, pattern): - for dictionary_word in dictionary_words: - if pattern.match(dictionary_word): - yield dictionary_word.replace("_", " ") - - def lookup(self, language_name, word, n, evaluation): - pattern = self.compile(word, evaluation) - if pattern: - dictionary_words = self._words(language_name, "All", evaluation) - if dictionary_words is not None: - matches = self.search(dictionary_words, pattern) - if n is not None: - matches = islice(matches, 0, n) - return ListExpression(*(String(match) for match in sorted(matches))) - - def eval_english(self, word, evaluation): - "DictionaryLookup[word_]" - return self.lookup(String("English"), word, None, evaluation) - - def eval_language(self, language, word, evaluation): - "DictionaryLookup[{language_String, word_}]" - return self.lookup(language, word, None, evaluation) - - def eval_english_n(self, word, n, evaluation): - "DictionaryLookup[word_, n_Integer]" - return self.lookup(String("English"), word, n.value, evaluation) - - def eval_language_n(self, language, word, n, evaluation): - "DictionaryLookup[{language_String, word_}, n_Integer]" - return self.lookup(language, word, n.value, evaluation) - - -class DictionaryWordQ(_WordNetBuiltin): - """ - :WMA link: - https://reference.wolfram.com/language/ref/DictionaryWordQ.html - -
-
'DictionaryWordQ[$word$]' -
returns True if $word$ is a word usually found in dictionaries, and False otherwise. -
- - >> DictionaryWordQ["couch"] - = True - - >> DictionaryWordQ["meep-meep"] - = False - """ - - summary_text = "Check if a word is in the dictionary" - - def eval(self, word, evaluation: Evaluation, options: dict): - "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" - if not isinstance(word, String): - return False - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if wordnet: - if list(wordnet.synsets(word.value.lower(), None, language_code)): - return SymbolTrue - else: - return SymbolFalse - - class SpellingCorrectionList(Builtin): """ :WMA link: diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py index c5a223c..4b80076 100644 --- a/pymathics/natlang/util.py +++ b/pymathics/natlang/util.py @@ -3,326 +3,9 @@ """ utils """ -import re -from itertools import chain - -import nltk -from mathics.builtin.base import Builtin, MessageException -from mathics.builtin.codetables import iso639_3 -from mathics.core.atoms import String -from mathics.core.evaluation import Evaluation -from mathics.core.symbols import strip_context - -no_doc = True - - -_wordnet_pos_to_type = {} -_wordnet_type_to_pos = {} - - -def _init_nltk_maps(): - _wordnet_pos_to_type.update( - { - nltk.corpus.wordnet.VERB: "Verb", - nltk.corpus.wordnet.NOUN: "Noun", - nltk.corpus.wordnet.ADJ: "Adjective", - nltk.corpus.wordnet.ADJ_SAT: "Adjective", - nltk.corpus.wordnet.ADV: "Adverb", - } - ) - _wordnet_type_to_pos.update( - { - "Verb": [nltk.corpus.wordnet.VERB], - "Noun": [nltk.corpus.wordnet.NOUN], - "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], - "Adverb": [nltk.corpus.wordnet.ADV], - } - ) - - -def _parse_nltk_lookup_error(e): - m = re.search(r"Resource '([^']+)' not found\.", str(e)) - if m: - return m.group(1) - else: - return "unknown" def merge_dictionaries(a, b): c = a.copy() c.update(b) return c - - -class _WordNetBuiltin(Builtin): - requires = ("nltk",) - - options = { - "Language": '"English"', - } - - messages = { - "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", - "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', - # 'load': 'Loading `1` word data. 
Please wait.', - "wordnet": "WordNet returned the following error: ``", - } - - _wordnet_instances = {} - - def _language_name(self, evaluation: Evaluation, options: dict): - return self.get_option(options, "Language", evaluation) - - def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): - try: - wordnet_resource = nltk.data.find("corpora/wordnet2022") - _init_nltk_maps() - except LookupError: - evaluation.message(self.get_name(), "package", "wordnet2022") - return None - - try: - omw = nltk.corpus.util.LazyCorpusLoader( - "omw", - nltk.corpus.reader.CorpusReader, - r".*/wn-data-.*\.tab", - encoding="utf8", - ) - except LookupError: - evaluation.message(self.get_name(), "package", "omw") - return None - - wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) - - if language_code not in wordnet.langs(): - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None - - return wordnet - - def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: - language_code = None - if isinstance(language_name, String): - language_code = iso639_3.get(language_name.value) - if not language_code: - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None, None - - wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) - if not wordnet: - try: - wordnet = self._init_wordnet(evaluation, language_name, language_code) - except LookupError as e: - evaluation.message( - self.get_name(), "package", _parse_nltk_lookup_error(e) - ) - return None, None - - _WordNetBuiltin._wordnet_instances[language_code] = wordnet - - return wordnet, language_code - - @staticmethod - def _decode_synset(syn): - what, pos, nr = (syn.name().split(".") + ["01"])[:3] - return what.replace("_", " "), pos, nr - - @staticmethod - def _capitalize(s) -> str: - return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) - - @staticmethod - def _underscore(s) -> str: - return re.sub( - r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s - ).lower() - - @staticmethod - def _list_syn_form(syn): - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - - def containers(): - for name in syn.lemma_names(): - if name != what: - yield name - - for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): - container, _, _ = _WordNetBuiltin._decode_synset(s) - yield container - - for lemma in WordProperty._synonymous_lemmas(syn): - yield lemma.name() - - return what, _wordnet_pos_to_type[pos], containers - - @staticmethod - def syn(syn, wordnet, language_code) -> tuple: - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): - if s == syn: - return form - return what, pos, "Unknown" - - @staticmethod - def _iterate_senses(word, wordnet, language_code): - if not word: - return - - used = set() - output_word = word.replace("_", " ") - - for syn in wordnet.synsets(word, None, language_code): - if syn.lexname() in ("noun.location", "noun.person"): - continue # ignore - - what, pos, containers = _WordNetBuiltin._list_syn_form(syn) - - for container in containers(): - container = container.replace("_", " ") - if container != word: - if container not in used: - used.add(container) - yield syn, ( - output_word, - pos, - _WordNetBuiltin._capitalize(container), - ) - break - - def _senses(self, word, wordnet, language_code): - if isinstance(word, tuple): # find forms 
like ["tree", "Noun", "WoodyPlant"] - for syn, form in _WordNetBuiltin._iterate_senses( - word[0], wordnet, language_code - ): - if form == word: - return [[syn, form]] - else: # find word given as strings, e.g. "tree" - word = wordnet.morphy(word) # base form, e.g. trees -> tree - return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) - - -class _WordListBuiltin(_WordNetBuiltin): - _dictionary = {} - - def _words(self, language_name, ilk, evaluation): - wordnet, language_code = self._load_wordnet(evaluation, language_name) - - if not wordnet: - return - - key = "%s.%s" % (language_code, ilk) - words = self._dictionary.get(key) - if not words: - try: - if ilk == "All": - filtered_pos = [None] - else: - try: - filtered_pos = _wordnet_type_to_pos[ilk] - except KeyError: - evaluation.message( - self.get_name(), - "wordnet", - "type: %s should be in %s" - % (ilk, _wordnet_type_to_pos.keys()), - ) - return - - words = [] - for pos in filtered_pos: - words.extend(list(wordnet.all_lemma_names(pos, language_code))) - words.sort() - self._dictionary[key] = words - except nltk.corpus.reader.wordnet.WordNetError as err: - evaluation.message(self.get_name(), "wordnet", str(err)) - return - - return words - - -class WordProperty: - def __init__(self, syn_form, wordnet, language_code): - self.syn_form = syn_form - self.wordnet = wordnet - self.language_code = language_code - - def syn(self, syn): - return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) - - @staticmethod - def _synonymous_lemmas(syn): - first_lemma = syn.name().split(".")[0] - return (s for s in syn.lemmas() if s.name() != first_lemma) - - @staticmethod - def _antonymous_lemmas(syn): - return (s for lemma in syn.lemmas() for s in lemma.antonyms()) - - def definitions(self, syn, desc): - return syn.definition() - - def examples(self, syn, desc): - return syn.examples() - - def synonyms(self, syn, desc): - _, pos, container = desc - return [ - self.syn_form((s.name().replace("_", " "), pos, container)) - for s in WordProperty._synonymous_lemmas(syn) - ] - - def antonyms(self, syn, desc): - return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] - - def broader_terms(self, syn, desc): - return [self.syn(s) for s in syn.hypernyms()] - - def narrower_terms(self, syn, desc): - return [self.syn(s) for s in syn.hyponyms()] - - def usage_field(self, syn, desc): - return syn.usage_domains() - - def whole_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_holonyms()] - - def part_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_meronyms()] - - def material_terms(self, syn, desc): - return [self.syn(s) for s in syn.substance_meronyms()] - - def word_net_id(self, syn, desc): - return syn.offset() - - def entailed_terms(self, syn, desc): # e.g. fall to condense - return [self.syn(s) for s in syn.entailments()] - - def causes_terms(self, syn, desc): # e.g. 
ignite to burn - return [self.syn(s) for s in syn.causes()] - - def inflected_forms(self, syn, desc): - try: - word, pos, _ = desc - if pos == "Verb": - from pattern.en import lexeme - - return [w for w in reversed(lexeme(word)) if w != word] - elif pos == "Noun": - from pattern.en import pluralize - - return [pluralize(word)] - elif pos == "Adjective": - from pattern.en import comparative, superlative - - return [comparative(word), superlative(word)] - else: - return [] - except ImportError: - raise MessageException( - "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" - ) From f3f8c413c983ce05a03f132fb6d703043938f818 Mon Sep 17 00:00:00 2001 From: "R. Bernstein" Date: Thu, 23 Feb 2023 17:50:45 -0500 Subject: [PATCH 14/14] Some small tweaks and conformance things (#15) --- pymathics/natlang/linguistic_data.py | 10 +++++----- pymathics/natlang/nltk.py | 1 + pymathics/natlang/normalization.py | 4 +++- pymathics/natlang/textual_analysis.py | 13 ++++++------- pymathics/natlang/util.py | 3 +++ 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 033c179..db62987 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -2,7 +2,7 @@ """ Linguistic Data -See :WMA link:https://reference.wolfram.com/language/guide/LinguisticData.html guide. +See the corresponding :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ # This module uses nltk. @@ -66,7 +66,7 @@ class DictionaryLookup(_WordListBuiltin): = {baker, baker's dozen, baker's eczema} """ - summary_text = "Lookup words matching a pattern in a dictionary" + summary_text = "lookup words matching a pattern in our word dictionary" def compile(self, pattern, evaluation): re_patt = to_regex(pattern, evaluation) @@ -131,7 +131,7 @@ class DictionaryWordQ(_WordNetBuiltin): = False """ - summary_text = "Check if a word is in the dictionary" + summary_text = "check if a word is in our word dictionary" def eval(self, word, evaluation: Evaluation, options: dict): "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" @@ -170,7 +170,7 @@ class RandomWord(_WordListBuiltin):
""" - summary_text = "generate a random word of a given kind" + summary_text = "generate a random word" def _random_words(self, type, n, evaluation: Evaluation, options: dict): words = self._words(self._language_name(evaluation, options), type, evaluation) @@ -412,7 +412,7 @@ class WordList(_WordListBuiltin): Evaluate the average length over all the words in the dictionary: >> N[Mean[StringLength /@ WordList[]], 3] = 11.6 - Now, restricted to adjetives: + Now, restricted to adjectives: >> N[Mean[StringLength /@ WordList["Adjective"]], 2] = 9.3 """ diff --git a/pymathics/natlang/nltk.py b/pymathics/natlang/nltk.py index ff04fc2..919f75c 100644 --- a/pymathics/natlang/nltk.py +++ b/pymathics/natlang/nltk.py @@ -13,6 +13,7 @@ from mathics.core.evaluation import Evaluation from mathics.core.symbols import strip_context +# Don't consider this for user documentation no_doc = True diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index f4cf07d..baf309c 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -2,7 +2,9 @@ Text Normalization -See :WMA link: https://reference.wolfram.com/language/guide/TextNormalization.html guide. +See the corresponding +:WMA: +https://reference.wolfram.com/language/guide/TextNormalization.html guide. This module uses spacy as a backend. diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index de927e0..01300d2 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -2,7 +2,8 @@ """ Text Analysis -:See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html +See the corresponding :WMA: +https://reference.wolfram.com/language/guide/TextAnalysis.html guide. """ # This module uses both enchant, nltk and spacy. Maybe we want to split this further. @@ -88,7 +89,7 @@ class SpellingCorrectionList(Builtin): _dictionaries = {} - summary_text = "Look for spelling correction candidates of a word" + summary_text = "look for spelling correction candidates of a word" def eval( self, word: String, evaluation: Evaluation, options: dict @@ -130,7 +131,7 @@ class WordCount(_SpacyBuiltin): = 4 """ - summary_text = "Count the words in a text" + summary_text = "count words in a text" def eval(self, text, evaluation: Evaluation, options: dict): "WordCount[text_String, OptionsPattern[WordCount]]" @@ -154,9 +155,7 @@ class WordFrequency(_SpacyBuiltin): ## Problem with import for certain characters in the text. ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; - >> text = "I have a dairy cow, it's not just any cow. \ -She gives me milkshake, oh what a salty cow. She is the best \ -cow in the county."; + >> text = "I have a dairy cow, it's not just any cow. She gives me milkshake, oh what a salty cow. She is the best cow in the county."; >> WordFrequency[text, "a" | "the"] = 0.121212 @@ -232,7 +231,7 @@ class WordSimilarity(_SpacyBuiltin): "idxfmt": "Indices must be integers or lists of integers of the same length.", }, ) - summary_text = "Measure the similarity of two texts" + summary_text = "measure similarity of two texts" def eval( self, text1: String, text2: String, evaluation: Evaluation, options: dict diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py index 4b80076..383c55c 100644 --- a/pymathics/natlang/util.py +++ b/pymathics/natlang/util.py @@ -4,6 +4,9 @@ utils """ +# Don't consider this for user documentation +no_doc = True + def merge_dictionaries(a, b): c = a.copy()