diff --git a/Makefile b/Makefile
index a4ac90b..6de236d 100644
--- a/Makefile
+++ b/Makefile
@@ -72,7 +72,7 @@ pytest:
doctest:
- MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline -l pymathics.natlang -c "Natural Language Processing" $o
+ MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline -l pymathics.natlang -c 'Natural Language Processing' $o
# #: Make Mathics PDF manual
@@ -89,5 +89,4 @@ ChangeLog: rmChangeLog
#: Run pytest consistency and style checks
check-consistency-and-style:
- # MATHICS_LINT=t $(PYTHON) -m pytest test/consistency-and-style
- echo "check-consistency-and-style deactivated. Activate me later. "
+ MATHICS_LINT=t $(PYTHON) -m pytest test/consistency-and-style
diff --git a/pymathics/natlang/__init__.py b/pymathics/natlang/__init__.py
index f3b883e..7b0498c 100644
--- a/pymathics/natlang/__init__.py
+++ b/pymathics/natlang/__init__.py
@@ -38,28 +38,32 @@
= Old Man Apulia, conduct peculiar
"""
-
-from pymathics.natlang.main import (
- DeleteStopwords,
+from pymathics.natlang.linguistic_data import (
DictionaryLookup,
DictionaryWordQ,
- LanguageIdentify,
- Pluralize,
RandomWord,
- SpellingCorrectionList,
+ WordData,
+ WordDefinition,
+ WordList,
+)
+from pymathics.natlang.manipulate import Pluralize
+from pymathics.natlang.normalization import (
+ DeleteStopwords,
TextCases,
TextPosition,
TextSentences,
TextStructure,
TextWords,
+)
+from pymathics.natlang.textual_analysis import (
+ Containing,
+ SpellingCorrectionList,
WordCount,
- WordData,
- WordDefinition,
WordFrequency,
- WordList,
WordSimilarity,
WordStem,
)
+from pymathics.natlang.translation import LanguageIdentify
from pymathics.natlang.version import __version__
pymathics_version_data = {
@@ -70,6 +74,7 @@
}
__all__ = [
+ "Containing",
"DeleteStopwords",
"DictionaryLookup",
"DictionaryWordQ",
diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py
new file mode 100644
index 0000000..db62987
--- /dev/null
+++ b/pymathics/natlang/linguistic_data.py
@@ -0,0 +1,444 @@
+# -*- coding: utf-8 -*-
+"""
+Linguistic Data
+
+See the corresponding <url>:WMA:
+https://reference.wolfram.com/language/guide/LinguisticData.html</url> guide.
+
+"""
+# This module uses nltk.
+
+
+# TODO: Complete me
+
+# WordFrequencyData — data on typical current and historical word frequencies
+# Synonyms — synonyms for a word
+# Antonyms — antonyms for a word
+# PartOfSpeech — possible parts of speech for a word
+
+
+import re
+from itertools import islice
+from typing import Optional
+
+from mathics.builtin.atomic.strings import anchor_pattern, to_regex
+from mathics.builtin.base import MessageException
+from mathics.builtin.numbers.randomnumbers import RandomEnv
+from mathics.core.atoms import String
+from mathics.core.convert.expression import Expression, to_expression, to_mathics_list
+from mathics.core.element import ElementsProperties
+from mathics.core.evaluation import Evaluation
+from mathics.core.list import ListExpression
+from mathics.core.symbols import Symbol, SymbolFalse, SymbolList, SymbolTrue
+from mathics.core.systemsymbols import SymbolMissing, SymbolRule, SymbolStringExpression
+
+from pymathics.natlang.nltk import (
+ WordProperty,
+ _WordListBuiltin,
+ _wordnet_pos_to_type,
+ _WordNetBuiltin,
+)
+from pymathics.natlang.textual_analysis import WordStem
+from pymathics.natlang.util import merge_dictionaries
+
+sort_order = "Linguistic Data"
+
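+# Symbol in the Pymathics`Natlang` context; WordData delegates
+# pattern-based lookups to DictionaryLookup through it (see WordData.eval).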
+SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup")
+StringNotAvailable = String("NotAvailable")
+
+
+class DictionaryLookup(_WordListBuiltin):
+ """
+ <url>:WMA link:
+ https://reference.wolfram.com/language/ref/DictionaryLookup.html</url>
+
+ <dl>
+ <dt>'DictionaryLookup[$word$]'
+ <dd>lookup words that match the given $word$ or pattern.
+
+ <dt>'DictionaryLookup[$word$, $n$]'
+ <dd>lookup first $n$ words that match the given $word$ or pattern.
+ </dl>
+
+ >> DictionaryLookup["baker" ~~ ___]
+ = {baker, baker's dozen, baker's eczema, baker's yeast, bakersfield, bakery}
+
+ >> DictionaryLookup["baker" ~~ ___, 3]
+ = {baker, baker's dozen, baker's eczema}
+ """
+
+ summary_text = "lookup words matching a pattern in our word dictionary"
+
+ def compile(self, pattern, evaluation):
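+ # Convert the Mathics string pattern to a Python regex; emit
+ # StringExpression::invld and return None if it cannot be converted.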
+ re_patt = to_regex(pattern, evaluation)
+ if re_patt is None:
+ evaluation.message(
+ "StringExpression",
+ "invld",
+ pattern,
+ Expression(SymbolStringExpression, pattern),
+ )
+ return
+ re_patt = anchor_pattern(re_patt)
+
+ return re.compile(re_patt, flags=re.IGNORECASE)
+
+ def search(self, dictionary_words, pattern):
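+ # WordNet stores multi-word lemmas with underscores; show spaces instead.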
+ for dictionary_word in dictionary_words:
+ if pattern.match(dictionary_word):
+ yield dictionary_word.replace("_", " ")
+
+ def lookup(self, language_name, word, n, evaluation):
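+ # Scan the word list for matches; when $n$ is given, keep only the
+ # first $n$ matches (taken in dictionary order) before sorting.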
+ pattern = self.compile(word, evaluation)
+ if pattern:
+ dictionary_words = self._words(language_name, "All", evaluation)
+ if dictionary_words is not None:
+ matches = self.search(dictionary_words, pattern)
+ if n is not None:
+ matches = islice(matches, 0, n)
+ return ListExpression(*(String(match) for match in sorted(matches)))
+
+ def eval_english(self, word, evaluation):
+ "DictionaryLookup[word_]"
+ return self.lookup(String("English"), word, None, evaluation)
+
+ def eval_language(self, language, word, evaluation):
+ "DictionaryLookup[{language_String, word_}]"
+ return self.lookup(language, word, None, evaluation)
+
+ def eval_english_n(self, word, n, evaluation):
+ "DictionaryLookup[word_, n_Integer]"
+ return self.lookup(String("English"), word, n.value, evaluation)
+
+ def eval_language_n(self, language, word, n, evaluation):
+ "DictionaryLookup[{language_String, word_}, n_Integer]"
+ return self.lookup(language, word, n.value, evaluation)
+
+
+class DictionaryWordQ(_WordNetBuiltin):
+ """
+ <url>:WMA link:
+ https://reference.wolfram.com/language/ref/DictionaryWordQ.html</url>
+
+ <dl>
+ <dt>'DictionaryWordQ[$word$]'
+ <dd>returns True if $word$ is a word usually found in dictionaries, and False otherwise.
+ </dl>
+
+ >> DictionaryWordQ["couch"]
+ = True
+
+ >> DictionaryWordQ["meep-meep"]
+ = False
+ """
+
+ summary_text = "check if a word is in our word dictionary"
+
+ def eval(self, word, evaluation: Evaluation, options: dict):
+ "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]"
+ if not isinstance(word, String):
+ return False
+ wordnet, language_code = self._load_wordnet(
+ evaluation, self._language_name(evaluation, options)
+ )
+ if wordnet:
+ if list(wordnet.synsets(word.value.lower(), None, language_code)):
+ return SymbolTrue
+ else:
+ return SymbolFalse
+
+
+class RandomWord(_WordListBuiltin):
+ """
+ <url>:WMA link:
+ https://reference.wolfram.com/language/ref/RandomWord.html</url>
+
+ <dl>
+ <dt>'RandomWord[]'
+ <dd>returns a random word.
+
+ <dt>'RandomWord[$type$]'
+ <dd>returns a random word of the given $type$, e.g. of type "Noun" or "Adverb".
+
+ <dt>'RandomWord[$type$, $n$]'
+ <dd>returns $n$ random words of the given $type$.
+ </dl>
+
+ >> RandomWord["Noun"]
+ = ...
+ >> RandomWord["Noun", 3]
+ = {..., ..., ...}
+
+
+ """
+
+ summary_text = "generate a random word"
+
+ def _random_words(self, type, n, evaluation: Evaluation, options: dict):
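+ # Draw n words uniformly at random from the word list for the
+ # configured language and the given word type.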
+ words = self._words(self._language_name(evaluation, options), type, evaluation)
+ if words is not None:
+ with RandomEnv(evaluation) as rand:
+ return [
+ String(words[rand.randint(0, len(words) - 1)].replace("_", " "))
+ for _ in range(n)
+ ]
+
+ def eval(self, evaluation: Evaluation, options: dict):
+ "RandomWord[OptionsPattern[RandomWord]]"
+ words = self._random_words("All", 1, evaluation, options)
+ if words:
+ return words[0]
+
+ def eval_type(self, type, evaluation: Evaluation, options: dict):
+ "RandomWord[type_String, OptionsPattern[RandomWord]]"
+ words = self._random_words(type.value, 1, evaluation, options)
+ if words:
+ return words[0]
+
+ def eval_type_n(self, type, n, evaluation: Evaluation, options: dict):
+ "RandomWord[type_String, n_Integer, OptionsPattern[RandomWord]]"
+ words = self._random_words(type.value, n.value, evaluation, options)
+ if words:
+ return ListExpression(*words)
+
+
+class WordData(_WordListBuiltin):
+ """
+
+ <url>:WMA link:
+ https://reference.wolfram.com/language/ref/WordData.html</url>
+
+ <dl>
+ <dt>'WordData[$word$]'
+ <dd>returns a list of possible senses of a word.
+
+ <dt>'WordData[$word$, $property$]'
+ <dd>returns detailed information about a word regarding $property$, e.g. "Definitions" or "Examples".
+ </dl>
+
+ The following are valid properties:
+ <ul>
+ <li>Definitions, Examples
+ <li>InflectedForms
+ <li>Synonyms, Antonyms
+ <li>BroaderTerms, NarrowerTerms
+ <li>WholeTerms, PartTerms, MaterialTerms
+ <li>EntailedTerms, CausesTerms
+ <li>UsageField
+ <li>WordNetID
+ <li>Lookup
+ </ul>
+
+ >> WordData["riverside", "Definitions"]
+ = {{riverside, Noun, Bank} -> the bank of a river}
+
+ >> WordData[{"fish", "Verb", "Angle"}, "Examples"]
+ = {{fish, Verb, Angle} -> {fish for compliments}}
+ """
+
+ messages = merge_dictionaries(
+ _WordNetBuiltin.messages,
+ {
+ "notprop": "WordData[] does not recognize `1` as a valid property.",
+ },
+ )
+ summary_text = "retrieve an association with properties of a word"
+
+ def _parse_word(self, word):
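+ # Accept either a plain word ("tree") or a {word, type, sense}
+ # triple such as {"fish", "Verb", "Angle"}.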
+ if isinstance(word, String):
+ return word.value.lower()
+ elif word.get_head_name() == "System`List":
+ if len(word.elements) == 3 and all(
+ isinstance(s, String) for s in word.elements
+ ):
+ return tuple(s.value for s in word.elements)
+
+ def _standard_property(
+ self, py_word, py_form, py_property, wordnet, language_code, evaluation
+ ):
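+ # Resolve py_property to a WordProperty getter and render the result
+ # in the requested form: "List", "Rules", or "ShortRules".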
+ senses = self._senses(py_word, wordnet, language_code)
+ if not senses:
+ return Expression(SymbolMissing, StringNotAvailable)
+ elif py_form == "List":
+ word_property = WordProperty(self._short_syn_form, wordnet, language_code)
+ property_getter = getattr(
+ word_property, "%s" % self._underscore(py_property), None
+ )
+ if property_getter:
+ return to_mathics_list(
+ *[property_getter(syn, desc) for syn, desc in senses]
+ )
+ elif py_form in ("Rules", "ShortRules"):
+ syn_form = (lambda s: s) if py_form == "Rules" else (lambda s: s[0])
+ word_property = WordProperty(syn_form, wordnet, language_code)
+ property_getter = getattr(
+ word_property, self._underscore(py_property), None
+ )
+ if property_getter:
+ list_expr_elements = [
+ to_expression(SymbolRule, desc, property_getter(syn, desc))
+ for syn, desc in senses
+ ]
+ return ListExpression(*list_expr_elements)
+ evaluation.message(self.get_name(), "notprop", py_property)
+
+ def _parts_of_speech(self, py_word, wordnet, language_code):
+ parts = set(
+ syn.pos() for syn, _ in self._senses(py_word, wordnet, language_code)
+ )
+ if not parts:
+ return Expression(SymbolMissing, StringNotAvailable)
+ else:
+ return ListExpression(
+ *[String(s) for s in sorted([_wordnet_pos_to_type[p] for p in parts])]
+ )
+
+ def _property(
+ self, word, py_property, py_form, evaluation: Evaluation, options: dict
+ ):
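+ # "PorterStem" is computed without WordNet; every other property
+ # requires the WordNet corpus to be loaded.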
+ if py_property == "PorterStem":
+ if isinstance(word, String):
+ return String(WordStem.porter(word.value))
+ else:
+ return
+
+ wordnet, language_code = self._load_wordnet(
+ evaluation, self._language_name(evaluation, options)
+ )
+ if not wordnet:
+ return
+
+ py_word = self._parse_word(word)
+ if not py_word:
+ return
+
+ if py_property == "PartsOfSpeech":
+ return self._parts_of_speech(py_word, wordnet, language_code)
+
+ try:
+ return self._standard_property(
+ py_word, py_form, py_property, wordnet, language_code, evaluation
+ )
+ except MessageException as e:
+ e.message(evaluation)
+
+ def eval(self, word, evaluation: Evaluation, options: dict) -> Optional[Expression]:
+ "WordData[word_, OptionsPattern[WordData]]"
+ if word.get_head() is SymbolStringExpression:
+ return Expression(SymbolDictionaryLookup, word)
+ elif isinstance(word, String) or word.get_head() is SymbolList:
+ pass
+ else:
+ return
+
+ wordnet, language_code = self._load_wordnet(
+ evaluation, self._language_name(evaluation, options)
+ )
+ if not wordnet:
+ return
+
+ py_word = self._parse_word(word)
+ if not py_word:
+ return
+
+ senses = self._senses(py_word, wordnet, language_code)
+ if senses is not None:
+ # Each sense is reported as a {word, type, sense} triple of strings.
+ return ListExpression(
+ *[ListExpression(*(String(s) for s in desc)) for syn, desc in senses]
+ )
+
+ def eval_property(self, word, property, evaluation: Evaluation, options: dict):
+ "WordData[word_, property_String, OptionsPattern[WordData]]"
+ if word.get_head() is SymbolStringExpression:
+ if property.get_string_value() == "Lookup":
+ return Expression(SymbolDictionaryLookup, word)
+ elif isinstance(word, String) or word.get_head() is SymbolList:
+ return self._property(
+ word, property.get_string_value(), "ShortRules", evaluation, options
+ )
+
+ def eval_property_form(
+ self, word, property, form, evaluation: Evaluation, options: dict
+ ):
+ "WordData[word_, property_String, form_String, OptionsPattern[WordData]]"
+ if isinstance(word, String) or word.get_head() is SymbolList:
+ return self._property(
+ word,
+ property.value,
+ form.value,
+ evaluation,
+ options,
+ )
+
+
+class WordDefinition(_WordNetBuiltin):
+ """
+ <url>:WMA link:
+ https://reference.wolfram.com/language/ref/WordDefinition.html</url>
+
+ <dl>
+ <dt>'WordDefinition[$word$]'
+ <dd>returns a definition of $word$, or Missing["NotAvailable"] if $word$ is not known.
+ </dl>
+
+ >> WordDefinition["gram"]
+ = {a metric unit of weight equal to one thousandth of a kilogram}
+ """
+
+ summary_text = "retrieve the definition of a word"
+
+ def eval(self, word, evaluation: Evaluation, options: dict):
+ "WordDefinition[word_String, OptionsPattern[WordDefinition]]"
+ wordnet, language_code = self._load_wordnet(
+ evaluation, self._language_name(evaluation, options)
+ )
+ if wordnet:
+ senses = self._senses(word.value.lower(), wordnet, language_code)
+ if senses:
+ return ListExpression(*[String(syn.definition()) for syn, _ in senses])
+ else:
+ return Expression(SymbolMissing, StringNotAvailable)
+
+
+class WordList(_WordListBuiltin):
+ """
+ <url>:WMA link:
+ https://reference.wolfram.com/language/ref/WordList.html</url>
+
+ <dl>
+ <dt>'WordList[]'
+ <dd>returns a list of common words.
+
+ <dt>'WordList[$type$]'
+ <dd>returns a list of common words of type $type$.
+ </dl>
+
+
+ Evaluate the average length over all the words in the dictionary:
+ >> N[Mean[StringLength /@ WordList[]], 3]
+ = 11.6
+ Now, restricted to adjectives:
+ >> N[Mean[StringLength /@ WordList["Adjective"]], 2]
+ = 9.3
+ """
+
+ summary_text = "list common words"
+
+ def eval(self, evaluation: Evaluation, options: dict):
+ "WordList[OptionsPattern[]]"
+ words = self._words(self._language_name(evaluation, options), "All", evaluation)
+ if words is not None:
+ words_mathics = (String(word) for word in words)
+ result = ListExpression(
+ *words_mathics,
+ elements_properties=ElementsProperties(False, False, True)
+ )
+ return result
+
+ def eval_type(self, wordtype, evaluation: Evaluation, options: dict):
+ "WordList[wordtype_String, OptionsPattern[]]"
+ words = self._words(
+ self._language_name(evaluation, options),
+ wordtype.value,
+ evaluation,
+ )
+ if words is not None:
+ return ListExpression(
+ *(String(word) for word in words),
+ elements_properties=ElementsProperties(False, False, True)
+ )
diff --git a/pymathics/natlang/main.py b/pymathics/natlang/main.py
deleted file mode 100644
index 4d6dbe6..0000000
--- a/pymathics/natlang/main.py
+++ /dev/null
@@ -1,1506 +0,0 @@
-# -*- coding: utf-8 -*-
-# FIXME: split this up into smaller pieces
-
-"""
-Natural Language Functions
-
-"""
-
-import heapq
-import itertools
-import math
-
-# import os
-import re
-from itertools import chain
-from typing import Optional, Union
-
-import enchant
-import langid # see https://github.com/saffsd/langid.py
-import pycountry
-import spacy
-from mathics.builtin.atomic.strings import anchor_pattern, to_regex
-from mathics.builtin.base import Builtin, MessageException
-from mathics.builtin.codetables import iso639_3
-from mathics.builtin.numbers.randomnumbers import RandomEnv
-from mathics.core.atoms import Integer, Real, String
-from mathics.core.convert.expression import (
- ListExpression,
- to_expression,
- to_mathics_list,
-)
-from mathics.core.evaluation import Evaluation
-from mathics.core.expression import Expression
-from mathics.core.symbols import (
- Symbol,
- SymbolFalse,
- SymbolList,
- SymbolTrue,
- strip_context,
-)
-from mathics.core.systemsymbols import (
- SymbolFailed,
- SymbolMissing,
- SymbolRule,
- SymbolStringExpression,
-)
-from mathics.eval.nevaluator import eval_N
-from pattern.en import pluralize
-
-SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup")
-
-StringNotAvailable = String("NotAvailable")
-
-
-def _parse_nltk_lookup_error(e):
- m = re.search(r"Resource '([^']+)' not found\.", str(e))
- if m:
- return m.group(1)
- else:
- return "unknown"
-
-
-def _make_forms():
- forms = {
- "Word": lambda doc: (token for token in doc),
- "Sentence": lambda doc: (sent for sent in doc.sents),
- "Paragraph": lambda doc: _fragments(doc, re.compile(r"^[\n][\n]+$")),
- "Line": lambda doc: _fragments(doc, re.compile(r"^[\n]$")),
- "URL": lambda doc: (token for token in doc if token.orth_.like_url()),
- "EmailAddress": lambda doc: (
- token for token in doc if token.orth_.like_email()
- ),
- }
-
- def filter_named_entity(label):
- def generator(doc):
- for ent in doc.ents:
- if ent.label == label:
- yield ent
-
- return generator
-
- def filter_pos(pos):
- def generator(doc):
- for token in doc:
- if token.pos == pos:
- yield token
-
- return generator
-
- for name, symbol in _symbols.items():
- forms[name] = filter_named_entity(symbol)
-
- for tag, names in _pos_tags.items():
- name, phrase_name = names
- forms[name] = filter_pos(tag)
-
- return forms
-
-
-# The following two may only be accessed after _WordNetBuiltin._load_wordnet
-# has been called.
-
-_wordnet_pos_to_type = {}
-_wordnet_type_to_pos = {}
-
-import nltk
-
-
-def _init_nltk_maps():
- _wordnet_pos_to_type.update(
- {
- nltk.corpus.wordnet.VERB: "Verb",
- nltk.corpus.wordnet.NOUN: "Noun",
- nltk.corpus.wordnet.ADJ: "Adjective",
- nltk.corpus.wordnet.ADJ_SAT: "Adjective",
- nltk.corpus.wordnet.ADV: "Adverb",
- }
- )
- _wordnet_type_to_pos.update(
- {
- "Verb": [nltk.corpus.wordnet.VERB],
- "Noun": [nltk.corpus.wordnet.NOUN],
- "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT],
- "Adverb": [nltk.corpus.wordnet.ADV],
- }
- )
-
-
-from spacy.tokens import Span
-
-# Part of speech tags and their public interface names in Mathics
-# see http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
-_pos_tags = {
- spacy.parts_of_speech.ADJ: ("Adjective", ""),
- spacy.parts_of_speech.ADP: ("Preposition", "Prepositional Phrase"),
- spacy.parts_of_speech.ADV: ("Adverb", ""),
- spacy.parts_of_speech.CONJ: ("Conjunct", ""),
- spacy.parts_of_speech.DET: ("Determiner", ""),
- spacy.parts_of_speech.INTJ: ("Interjection", ""),
- spacy.parts_of_speech.NOUN: ("Noun", "Noun Phrase"),
- spacy.parts_of_speech.NUM: ("Number", ""),
- spacy.parts_of_speech.PART: ("Particle", ""),
- spacy.parts_of_speech.PRON: ("Pronoun", ""),
- spacy.parts_of_speech.PROPN: ("Proposition", ""),
- spacy.parts_of_speech.PUNCT: ("Punctuation", ""),
- spacy.parts_of_speech.SCONJ: ("Sconj", ""),
- spacy.parts_of_speech.SYM: ("Symbol", ""),
- spacy.parts_of_speech.VERB: ("Verb", "Verb Phrase"),
- spacy.parts_of_speech.X: ("X", ""),
- spacy.parts_of_speech.EOL: ("EOL", ""),
- spacy.parts_of_speech.SPACE: ("Space", ""),
-}
-
-# Mathics3 named entity names and their corresponding constants in spacy.
-_symbols = {
- "Person": spacy.symbols.PERSON,
- "Company": spacy.symbols.ORG,
- "Quantity": spacy.symbols.QUANTITY,
- "Number": spacy.symbols.CARDINAL,
- "CurrencyAmount": spacy.symbols.MONEY,
- "Country": spacy.symbols.GPE, # also includes cities and states
- "City": spacy.symbols.GPE, # also includes countries and states
-}
-
-# forms are everything one can use in TextCases[] or TextPosition[].
-_forms = _make_forms()
-
-
-def _merge_dictionaries(a, b):
- c = a.copy()
- c.update(b)
- return c
-
-
-def _position(t):
- if isinstance(t, Span):
- i = t.doc[t.start]
- r = t.doc[t.end - 1]
- return 1 + i.idx, r.idx + len(r.text)
- else:
- return 1 + t.idx, t.idx + len(t.text)
-
-
-def _fragments(doc, sep):
- start = 0
- for i, token in enumerate(doc):
- if sep.match(token.text):
- yield Span(doc, start, i)
- start = i + 1
- end = len(doc)
- if start < end:
- yield Span(doc, start, end)
-
-
-class _SpacyBuiltin(Builtin):
- requires = ("spacy",)
-
- options = {
- "Language": '"English"',
- }
-
- messages = {
- "runtime": "Spacy gave the following error: ``",
- "lang": 'Language "`1`" is currently not supported with `2`[].',
- }
-
- _language_codes = {
- "English": "en",
- "German": "de",
- }
-
- _spacy_instances = {}
-
- def _load_spacy(self, evaluation: Evaluation, options: dict):
- language_code = None
- language_name = self.get_option(options, "Language", evaluation)
- if language_name is None:
- language_name = String("Undefined")
- if isinstance(language_name, String):
- language_code = _SpacyBuiltin._language_codes.get(language_name.value)
- if not language_code:
- evaluation.message(
- self.get_name(), "lang", language_name, strip_context(self.get_name())
- )
- return None
-
- instance = _SpacyBuiltin._spacy_instances.get(language_code)
- if instance:
- return instance
-
- try:
- instance = spacy.load(f"{language_code}_core_web_md")
-
- # "via" parameter no longer exists. This was used in MATHICS3_SPACY_DATA
- # if "MATHICS3_SPACY_DATA" in os.environ:
- # instance = spacy.load(
- # language_code, via=os.environ["MATHICS3_SPACY_DATA"]
- # )
- # else:
- # instance = spacy.load(f"{language_code}_core_web_md")
-
- _SpacyBuiltin._spacy_instances[language_code] = instance
- return instance
- except RuntimeError as e:
- evaluation.message(self.get_name(), "runtime", str(e))
- return None
-
- def _nlp(self, text, evaluation, options) -> Optional[spacy.tokens.doc.Doc]:
- nlp = self._load_spacy(evaluation, options)
- if not nlp:
- return None
- return nlp(text)
-
- def _is_stop_lambda(self, evaluation: Evaluation, options: dict):
- nlp = self._load_spacy(evaluation, options)
- if not nlp:
- return None
-
- vocab = nlp.vocab
-
- def is_stop(word):
- return vocab[word].is_stop
-
- return is_stop
-
-
-class WordCount(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'WordCount[$string$]'
- <dd>returns the number of words in $string$.
- </dl>
-
- >> WordCount["A long time ago"]
- = 4
- """
-
- def eval(self, text, evaluation: Evaluation, options: dict):
- "WordCount[text_String, OptionsPattern[WordCount]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- punctuation = spacy.parts_of_speech.PUNCT
- return Integer(sum(1 for word in doc if word.pos != punctuation))
-
-
-class TextWords(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'TextWords[$string$]'
- <dd>returns the words in $string$.
-
- <dt>'TextWords[$string$, $n$]'
- <dd>returns the first $n$ words in $string$.
- </dl>
-
- >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."]
- = {Hickory, dickory, dock, The, mouse, ran, up, the, clock}
- """
-
- def eval(
- self, text: String, evaluation: Evaluation, options: dict
- ) -> Optional[ListExpression]:
- "TextWords[text_String, OptionsPattern[WordCount]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- punctuation = spacy.parts_of_speech.PUNCT
- return ListExpression(
- *[String(word.text) for word in doc if word.pos != punctuation],
- )
-
- def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict):
- "TextWords[text_String, n_Integer, OptionsPattern[TextWords]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- punctuation = spacy.parts_of_speech.PUNCT
- return ListExpression(
- *itertools.islice(
- (String(word.text) for word in doc if word.pos != punctuation),
- n.value,
- ),
- )
-
-
-class TextSentences(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'TextSentences[$string$]'
- <dd>returns the sentences in $string$.
-
- <dt>'TextSentences[$string$, $n$]'
- <dd>returns the first $n$ sentences in $string$.
- </dl>
-
- >> TextSentences["Night and day. Day and night."]
- = {Night and day., Day and night.}
-
- >> TextSentences["Night and day. Day and night.", 1]
- = {Night and day.}
-
- >> TextSentences["Mr. Jones met Mrs. Jones."]
- = {Mr. Jones met Mrs. Jones.}
- """
-
- def eval(self, text: String, evaluation: Evaluation, options: dict):
- "TextSentences[text_String, OptionsPattern[TextSentences]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- return ListExpression(*[String(sent.text) for sent in doc.sents])
-
- def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict):
- "TextSentences[text_String, n_Integer, OptionsPattern[TextSentences]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- return ListExpression(
- *itertools.islice((String(sent.text) for sent in doc.sents), n.value),
- )
-
-
-class DeleteStopwords(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'DeleteStopwords[$list$]'
- <dd>returns the words in $list$ without stopwords.
-
- <dt>'DeleteStopwords[$string$]'
- <dd>returns $string$ without stopwords.
- </dl>
-
- ## This has changed since old versions of natlang, and I am
- ## not sure the old behavior was correct.
- ## >> DeleteStopwords[{"Somewhere", "over", "the", "rainbow"}]
- ## = {rainbow}
-
- >> DeleteStopwords["There was an Old Man of Apulia, whose conduct was very peculiar"]
- = Old Man Apulia, conduct peculiar
- """
-
- def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression:
- "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]"
- is_stop = self._is_stop_lambda(evaluation, options)
-
- def filter_words(words):
- for w in words:
- s = w.get_string_value()
- if s is not None:
- yield String(s)
- elif is_stop is not None and is_stop(s) is not None:
- yield String(s)
-
- return ListExpression(*list(filter_words(li.elements)))
-
- def eval_string(self, s: String, evaluation: Evaluation, options: dict):
- "DeleteStopwords[s_String, OptionsPattern[DeleteStopwords]]"
- doc = self._nlp(s.value, evaluation, options)
- if doc:
- is_stop = self._is_stop_lambda(evaluation, options)
- if is_stop:
-
- def tokens():
- for token in doc:
- if not is_stop(token.text):
- yield token.text_with_ws
- else:
- yield token.whitespace_.strip()
-
- return String("".join(tokens()))
-
-
-class WordFrequency(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'WordFrequency[$text$, $word$]'
- <dd>returns the relative frequency of $word$ in $text$.
- </dl>
-
- $word$ may also specify multiple words using $a$ | $b$ | ...
-
- ## Problem with import for certain characters in the text.
- ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"];
- >> text = "I have a dairy cow, it's not just any cow. \
- She gives me milkshake, oh what a salty cow. She is the best\
- cow in the county.";
-
- >> WordFrequency[text, "a" | "the"]
- = 0.114286
-
- >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True]
- = 0.5
- """
-
- options = _SpacyBuiltin.options
- options.update({"IgnoreCase": "False"})
-
- def eval(
- self, text: String, word, evaluation: Evaluation, options: dict
- ) -> Optional[Expression]:
- "WordFrequency[text_String, word_, OptionsPattern[WordFrequency]]"
- doc = self._nlp(text.value, evaluation, options)
- if not doc:
- return
- if isinstance(word, String):
- words = set([word.value])
- elif word.get_head_name() == "System`Alternatives":
- if not all(isinstance(a, String) for a in word.elements):
- return # error
- words = set(a.value for a in word.elements)
- else:
- return # error
-
- ignore_case = self.get_option(options, "IgnoreCase", evaluation) is SymbolTrue
- if ignore_case:
- words = [w.lower() for w in words]
- n = 0
- for token in doc:
- token_text = token.text
- if ignore_case:
- token_text = token_text.lower()
- if token_text in words:
- n += 1
- return eval_N(Integer(n) / Integer(len(doc)), evaluation)
-
-
-class Containing(Builtin):
- pass
-
-
-def _cases(doc, form):
- if isinstance(form, String):
- generators = [_forms.get(form.value)]
- elif form.get_head_name() == "System`Alternatives":
- if not all(isinstance(f, String) for f in form.elements):
- return # error
- generators = [_forms.get(f.value) for f in form.elements]
- elif form.get_head_name() == "PyMathics`Containing":
- if len(form.elements) == 2:
- for t in _containing(doc, *form.elements):
- yield t
- return
- else:
- return # error
- else:
- return # error
-
- def try_next(iterator):
- try:
- return next(iterator)
- except StopIteration:
- return None
-
- feeds = []
- for i, iterator in enumerate([iter(generator(doc)) for generator in generators]):
- t = try_next(iterator)
- if t:
- feeds.append((_position(t), i, t, iterator))
- heapq.heapify(feeds)
- while feeds:
- pos, i, token, iterator = heapq.heappop(feeds)
- yield token
- t = try_next(iterator)
- if t:
- heapq.heappush(feeds, (_position(t), i, t, iterator))
-
-
-def _containing(doc, outer, inner):
- if not isinstance(outer, String):
- return # error
- outer_generator = _forms.get(outer.value)
- inner_iter = _cases(doc, inner)
- inner_start = None
- produce_t = False
- try:
- for t in outer_generator(doc):
- start, end = _position(t)
- if inner_start is not None and inner_start < end:
- produce_t = True
- if produce_t:
- yield t
- produce_t = False
- while True:
- inner_start, inner_end = _position(next(inner_iter))
- if inner_end > start:
- break
- if inner_start < end:
- produce_t = True
- except StopIteration:
- pass
-
-
-class TextCases(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'TextCases[$text$, $form$]'
- <dd>returns all elements of type $form$ in $text$ in order of their appearance.
- </dl>
-
- >> TextCases["I was in London last year.", "Pronoun"]
- = {I}
-
- >> TextCases["I was in London last year.", "City"]
- = {London}
-
- ## >> TextCases[Import["ExampleData/EinsteinSzilLetter.txt"], "Person", 3][[2;;3]]
- ## = {L. Szilard, Joliot}
-
- >> TextCases["Anne, Peter and Mr Johnes say hello.", "Person", 3][[2;;3]]
- = {Peter, Johnes}
-
- """
-
- def eval_string_form(
- self, text: String, form, evaluation: Evaluation, options: dict
- ):
- "TextCases[text_String, form_, OptionsPattern[TextCases]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- return to_mathics_list(*[t.text for t in _cases(doc, form)])
-
- def eval_string_form_n(
- self, text: String, form, n: Integer, evaluation: Evaluation, options: dict
- ):
- "TextCases[text_String, form_, n_Integer, OptionsPattern[TextCases]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- return to_mathics_list(
- *itertools.islice((t.text for t in _cases(doc, form)), n.value)
- )
-
-
-class TextPosition(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'TextPosition[$text$, $form$]'
- <dd>returns the positions of elements of type $form$ in $text$ in order of their appearance.
- </dl>
-
- >> TextPosition["Liverpool and London are two English cities.", "City"]
- = {{1, 9}, {15, 20}}
- """
-
- def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict):
- "TextPosition[text_String, form_, OptionsPattern[TextPosition]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- return to_mathics_list(*[_position(t) for t in _cases(doc, form)])
-
- def eval_text_form_n(
- self, text: String, form, n: Integer, evaluation: Evaluation, options: dict
- ):
- "TextPosition[text_String, form_, n_Integer, OptionsPattern[TextPosition]]"
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- return to_mathics_list(
- *itertools.islice((_position(t) for t in _cases(doc, form)), n.value)
- )
-
-
-class TextStructure(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'TextStructure[$text$, $form$]'
- <dd>returns the grammatical structure of $text$ as $form$.
- </dl>
-
- >> TextStructure["The cat sat on the mat.", "ConstituentString"]
- = {(Sentence, ((Verb Phrase, (Noun Phrase, (Determiner, The), (Noun, cat)), (Verb, sat), (Prepositional Phrase, (Preposition, on), (Noun Phrase, (Determiner, the), (Noun, mat))), (Punctuation, .))))}
- """
-
- _root_pos = set(i for i, names in _pos_tags.items() if names[1])
-
- def _to_constituent_string(self, node):
- token, children = node
- name, phrase_name = _pos_tags.get(token.pos, ("Unknown", "Unknown Phrase"))
- if not children:
- return "(%s, %s)" % (name, token.text)
- else:
- sub = ", ".join(
- self._to_constituent_string(next_node) for next_node in children
- )
- return "(%s, %s)" % (phrase_name, sub)
-
- def _to_tree(self, tokens, path=[]):
- roots = []
- i = 0
- while i < len(tokens):
- token = tokens[i]
-
- if token in path:
- roots.append((token, None))
- i += 1
- else:
- root = token
- while root.head != root and root.head not in path:
- root = root.head
-
- sub = list(root.subtree)
-
- if root.pos not in self._root_pos:
- roots.extend(self._to_tree(sub, path + [root]))
- else:
- roots.append((root, self._to_tree(sub, path + [root])))
-
- i += len(sub)
-
- return roots
-
- def eval(self, text, evaluation: Evaluation, options: dict):
- 'TextStructure[text_String, "ConstituentString", OptionsPattern[TextStructure]]'
- doc = self._nlp(text.value, evaluation, options)
- if doc:
- tree = self._to_tree(list(doc))
- sents = ["(Sentence, (%s))" % self._to_constituent_string(x) for x in tree]
- return to_mathics_list(*sents, elements_conversion_fn=String)
-
-
-class WordSimilarity(_SpacyBuiltin):
- """
-
- <dl>
- <dt>'WordSimilarity[$text1$, $text2$]'
- <dd>returns a real-valued measure of semantic similarity of two texts or words.
-
- <dt>'WordSimilarity[{$text1$, $i1$}, {$text2$, $j1$}]'
- <dd>returns a measure of similarity of two words within two texts.
-
- <dt>'WordSimilarity[{$text1$, {$i1$, $i2$, ...}}, {$text2$, {$j1$, $j2$, ...}}]'
- <dd>returns a measure of similarity of multiple words within two texts.
- </dl>
-
- >> NumberForm[WordSimilarity["car", "train"], 3]
- = 0.439
-
- >> NumberForm[WordSimilarity["car", "hedgehog"], 3]
- = 0.195
-
- >> NumberForm[WordSimilarity[{"An ocean full of water.", {2, 2}}, { "A desert full of sand.", {2, 5}}], 3]
- = {0.505, 0.481}
- """
-
- messages = _merge_dictionaries(
- _SpacyBuiltin.messages,
- {
- "txtidx": "Index `1` in position `2` must be between 1 and `3`.",
- "idxfmt": "Indices must be integers or lists of integers of the same length.",
- },
- )
-
- def eval(
- self, text1: String, text2: String, evaluation: Evaluation, options: dict
- ) -> Optional[Real]:
- "WordSimilarity[text1_String, text2_String, OptionsPattern[WordSimilarity]]"
- doc1 = self._nlp(text1.value, evaluation, options)
- if doc1:
- doc2 = self._nlp(text2.value, evaluation, options)
- if doc2:
- return Real(doc1.similarity(doc2))
-
- def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict):
- "WordSimilarity[{text1_String, i1_}, {text2_String, i2_}, OptionsPattern[WordSimilarity]]"
- doc1 = self._nlp(text1.value, evaluation, options)
- if doc1:
- if text2.value == text1.value:
- doc2 = doc1
- else:
- doc2 = self._nlp(text2.value, evaluation, options)
- if doc2:
- if i1.get_head() is SymbolList and i2.get_head() is SymbolList:
- if len(i1.elements) != len(i2.elements):
- evaluation.message("TextSimilarity", "idxfmt")
- return
- if any(
- not all(isinstance(i, Integer) for i in li.elements)
- for li in (i1, i2)
- ):
- evaluation.message("TextSimilarity", "idxfmt")
- return
- indices1 = [i.value for i in i1.elements]
- indices2 = [i.value for i in i2.elements]
- multiple = True
- elif isinstance(i1, Integer) and isinstance(i2, Integer):
- indices1 = [i1.value]
- indices2 = [i2.value]
- multiple = False
- else:
- evaluation.message("TextSimilarity", "idxfmt")
- return
-
- for index1, index2 in zip(indices1, indices2):
- for i, pos, doc in zip((index1, index2), (1, 2), (doc1, doc2)):
- if i < 1 or i > len(doc):
- evaluation.message(
- "TextSimilarity", "txtidx", i, pos, len(doc)
- )
- return
-
- result = [
- Real(doc1[j1 - 1].similarity(doc2[j2 - 1]))
- for j1, j2 in zip(indices1, indices2)
- ]
-
- if multiple:
- return ListExpression(*result)
- else:
- return result[0]
-
-
-class WordStem(Builtin):
- """
-
- <dl>
- <dt>'WordStem[$word$]'
- <dd>returns a stemmed form of $word$, thereby reducing an inflected form to its root.
-
- <dt>'WordStem[{$word1$, $word2$, ...}]'
- <dd>returns the stemmed form of each word in the list, thereby reducing inflected forms to their roots.
- </dl>
-
- >> WordStem["towers"]
- = tower
-
- >> WordStem[{"heroes", "roses", "knights", "queens"}]
- = {hero, rose, knight, queen}
- """
-
- requires = ("nltk",)
-
- _stemmer = None
-
- @staticmethod
- def _get_porter_stemmer():
- if WordStem._stemmer is None:
- WordStem._stemmer = nltk.stem.porter.PorterStemmer()
- return WordStem._stemmer
-
- @staticmethod
- def porter(w):
- return WordStem._get_porter_stemmer().stem(w)
-
- def eval(self, word: String, evaluation: Evaluation) -> String:
- "WordStem[word_String]"
- stemmer = self._get_porter_stemmer()
- return String(stemmer.stem(word.value))
-
- def eval_list(self, words, evaluation: Evaluation) -> Optional[ListExpression]:
- "WordStem[words_List]"
- if all(isinstance(w, String) for w in words.elements):
- stemmer = self._get_porter_stemmer()
- return ListExpression(
- *[String(stemmer.stem(w.value)) for w in words.elements]
- )
-
-
-class _WordNetBuiltin(Builtin):
- requires = ("nltk",)
-
- options = {
- "Language": '"English"',
- }
-
- messages = {
- "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().",
- "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.',
- # 'load': 'Loading `1` word data. Please wait.',
- "wordnet": "WordNet returned the following error: ``",
- }
-
- _wordnet_instances = {}
-
- def _language_name(self, evaluation: Evaluation, options: dict):
- return self.get_option(options, "Language", evaluation)
-
- def _init_wordnet(self, evaluation: Evaluation, language_name, language_code):
- try:
- wordnet_resource = nltk.data.find("corpora/wordnet2022")
- _init_nltk_maps()
- except LookupError:
- evaluation.message(self.get_name(), "package", "wordnet2022")
- return None
-
- try:
- omw = nltk.corpus.util.LazyCorpusLoader(
- "omw",
- nltk.corpus.reader.CorpusReader,
- r".*/wn-data-.*\.tab",
- encoding="utf8",
- )
- except LookupError:
- evaluation.message(self.get_name(), "package", "omw")
- return None
-
- wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw)
-
- if language_code not in wordnet.langs():
- evaluation.message(
- self.get_name(), "lang", language_name, strip_context(self.get_name())
- )
- return None
-
- return wordnet
-
- def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple:
- language_code = None
- if isinstance(language_name, String):
- language_code = iso639_3.get(language_name.value)
- if not language_code:
- evaluation.message(
- self.get_name(), "lang", language_name, strip_context(self.get_name())
- )
- return None, None
-
- wordnet = _WordNetBuiltin._wordnet_instances.get(language_code)
- if not wordnet:
- try:
- wordnet = self._init_wordnet(evaluation, language_name, language_code)
- except LookupError as e:
- evaluation.message(
- self.get_name(), "package", _parse_nltk_lookup_error(e)
- )
- return None, None
-
- _WordNetBuiltin._wordnet_instances[language_code] = wordnet
-
- return wordnet, language_code
-
- @staticmethod
- def _decode_synset(syn):
- what, pos, nr = (syn.name().split(".") + ["01"])[:3]
- return what.replace("_", " "), pos, nr
-
- @staticmethod
- def _capitalize(s) -> str:
- return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s)
-
- @staticmethod
- def _underscore(s) -> str:
- return re.sub(
- r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s
- ).lower()
-
- @staticmethod
- def _list_syn_form(syn):
- what, pos, nr = _WordNetBuiltin._decode_synset(syn)
-
- def containers():
- for name in syn.lemma_names():
- if name != what:
- yield name
-
- for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()):
- container, _, _ = _WordNetBuiltin._decode_synset(s)
- yield container
-
- for lemma in WordProperty._synonymous_lemmas(syn):
- yield lemma.name()
-
- return what, _wordnet_pos_to_type[pos], containers
-
- @staticmethod
- def syn(syn, wordnet, language_code) -> tuple:
- what, pos, nr = _WordNetBuiltin._decode_synset(syn)
- for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code):
- if s == syn:
- return form
- return what, pos, "Unknown"
-
- @staticmethod
- def _iterate_senses(word, wordnet, language_code):
- if not word:
- return
-
- used = set()
- output_word = word.replace("_", " ")
-
- for syn in wordnet.synsets(word, None, language_code):
- if syn.lexname() in ("noun.location", "noun.person"):
- continue # ignore
-
- what, pos, containers = _WordNetBuiltin._list_syn_form(syn)
-
- for container in containers():
- container = container.replace("_", " ")
- if container != word:
- if container not in used:
- used.add(container)
- yield syn, (
- output_word,
- pos,
- _WordNetBuiltin._capitalize(container),
- )
- break
-
- def _senses(self, word, wordnet, language_code):
- if isinstance(word, tuple): # find forms like ["tree", "Noun", "WoodyPlant"]
- for syn, form in _WordNetBuiltin._iterate_senses(
- word[0], wordnet, language_code
- ):
- if form == word:
- return [[syn, form]]
- else: # find word given as strings, e.g. "tree"
- word = wordnet.morphy(word) # base form, e.g. trees -> tree
- return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code))
-
-
-class WordDefinition(_WordNetBuiltin):
- """
-
- <dl>
- <dt>'WordDefinition[$word$]'
- <dd>returns a definition of $word$, or Missing["NotAvailable"] if $word$ is not known.
- </dl>
-
- >> WordDefinition["gram"]
- = {a metric unit of weight equal to one thousandth of a kilogram}
- """
-
- def eval(self, word, evaluation: Evaluation, options: dict):
- "WordDefinition[word_String, OptionsPattern[WordDefinition]]"
- wordnet, language_code = self._load_wordnet(
- evaluation, self._language_name(evaluation, options)
- )
- if wordnet:
- senses = self._senses(word.value.lower(), wordnet, language_code)
- if senses:
- return ListExpression(*[String(syn.definition()) for syn, _ in senses])
- else:
- return Expression(SymbolMissing, StringNotAvailable)
-
-
-class WordProperty:
- def __init__(self, syn_form, wordnet, language_code):
- self.syn_form = syn_form
- self.wordnet = wordnet
- self.language_code = language_code
-
- def syn(self, syn):
- return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code))
-
- @staticmethod
- def _synonymous_lemmas(syn):
- first_lemma = syn.name().split(".")[0]
- return (s for s in syn.lemmas() if s.name() != first_lemma)
-
- @staticmethod
- def _antonymous_lemmas(syn):
- return (s for lemma in syn.lemmas() for s in lemma.antonyms())
-
- def definitions(self, syn, desc):
- return syn.definition()
-
- def examples(self, syn, desc):
- return syn.examples()
-
- def synonyms(self, syn, desc):
- _, pos, container = desc
- return [
- self.syn_form((s.name().replace("_", " "), pos, container))
- for s in WordProperty._synonymous_lemmas(syn)
- ]
-
- def antonyms(self, syn, desc):
- return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)]
-
- def broader_terms(self, syn, desc):
- return [self.syn(s) for s in syn.hypernyms()]
-
- def narrower_terms(self, syn, desc):
- return [self.syn(s) for s in syn.hyponyms()]
-
- def usage_field(self, syn, desc):
- return syn.usage_domains()
-
- def whole_terms(self, syn, desc):
- return [self.syn(s) for s in syn.part_holonyms()]
-
- def part_terms(self, syn, desc):
- return [self.syn(s) for s in syn.part_meronyms()]
-
- def material_terms(self, syn, desc):
- return [self.syn(s) for s in syn.substance_meronyms()]
-
- def word_net_id(self, syn, desc):
- return syn.offset()
-
- def entailed_terms(self, syn, desc): # e.g. fall to condense
- return [self.syn(s) for s in syn.entailments()]
-
- def causes_terms(self, syn, desc): # e.g. ignite to burn
- return [self.syn(s) for s in syn.causes()]
-
- def inflected_forms(self, syn, desc):
- try:
- word, pos, _ = desc
- if pos == "Verb":
- from pattern.en import lexeme
-
- return [w for w in reversed(lexeme(word)) if w != word]
- elif pos == "Noun":
- from pattern.en import pluralize
-
- return [pluralize(word)]
- elif pos == "Adjective":
- from pattern.en import comparative, superlative
-
- return [comparative(word), superlative(word)]
- else:
- return []
- except ImportError:
- raise MessageException(
- "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern"
- )
-
-
-class _WordListBuiltin(_WordNetBuiltin):
- _dictionary = {}
-
- def _words(self, language_name, ilk, evaluation):
- wordnet, language_code = self._load_wordnet(evaluation, language_name)
-
- if not wordnet:
- return
-
- key = "%s.%s" % (language_code, ilk)
- words = self._dictionary.get(key)
- if not words:
- try:
- if ilk == "All":
- filtered_pos = [None]
- else:
- try:
- filtered_pos = _wordnet_type_to_pos[ilk]
- except KeyError:
- evaluation.message(
- self.get_name(),
- "wordnet",
- "type: %s is should be in %s"
- % (ilk._wordnet_type_to_pos.keys()),
- )
- return
-
- words = []
- for pos in filtered_pos:
- words.extend(list(wordnet.all_lemma_names(pos, language_code)))
- words.sort()
- self._dictionary[key] = words
- except nltk.corpus.reader.wordnet.WordNetError as err:
- evaluation.message(self.get_name(), "wordnet", str(err))
- return
-
- return words
-
-
-class WordData(_WordListBuiltin):
- """
-
- <dl>
- <dt>'WordData[$word$]'
- <dd>returns a list of possible senses of a word.
-
- <dt>'WordData[$word$, $property$]'
- <dd>returns detailed information about a word regarding $property$, e.g. "Definitions" or "Examples".
- </dl>
-
- The following are valid properties:
- <ul>
- <li>Definitions, Examples
- <li>InflectedForms
- <li>Synonyms, Antonyms
- <li>BroaderTerms, NarrowerTerms
- <li>WholeTerms, PartTerms, MaterialTerms
- <li>EntailedTerms, CausesTerms
- <li>UsageField
- <li>WordNetID
- <li>Lookup
- </ul>
-
- >> WordData["riverside", "Definitions"]
- = {{riverside, Noun, Bank} -> the bank of a river}
-
- >> WordData[{"fish", "Verb", "Angle"}, "Examples"]
- = {{fish, Verb, Angle} -> {fish for compliments}}
- """
-
- messages = _merge_dictionaries(
- _WordNetBuiltin.messages,
- {
- "notprop": "WordData[] does not recognize `1` as a valid property.",
- },
- )
-
- def _parse_word(self, word):
- if isinstance(word, String):
- return word.value.lower()
- elif word.get_head_name() == "System`List":
- if len(word.elements) == 3 and all(
- isinstance(s, String) for s in word.elements
- ):
- return tuple(s.value for s in word.elements)
-
- def _standard_property(
- self, py_word, py_form, py_property, wordnet, language_code, evaluation
- ):
- senses = self._senses(py_word, wordnet, language_code)
- if not senses:
- return Expression(SymbolMissing, StringNotAvailable)
- elif py_form == "List":
- word_property = WordProperty(self._short_syn_form, wordnet, language_code)
- property_getter = getattr(
- word_property, "%s" % self._underscore(py_property), None
- )
- if property_getter:
- return to_mathics_list(
- *[property_getter(syn, desc) for syn, desc in senses]
- )
- elif py_form in ("Rules", "ShortRules"):
- syn_form = (lambda s: s) if py_form == "Rules" else (lambda s: s[0])
- word_property = WordProperty(syn_form, wordnet, language_code)
- property_getter = getattr(
- word_property, self._underscore(py_property), None
- )
- if property_getter:
- list_expr_elements = [
- to_expression(SymbolRule, desc, property_getter(syn, desc))
- for syn, desc in senses
- ]
- return to_mathics_list(*list_expr_elements)
- evaluation.message(self.get_name(), "notprop", property)
-
- def _parts_of_speech(self, py_word, wordnet, language_code):
- parts = set(
- syn.pos() for syn, _ in self._senses(py_word, wordnet, language_code)
- )
- if not parts:
- return Expression(SymbolMissing, StringNotAvailable)
- else:
- return ListExpression(
- *[String(s) for s in sorted([_wordnet_pos_to_type[p] for p in parts])]
- )
-
- def _property(
- self, word, py_property, py_form, evaluation: Evaluation, options: dict
- ):
- if py_property == "PorterStem":
- if isinstance(word, String):
- return String(WordStem.porter(word.value))
- else:
- return
-
- wordnet, language_code = self._load_wordnet(
- evaluation, self._language_name(evaluation, options)
- )
- if not wordnet:
- return
-
- py_word = self._parse_word(word)
- if not py_word:
- return
-
- if py_property == "PartsOfSpeech":
- return self._parts_of_speech(py_word, wordnet, language_code)
-
- try:
- return self._standard_property(
- py_word, py_form, py_property, wordnet, language_code, evaluation
- )
- except MessageException as e:
- e.message(evaluation)
-
- def eval(self, word, evaluation: Evaluation, options: dict) -> Optional[Expression]:
- "WordData[word_, OptionsPattern[WordData]]"
- if word.get_head() is SymbolStringExpression:
- return Expression(SymbolDictionaryLookup, word)
- elif isinstance(word, String) or word.get_head() is SymbolList:
- pass
- else:
- return
-
- wordnet, language_code = self._load_wordnet(
- evaluation, self._language_name(evaluation, options)
- )
- if not wordnet:
- return
-
- py_word = self._parse_word(word)
- if not py_word:
- return
-
- senses = self._senses(py_word, wordnet, language_code)
- if senses is not None:
- return ListExpression(*[[String(s) for s in desc] for syn, desc in senses])
-
- def eval_property(self, word, property, evaluation: Evaluation, options: dict):
- "WordData[word_, property_String, OptionsPattern[WordData]]"
- if word.get_head is SymbolStringExpression:
- if property.get_string_value() == "Lookup":
- return Expression(SymbolDictionaryLookup, word)
- elif isinstance(word, String) or word.get_head() is SymbolList:
- return self._property(
- word, property.get_string_value(), "ShortRules", evaluation, options
- )
-
- def eval_property_form(
- self, word, property, form, evaluation: Evaluation, options: dict
- ):
- "WordData[word_, property_String, form_String, OptionsPattern[WordData]]"
- if isinstance(word, String) or word.get_head() is SymbolList:
- return self._property(
- word,
- property.value,
- form.value,
- evaluation,
- options,
- )
-
-
-class DictionaryWordQ(_WordNetBuiltin):
- """
-
- <dl>
- <dt>'DictionaryWordQ[$word$]'
- <dd>returns True if $word$ is a word usually found in dictionaries, and False otherwise.
- </dl>
-
- >> DictionaryWordQ["couch"]
- = True
-
- >> DictionaryWordQ["meep-meep"]
- = False
- """
-
- def eval(self, word, evaluation: Evaluation, options: dict):
- "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]"
- if not isinstance(word, String):
- return False
- wordnet, language_code = self._load_wordnet(
- evaluation, self._language_name(evaluation, options)
- )
- if wordnet:
- if list(wordnet.synsets(word.value.lower(), None, language_code)):
- return SymbolTrue
- else:
- return SymbolFalse
-
-
-class DictionaryLookup(_WordListBuiltin):
- """
-
- <dl>
- <dt>'DictionaryLookup[$word$]'
- <dd>lookup words that match the given $word$ or pattern.
-
- <dt>'DictionaryLookup[$word$, $n$]'
- <dd>lookup first $n$ words that match the given $word$ or pattern.
- </dl>
-
- >> DictionaryLookup["bake" ~~ ___, 3]
- = {bake, bakeapple, baked}
- """
-
- def compile(self, pattern, evaluation):
- re_patt = to_regex(pattern, evaluation)
- if re_patt is None:
- evaluation.message(
- "StringExpression",
- "invld",
- pattern,
- Expression(SymbolStringExpression, pattern),
- )
- return
- re_patt = anchor_pattern(re_patt)
-
- return re.compile(re_patt, flags=re.IGNORECASE)
-
- def search(self, dictionary_words, pattern):
- for dictionary_word in dictionary_words:
- if pattern.match(dictionary_word):
- yield dictionary_word.replace("_", " ")
-
- def lookup(self, language_name, word, n, evaluation):
- pattern = self.compile(word, evaluation)
- if pattern:
- dictionary_words = self._words(language_name, "All", evaluation)
- if dictionary_words is not None:
- matches = self.search(dictionary_words, pattern)
- if n is not None:
- matches = itertools.islice(matches, 0, n)
- return ListExpression(*(String(match) for match in sorted(matches)))
-
- def eval_english(self, word, evaluation):
- "DictionaryLookup[word_]"
- return self.lookup(String("English"), word, None, evaluation)
-
- def eval_language(self, language, word, evaluation):
- "DictionaryLookup[{language_String, word_}]"
- return self.lookup(language, word, None, evaluation)
-
- def eval_english_n(self, word, n, evaluation):
- "DictionaryLookup[word_, n_Integer]"
- return self.lookup(String("English"), word, n.value, evaluation)
-
- def eval_language_n(self, language, word, n, evaluation):
- "DictionaryLookup[{language_String, word_}, n_Integer]"
- return self.lookup(language, word, n.value, evaluation)
-
-
-class WordList(_WordListBuiltin):
- """
-
- <dl>
- <dt>'WordList[]'
- <dd>returns a list of common words.
-
- <dt>'WordList[$type$]'
- <dd>returns a list of common words of type $type$.
- </dl>
-
- >> N[Mean[StringLength /@ WordList["Adjective"]], 2]
- = 9.3
- """
-
- def eval(self, evaluation: Evaluation, options: dict):
- "WordList[OptionsPattern[WordList]]"
- words = self._words(self._language_name(evaluation, options), "All", evaluation)
- if words is not None:
- return to_mathics_list(*words, elements_conversion_fn=String)
-
- def eval_type(self, wordtype, evaluation: Evaluation, options: dict):
- "WordList[wordtype_String, OptionsPattern[WordList]]"
- words = self._words(
- self._language_name(evaluation, options),
- wordtype.value,
- evaluation,
- )
- if words is not None:
- return to_mathics_list(*words, elements_conversion_fn=String)
-
-
-class RandomWord(_WordListBuiltin):
- """
-
- <dl>
- <dt>'RandomWord[]'
- <dd>returns a random word.
-
- <dt>'RandomWord[$type$]'
- <dd>returns a random word of the given $type$, e.g. of type "Noun" or "Adverb".
-
- <dt>'RandomWord[$type$, $n$]'
- <dd>returns $n$ random words of the given $type$.
- </dl>
-
- """
-
- def _random_words(self, type, n, evaluation: Evaluation, options: dict):
- words = self._words(self._language_name(evaluation, options), type, evaluation)
- if words is not None:
- with RandomEnv(evaluation) as rand:
- return [
- String(words[rand.randint(0, len(words) - 1)].replace("_", " "))
- for _ in range(n)
- ]
-
- def eval(self, evaluation: Evaluation, options: dict):
- "RandomWord[OptionsPattern[RandomWord]]"
- words = self._random_words("All", 1, evaluation, options)
- if words:
- return words[0]
-
- def eval_type(self, type, evaluation: Evaluation, options: dict):
- "RandomWord[type_String, OptionsPattern[RandomWord]]"
- words = self._random_words(type.value, 1, evaluation, options)
- if words:
- return words[0]
-
- def eval_type_n(self, type, n, evaluation: Evaluation, options: dict):
- "RandomWord[type_String, n_Integer, OptionsPattern[RandomWord]]"
- words = self._random_words(type.value, n.value, evaluation, options)
- if words:
- return ListExpression(*words)
-
-
-class LanguageIdentify(Builtin):
- """
-
- <dl>
- <dt>'LanguageIdentify[$text$]'
- <dd>returns the name of the language used in $text$.
- </dl>
-
- >> LanguageIdentify["eins zwei drei"]
- = German
- """
-
- def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]:
- "LanguageIdentify[text_String]"
-
- # an alternative: https://github.com/Mimino666/langdetect
-
- code, _ = langid.classify(text.value)
- language = pycountry.languages.get(alpha_2=code)
- if language is None:
- return SymbolFailed
- return String(language.name)
-
-
-class Pluralize(Builtin):
- """
-
- <dl>
- <dt>'Pluralize[$word$]'
- <dd>returns the plural form of $word$.
- </dl>
-
- >> Pluralize["potato"]
- = potatoes
- """
-
- requires = ("pattern",)
-
- def eval(self, word, evaluation):
- "Pluralize[word_String]"
-
- return String(pluralize(word.value))
-
-
-class SpellingCorrectionList(Builtin):
- """
-
- <dl>
- <dt>'SpellingCorrectionList[$word$]'
- <dd>returns a list of suggestions for spelling corrected versions of $word$.
- </dl>
-
- Results may differ depending on which dictionaries can be found by enchant.
-
- >> SpellingCorrectionList["hipopotamus"]
- = {hippopotamus...}
- """
-
- options = {
- "Language": '"English"',
- }
-
- messages = {
- "lang": "SpellingCorrectionList does not support `1` as a language.",
- }
-
- _languages = {
- "English": "en_US", # en_GB, en_AU
- "German": "de_DE",
- "French": "fr_FR",
- }
-
- _dictionaries = {}
-
- def eval(
- self, word: String, evaluation: Evaluation, options: dict
- ) -> Optional[ListExpression]:
- "SpellingCorrectionList[word_String, OptionsPattern[SpellingCorrectionList]]"
-
- language_name = self.get_option(options, "Language", evaluation)
- if not isinstance(language_name, String):
- return
- language_code = SpellingCorrectionList._languages.get(language_name.value, None)
- if not language_code:
- evaluation.message("SpellingCorrectionList", "lang", language_name)
- return
-
- d = SpellingCorrectionList._dictionaries.get(language_code, None)
- if not d:
- d = enchant.Dict(language_code)
- SpellingCorrectionList._dictionaries[language_code] = d
-
- py_word = word.value
-
- if d.check(py_word):
- return ListExpression(word)
- else:
- return to_mathics_list(*d.suggest(py_word), elements_conversion_fn=String)
diff --git a/pymathics/natlang/manipulate.py b/pymathics/natlang/manipulate.py
new file mode 100644
index 0000000..7d1f7e9
--- /dev/null
+++ b/pymathics/natlang/manipulate.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+Word manipulation
+
+This module uses pattern.en to change the form of a word.
+
+"""
+from mathics.builtin.base import Builtin
+from mathics.core.atoms import String
+from mathics.core.evaluation import Evaluation
+from pattern.en import pluralize
+
+sort_order = "Word manipulation"
+
+
+class Pluralize(Builtin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/Pluralize.html
+
+
+ - 'Pluralize[$word$]'
+
+ returns the plural form of $word$.
+
+
+ >> Pluralize["potato"]
+ = potatoes
+ """
+
+ requires = ("pattern",)
+ summary_text = "retrieve the pluralized form of a word"
+
+ def eval(self, word: String, evaluation: Evaluation) -> String:
+ "Pluralize[word_String]"
+
+ return String(pluralize(word.value))
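For reference, a hedged sketch of what the new `manipulate.Pluralize` delegates to, assuming the `pattern` package is installed:

```python
from pattern.en import pluralize

# pattern.en handles regular plurals as well as common irregular ones.
print(pluralize("potato"))  # -> potatoes
print(pluralize("word"))    # -> words
```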
diff --git a/pymathics/natlang/nltk.py b/pymathics/natlang/nltk.py
new file mode 100644
index 0000000..919f75c
--- /dev/null
+++ b/pymathics/natlang/nltk.py
@@ -0,0 +1,323 @@
+# -*- coding: utf-8 -*-
+
+"""
+nltk backend
+"""
+import re
+from itertools import chain
+
+import nltk
+from mathics.builtin.base import Builtin, MessageException
+from mathics.builtin.codetables import iso639_3
+from mathics.core.atoms import String
+from mathics.core.evaluation import Evaluation
+from mathics.core.symbols import strip_context
+
+# Don't consider this for user documentation
+no_doc = True
+
+
+_wordnet_pos_to_type = {}
+_wordnet_type_to_pos = {}
+
+
+def _init_nltk_maps():
+ _wordnet_pos_to_type.update(
+ {
+ nltk.corpus.wordnet.VERB: "Verb",
+ nltk.corpus.wordnet.NOUN: "Noun",
+ nltk.corpus.wordnet.ADJ: "Adjective",
+ nltk.corpus.wordnet.ADJ_SAT: "Adjective",
+ nltk.corpus.wordnet.ADV: "Adverb",
+ }
+ )
+ _wordnet_type_to_pos.update(
+ {
+ "Verb": [nltk.corpus.wordnet.VERB],
+ "Noun": [nltk.corpus.wordnet.NOUN],
+ "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT],
+ "Adverb": [nltk.corpus.wordnet.ADV],
+ }
+ )
+
+
+def _parse_nltk_lookup_error(e):
+ m = re.search(r"Resource '([^']+)' not found\.", str(e))
+ if m:
+ return m.group(1)
+ else:
+ return "unknown"
+
+
+class _WordNetBuiltin(Builtin):
+ requires = ("nltk",)
+
+ options = {
+ "Language": '"English"',
+ }
+
+ messages = {
+ "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().",
+ "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.',
+ # 'load': 'Loading `1` word data. Please wait.',
+ "wordnet": "WordNet returned the following error: ``",
+ }
+
+ _wordnet_instances = {}
+
+ def _language_name(self, evaluation: Evaluation, options: dict):
+ return self.get_option(options, "Language", evaluation)
+
+ def _init_wordnet(self, evaluation: Evaluation, language_name, language_code):
+ try:
+ wordnet_resource = nltk.data.find("corpora/wordnet2022")
+ _init_nltk_maps()
+ except LookupError:
+ evaluation.message(self.get_name(), "package", "wordnet2022")
+ return None
+
+ try:
+ omw = nltk.corpus.util.LazyCorpusLoader(
+ "omw",
+ nltk.corpus.reader.CorpusReader,
+ r".*/wn-data-.*\.tab",
+ encoding="utf8",
+ )
+ except LookupError:
+ evaluation.message(self.get_name(), "package", "omw")
+ return None
+
+ wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw)
+
+ if language_code not in wordnet.langs():
+ evaluation.message(
+ self.get_name(), "lang", language_name, strip_context(self.get_name())
+ )
+ return None
+
+ return wordnet
+
+ def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple:
+ language_code = None
+ if isinstance(language_name, String):
+ language_code = iso639_3.get(language_name.value)
+ if not language_code:
+ evaluation.message(
+ self.get_name(), "lang", language_name, strip_context(self.get_name())
+ )
+ return None, None
+
+ wordnet = _WordNetBuiltin._wordnet_instances.get(language_code)
+ if not wordnet:
+ try:
+ wordnet = self._init_wordnet(evaluation, language_name, language_code)
+ except LookupError as e:
+ evaluation.message(
+ self.get_name(), "package", _parse_nltk_lookup_error(e)
+ )
+ return None, None
+
+ _WordNetBuiltin._wordnet_instances[language_code] = wordnet
+
+ return wordnet, language_code
+
+ @staticmethod
+ def _decode_synset(syn):
+ what, pos, nr = (syn.name().split(".") + ["01"])[:3]
+ return what.replace("_", " "), pos, nr
+
+ @staticmethod
+ def _capitalize(s) -> str:
+ return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s)
+
+ @staticmethod
+ def _underscore(s) -> str:
+ return re.sub(
+ r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s
+ ).lower()
+
+ @staticmethod
+ def _list_syn_form(syn):
+ what, pos, nr = _WordNetBuiltin._decode_synset(syn)
+
+ def containers():
+ for name in syn.lemma_names():
+ if name != what:
+ yield name
+
+ for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()):
+ container, _, _ = _WordNetBuiltin._decode_synset(s)
+ yield container
+
+ for lemma in WordProperty._synonymous_lemmas(syn):
+ yield lemma.name()
+
+ return what, _wordnet_pos_to_type[pos], containers
+
+ @staticmethod
+ def syn(syn, wordnet, language_code) -> tuple:
+ what, pos, nr = _WordNetBuiltin._decode_synset(syn)
+ for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code):
+ if s == syn:
+ return form
+        # Fall back to the raw synset data; convert the POS code to its
+        # public type name, as the matched case does.
+        return what, _wordnet_pos_to_type.get(pos, pos), "Unknown"
+
+ @staticmethod
+ def _iterate_senses(word, wordnet, language_code):
+ if not word:
+ return
+
+ used = set()
+ output_word = word.replace("_", " ")
+
+ for syn in wordnet.synsets(word, None, language_code):
+ if syn.lexname() in ("noun.location", "noun.person"):
+ continue # ignore
+
+ what, pos, containers = _WordNetBuiltin._list_syn_form(syn)
+
+ for container in containers():
+ container = container.replace("_", " ")
+ if container != word:
+ if container not in used:
+ used.add(container)
+ yield syn, (
+ output_word,
+ pos,
+ _WordNetBuiltin._capitalize(container),
+ )
+ break
+
+ def _senses(self, word, wordnet, language_code):
+ if isinstance(word, tuple): # find forms like ["tree", "Noun", "WoodyPlant"]
+ for syn, form in _WordNetBuiltin._iterate_senses(
+ word[0], wordnet, language_code
+ ):
+ if form == word:
+ return [[syn, form]]
+ else: # find word given as strings, e.g. "tree"
+ word = wordnet.morphy(word) # base form, e.g. trees -> tree
+ return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code))
+
+
+class _WordListBuiltin(_WordNetBuiltin):
+ _dictionary = {}
+
+ def _words(self, language_name, ilk, evaluation):
+ wordnet, language_code = self._load_wordnet(evaluation, language_name)
+
+ if not wordnet:
+ return
+
+ key = "%s.%s" % (language_code, ilk)
+ words = self._dictionary.get(key)
+ if not words:
+ try:
+ if ilk == "All":
+ filtered_pos = [None]
+ else:
+ try:
+ filtered_pos = _wordnet_type_to_pos[ilk]
+ except KeyError:
+ evaluation.message(
+ self.get_name(),
+ "wordnet",
+ "type: %s should be in %s"
+ % (ilk, _wordnet_type_to_pos.keys()),
+ )
+ return
+
+ words = []
+ for pos in filtered_pos:
+ words.extend(list(wordnet.all_lemma_names(pos, language_code)))
+ words.sort()
+ self._dictionary[key] = words
+ except nltk.corpus.reader.wordnet.WordNetError as err:
+ evaluation.message(self.get_name(), "wordnet", str(err))
+ return
+
+ return words
+
+
+class WordProperty:
+ def __init__(self, syn_form, wordnet, language_code):
+ self.syn_form = syn_form
+ self.wordnet = wordnet
+ self.language_code = language_code
+
+ def syn(self, syn):
+ return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code))
+
+ @staticmethod
+ def _synonymous_lemmas(syn):
+ first_lemma = syn.name().split(".")[0]
+ return (s for s in syn.lemmas() if s.name() != first_lemma)
+
+ @staticmethod
+ def _antonymous_lemmas(syn):
+ return (s for lemma in syn.lemmas() for s in lemma.antonyms())
+
+ def definitions(self, syn, desc):
+ return syn.definition()
+
+ def examples(self, syn, desc):
+ return syn.examples()
+
+ def synonyms(self, syn, desc):
+ _, pos, container = desc
+ return [
+ self.syn_form((s.name().replace("_", " "), pos, container))
+ for s in WordProperty._synonymous_lemmas(syn)
+ ]
+
+ def antonyms(self, syn, desc):
+ return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)]
+
+ def broader_terms(self, syn, desc):
+ return [self.syn(s) for s in syn.hypernyms()]
+
+ def narrower_terms(self, syn, desc):
+ return [self.syn(s) for s in syn.hyponyms()]
+
+ def usage_field(self, syn, desc):
+ return syn.usage_domains()
+
+ def whole_terms(self, syn, desc):
+ return [self.syn(s) for s in syn.part_holonyms()]
+
+ def part_terms(self, syn, desc):
+ return [self.syn(s) for s in syn.part_meronyms()]
+
+ def material_terms(self, syn, desc):
+ return [self.syn(s) for s in syn.substance_meronyms()]
+
+ def word_net_id(self, syn, desc):
+ return syn.offset()
+
+ def entailed_terms(self, syn, desc): # e.g. fall to condense
+ return [self.syn(s) for s in syn.entailments()]
+
+ def causes_terms(self, syn, desc): # e.g. ignite to burn
+ return [self.syn(s) for s in syn.causes()]
+
+ def inflected_forms(self, syn, desc):
+ try:
+ word, pos, _ = desc
+ if pos == "Verb":
+ from pattern.en import lexeme
+
+ return [w for w in reversed(lexeme(word)) if w != word]
+ elif pos == "Noun":
+ from pattern.en import pluralize
+
+ return [pluralize(word)]
+ elif pos == "Adjective":
+ from pattern.en import comparative, superlative
+
+ return [comparative(word), superlative(word)]
+ else:
+ return []
+ except ImportError:
+ raise MessageException(
+ "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern"
+ )
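`_decode_synset` above relies on WordNet synset names packing lemma, part of speech, and sense number into one dotted string, padding a missing sense number with "01". A self-contained sketch of that decoding:

```python
# A synset name such as "woody_plant.n.01" splits into lemma, POS code,
# and sense number; the + ["01"] pad supplies a default sense number
# when the name carries only lemma and POS.
name = "woody_plant.n.01"
what, pos, nr = (name.split(".") + ["01"])[:3]
print(what.replace("_", " "), pos, nr)  # -> woody plant n 01
```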
diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py
new file mode 100644
index 0000000..baf309c
--- /dev/null
+++ b/pymathics/natlang/normalization.py
@@ -0,0 +1,313 @@
+"""
+
+Text Normalization
+
+See the corresponding
+:WMA:
+https://reference.wolfram.com/language/guide/TextNormalization.html guide.
+
+
+This module uses spacy as a backend.
+"""
+import itertools
+from itertools import islice
+from typing import Optional
+
+import spacy
+from mathics.core.atoms import Integer, String
+from mathics.core.convert.python import from_python
+from mathics.core.evaluation import Evaluation
+from mathics.core.list import ListExpression
+
+from pymathics.natlang.spacy import _cases, _pos_tags, _position, _SpacyBuiltin
+
+sort_order = "Text Normalization"
+
+
+class DeleteStopwords(_SpacyBuiltin):
+ """
+ Delete :stop words:https://en.wikipedia.org/wiki/Stop_word(\
+ :WMA:
+ https://reference.wolfram.com/language/ref/DeleteStopwords.html\
+ )
+
+
+ - 'DeleteStopwords[$list$]'
+
+ returns the words in $list$ without stopwords.
+
+
+ - 'DeleteStopwords[$string$]'
+
+ returns $string$ without stopwords.
+
+
+ ## This has changed since old versions of natlang, and I am
+ ## not sure the old behavior was correct.
+ >> DeleteStopwords[{"Somewhere", "over", "the", "rainbow"}]
+ = ...
+ ## = {rainbow}
+
+ >> DeleteStopwords["There was an Old Man of Apulia, whose conduct was very peculiar"]
+ = Old Man Apulia, conduct peculiar
+ """
+
+ summary_text = "remove stopwords from a text"
+
+ def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression:
+ "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]"
+ is_stop = self._is_stop_lambda(evaluation, options)
+
+        def filter_words(words):
+            for w in words:
+                s = w.get_string_value()
+                # Keep only string elements that are not stopwords; if no
+                # stopword test could be built, keep every string element.
+                if s is not None and (is_stop is None or not is_stop(s)):
+                    yield String(s)
+
+ return ListExpression(*list(filter_words(li.elements)))
+
+ def eval_string(self, s: String, evaluation: Evaluation, options: dict):
+ "DeleteStopwords[s_String, OptionsPattern[DeleteStopwords]]"
+ doc = self._nlp(s.value, evaluation, options)
+ if doc:
+ is_stop = self._is_stop_lambda(evaluation, options)
+ if is_stop:
+
+ def tokens():
+ for token in doc:
+ if not is_stop(token.text):
+ yield token.text_with_ws
+ else:
+ yield token.whitespace_.strip()
+
+ return String("".join(tokens()))
+
+
+class TextCases(_SpacyBuiltin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/TextCases.html
+
+
+ - 'TextCases[$text$, $form$]'
+
+ returns all elements of type $form$ in $text$ in order of their appearance.
+
+
+ >> TextCases["I was in London last year.", "Pronoun"]
+ = {I}
+
+ >> TextCases["I was in London last year.", "City"]
+ = {London}
+
+ ## >> TextCases[Import["ExampleData/EinsteinSzilLetter.txt"], "Person", 3][[2;;3]]
+ ## = {L. Szilard, Joliot}
+
+ >> TextCases["Anne, Peter and Mr Johnes say hello.", "Person", 3][[2;;3]]
+ = {Peter, Johnes}
+
+ """
+
+ summary_text = "list cases of words of a certain form in a text"
+
+ def eval_string_form(
+ self, text: String, form, evaluation: Evaluation, options: dict
+ ):
+ "TextCases[text_String, form_, OptionsPattern[TextCases]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ return ListExpression(*[String(t.text) for t in _cases(doc, form)])
+
+ def eval_string_form_n(
+ self, text: String, form, n: Integer, evaluation: Evaluation, options: dict
+ ):
+ "TextCases[text_String, form_, n_Integer, OptionsPattern[TextCases]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ items = islice((t.text for t in _cases(doc, form)), n.value)
+ return ListExpression(*(from_python(item) for item in items))
+
+
+class TextPosition(_SpacyBuiltin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/TextPosition.html
+
+
+ - 'TextPosition[$text$, $form$]'
+
+ returns the positions of elements of type $form$ in $text$ in order of their appearance.
+
+
+ >> TextPosition["Liverpool and London are two English cities.", "City"]
+ = {{1, 9}, {15, 20}}
+ """
+
+ summary_text = "list the positions of words of a given form in a text"
+
+ def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict):
+ "TextPosition[text_String, form_, OptionsPattern[TextPosition]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ return ListExpression(
+ *[from_python(_position(t)) for t in _cases(doc, form)]
+ )
+
+ def eval_text_form_n(
+ self, text: String, form, n: Integer, evaluation: Evaluation, options: dict
+ ):
+ "TextPosition[text_String, form_, n_Integer, OptionsPattern[TextPosition]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ items = islice((_position(t) for t in _cases(doc, form)), n.value)
+ return ListExpression(*(from_python(item) for item in items))
+
+
+class TextSentences(_SpacyBuiltin):
+ """
+ :Sentences:https://en.wikipedia.org/wiki/Sentence_(linguistics)\
+ in a text (\
+ :WMA:
+ https://reference.wolfram.com/language/ref/TextSentences.html\
+ )
+
+
+
+ - 'TextSentences[$string$]'
+
+ returns the sentences in $string$.
+
+
+ - 'TextSentences[$string$, $n$]'
+
+ returns the first $n$ sentences in $string$.
+
+
+ >> TextSentences["Night and day. Day and night."]
+ = {Night and day., Day and night.}
+
+ >> TextSentences["Night and day. Day and night.", 1]
+ = {Night and day.}
+
+ >> TextSentences["Mr. Jones met Mrs. Jones."]
+ = {Mr. Jones met Mrs. Jones.}
+ """
+
+ summary_text = "list the sentences in a text"
+
+ def eval(self, text: String, evaluation: Evaluation, options: dict):
+ "TextSentences[text_String, OptionsPattern[TextSentences]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ return ListExpression(*[String(sent.text) for sent in doc.sents])
+
+ def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict):
+ "TextSentences[text_String, n_Integer, OptionsPattern[TextSentences]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ return ListExpression(
+ *itertools.islice((String(sent.text) for sent in doc.sents), n.value),
+ )
+
+
+class TextStructure(_SpacyBuiltin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/TextStructure.html
+
+
+ - 'TextStructure[$text$, $form$]'
+
+ returns the grammatical structure of $text$ as $form$.
+
+
+ >> TextStructure["The cat sat on the mat.", "ConstituentString"]
+ = {(Sentence, ((Verb Phrase, (Noun Phrase, (Determiner, The), (Noun, cat)), (Verb, sat), (Prepositional Phrase, (Preposition, on), (Noun Phrase, (Determiner, the), (Noun, mat))), (Punctuation, .))))}
+ """
+
+ _root_pos = set(i for i, names in _pos_tags.items() if names[1])
+ summary_text = "retrieve the grammatical structure of a text"
+
+ def _to_constituent_string(self, node):
+ token, children = node
+ name, phrase_name = _pos_tags.get(token.pos, ("Unknown", "Unknown Phrase"))
+ if not children:
+ return "(%s, %s)" % (name, token.text)
+ else:
+ sub = ", ".join(
+ self._to_constituent_string(next_node) for next_node in children
+ )
+ return "(%s, %s)" % (phrase_name, sub)
+
+ def _to_tree(self, tokens, path=[]):
+ roots = []
+ i = 0
+ while i < len(tokens):
+ token = tokens[i]
+
+ if token in path:
+ roots.append((token, None))
+ i += 1
+ else:
+ root = token
+ while root.head != root and root.head not in path:
+ root = root.head
+
+ sub = list(root.subtree)
+
+ if root.pos not in self._root_pos:
+ roots.extend(self._to_tree(sub, path + [root]))
+ else:
+ roots.append((root, self._to_tree(sub, path + [root])))
+
+ i += len(sub)
+
+ return roots
+
+ def eval(self, text, evaluation: Evaluation, options: dict):
+ 'TextStructure[text_String, "ConstituentString", OptionsPattern[TextStructure]]'
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ tree = self._to_tree(list(doc))
+ sents = ["(Sentence, (%s))" % self._to_constituent_string(x) for x in tree]
+ return ListExpression(*(String(sent) for sent in sents))
+
+
+class TextWords(_SpacyBuiltin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/TextWords.html
+
+
+ - 'TextWords[$string$]'
+
+ returns the words in $string$.
+
+
+ - 'TextWords[$string$, $n$]'
+
+ returns the first $n$ words in $string$.
+
+
+ >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."]
+ = {Hickory, dickory, dock, The, mouse, ran, up, the, clock}
+
+ >> TextWords["Bruder Jakob, Schläfst du noch?", 2]
+ = {Bruder, Jakob}
+
+ """
+
+ summary_text = "list the words in a string"
+
+ def eval(
+ self, text: String, evaluation: Evaluation, options: dict
+ ) -> Optional[ListExpression]:
+ "TextWords[text_String, OptionsPattern[]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ punctuation = spacy.parts_of_speech.PUNCT
+ return ListExpression(
+ *[String(word.text) for word in doc if word.pos != punctuation],
+ )
+
+ def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict):
+ "TextWords[text_String, n_Integer, OptionsPattern[]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ punctuation = spacy.parts_of_speech.PUNCT
+ return ListExpression(
+ *itertools.islice(
+ (String(word.text) for word in doc if word.pos != punctuation),
+ n.value,
+ ),
+ )
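All the `eval_n` methods above truncate lazily rather than materializing the whole token stream. A small sketch of that `islice` pattern:

```python
from itertools import islice

# Take the first n items from a (possibly long) iterator without
# consuming the rest, as TextWords[..., n] and TextSentences[..., n] do.
tokens = iter(["Bruder", "Jakob", "Schläfst", "du", "noch"])
print(list(islice(tokens, 2)))  # -> ['Bruder', 'Jakob']
```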
diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py
new file mode 100644
index 0000000..b87b814
--- /dev/null
+++ b/pymathics/natlang/spacy.py
@@ -0,0 +1,251 @@
+# -*- coding: utf-8 -*-
+
+"""
+Spacy tools
+
+"""
+
+# TODO: move the low-level implementation that depends on spacy here
+
+import heapq
+import re
+from typing import Optional
+
+import spacy
+from mathics.builtin.base import Builtin
+from mathics.core.atoms import String
+from mathics.core.evaluation import Evaluation
+from mathics.core.symbols import strip_context
+from mathics.core.systemsymbols import SymbolAlternatives
+from spacy.tokens import Span
+
+no_doc = True
+
+# Mathics3 named entity names and their corresponding constants in spacy.
+symbols = {
+ "Person": spacy.symbols.PERSON,
+ "Company": spacy.symbols.ORG,
+ "Quantity": spacy.symbols.QUANTITY,
+ "Number": spacy.symbols.CARDINAL,
+ "CurrencyAmount": spacy.symbols.MONEY,
+ "Country": spacy.symbols.GPE, # also includes cities and states
+ "City": spacy.symbols.GPE, # also includes countries and states
+}
+
+# Part of speech tags and their public interface names in Mathics
+# see http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf
+_pos_tags = {
+ spacy.parts_of_speech.ADJ: ("Adjective", ""),
+ spacy.parts_of_speech.ADP: ("Preposition", "Prepositional Phrase"),
+ spacy.parts_of_speech.ADV: ("Adverb", ""),
+ spacy.parts_of_speech.CONJ: ("Conjunct", ""),
+ spacy.parts_of_speech.DET: ("Determiner", ""),
+ spacy.parts_of_speech.INTJ: ("Interjection", ""),
+ spacy.parts_of_speech.NOUN: ("Noun", "Noun Phrase"),
+ spacy.parts_of_speech.NUM: ("Number", ""),
+ spacy.parts_of_speech.PART: ("Particle", ""),
+ spacy.parts_of_speech.PRON: ("Pronoun", ""),
+    spacy.parts_of_speech.PROPN: ("ProperNoun", ""),
+ spacy.parts_of_speech.PUNCT: ("Punctuation", ""),
+ spacy.parts_of_speech.SCONJ: ("Sconj", ""),
+ spacy.parts_of_speech.SYM: ("Symbol", ""),
+ spacy.parts_of_speech.VERB: ("Verb", "Verb Phrase"),
+ spacy.parts_of_speech.X: ("X", ""),
+ spacy.parts_of_speech.EOL: ("EOL", ""),
+ spacy.parts_of_speech.SPACE: ("Space", ""),
+}
+
+
+def _cases(doc, form):
+ if isinstance(form, String):
+ generators = [_forms.get(form.value)]
+ elif form.get_head() is SymbolAlternatives:
+ if not all(isinstance(f, String) for f in form.elements):
+ return # error
+ generators = [_forms.get(f.value) for f in form.elements]
+ elif form.has_form("Pymathics`Containing", 2):
+ for t in _containing(doc, *form.elements):
+ yield t
+ return
+ else:
+ return # error
+
+ def try_next(iterator):
+ try:
+ return next(iterator)
+ except StopIteration:
+ return None
+
+ feeds = []
+ for i, iterator in enumerate(
+ [iter(generator(doc)) for generator in generators if generator]
+ ):
+ t = try_next(iterator)
+ if t:
+ feeds.append((_position(t), i, t, iterator))
+ heapq.heapify(feeds)
+ while feeds:
+ pos, i, token, iterator = heapq.heappop(feeds)
+ yield token
+ t = try_next(iterator)
+ if t:
+ heapq.heappush(feeds, (_position(t), i, t, iterator))
+
+
+def _containing(doc, outer, inner):
+ if not isinstance(outer, String):
+ return # error
+ outer_generator = _forms.get(outer.value)
+ inner_iter = _cases(doc, inner)
+ inner_start = None
+ produce_t = False
+ try:
+ for t in outer_generator(doc):
+ start, end = _position(t)
+ if inner_start is not None and inner_start < end:
+ produce_t = True
+ if produce_t:
+ yield t
+ produce_t = False
+ while True:
+ inner_start, inner_end = _position(next(inner_iter))
+ if inner_end > start:
+ break
+ if inner_start < end:
+ produce_t = True
+ except StopIteration:
+ pass
+
+
+def _fragments(doc, sep):
+ start = 0
+ for i, token in enumerate(doc):
+ if sep.match(token.text):
+ yield Span(doc, start, i)
+ start = i + 1
+ end = len(doc)
+ if start < end:
+ yield Span(doc, start, end)
+
+
+def _make_forms():
+ forms = {
+ "Word": lambda doc: (token for token in doc),
+ "Sentence": lambda doc: (sent for sent in doc.sents),
+ "Paragraph": lambda doc: _fragments(doc, re.compile(r"^[\n][\n]+$")),
+ "Line": lambda doc: _fragments(doc, re.compile(r"^[\n]$")),
+ "URL": lambda doc: (token for token in doc if token.orth_.like_url()),
+ "EmailAddress": lambda doc: (
+ token for token in doc if token.orth_.like_email()
+ ),
+ }
+
+ def filter_named_entity(label):
+ def generator(doc):
+ for ent in doc.ents:
+ if ent.label == label:
+ yield ent
+
+ return generator
+
+ def filter_pos(pos):
+ def generator(doc):
+ for token in doc:
+ if token.pos == pos:
+ yield token
+
+ return generator
+
+ for name, symbol in symbols.items():
+ forms[name] = filter_named_entity(symbol)
+
+ for tag, names in _pos_tags.items():
+ name, phrase_name = names
+ forms[name] = filter_pos(tag)
+
+ return forms
+
+
+# forms are everything one can use in TextCases[] or TextPosition[].
+_forms = _make_forms()
+
+
+def _position(t):
+ if isinstance(t, Span):
+ i = t.doc[t.start]
+ r = t.doc[t.end - 1]
+ return 1 + i.idx, r.idx + len(r.text)
+ else:
+ return 1 + t.idx, t.idx + len(t.text)
+
+
+class _SpacyBuiltin(Builtin):
+ requires = ("spacy",)
+
+ options = {
+ "Language": '"English"',
+ }
+
+ messages = {
+ "runtime": "Spacy gave the following error: ``",
+ "lang": 'Language "`1`" is currently not supported with `2`[].',
+ }
+
+ _language_codes = {
+ "English": "en",
+ "German": "de",
+ }
+
+ _spacy_instances = {}
+
+ def _load_spacy(self, evaluation: Evaluation, options: dict):
+ language_code = None
+ language_name = self.get_option(options, "Language", evaluation)
+ if language_name is None:
+ language_name = String("Undefined")
+ if isinstance(language_name, String):
+ language_code = _SpacyBuiltin._language_codes.get(language_name.value)
+ if not language_code:
+ evaluation.message(
+ self.get_name(), "lang", language_name, strip_context(self.get_name())
+ )
+ return None
+
+ instance = _SpacyBuiltin._spacy_instances.get(language_code)
+ if instance:
+ return instance
+
+        try:
+            # spaCy model names are not uniform across languages
+            # (en_core_web_md, but de_core_news_md), so map the language
+            # code to its model name explicitly.
+            model_name = {
+                "en": "en_core_web_md",
+                "de": "de_core_news_md",
+            }.get(language_code, f"{language_code}_core_web_md")
+            instance = spacy.load(model_name)
+
+            # The "via" parameter of spacy.load no longer exists, so the
+            # MATHICS3_SPACY_DATA environment variable it supported is now
+            # ignored.
+
+            _SpacyBuiltin._spacy_instances[language_code] = instance
+            return instance
+        except (OSError, RuntimeError) as e:
+            # spacy.load raises OSError when a model is not installed.
+            evaluation.message(self.get_name(), "runtime", str(e))
+            return None
+
+ def _nlp(self, text, evaluation, options) -> Optional[spacy.tokens.doc.Doc]:
+ nlp = self._load_spacy(evaluation, options)
+ if not nlp:
+ return None
+ return nlp(text)
+
+ def _is_stop_lambda(self, evaluation: Evaluation, options: dict):
+ nlp = self._load_spacy(evaluation, options)
+ if not nlp:
+ return None
+
+ vocab = nlp.vocab
+
+ def is_stop(word):
+ return vocab[word].is_stop
+
+ return is_stop
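`_cases` above merges several per-form token generators into one stream ordered by text position, via a heap keyed on `(start, end)`. A stripped-down sketch of that merge, with plain position tuples standing in for spacy tokens:

```python
import heapq

# Two already-ordered streams of (start, end) text positions.
streams = [iter([(1, 3), (10, 14)]), iter([(5, 8)])]

feeds = []
for i, it in enumerate(streams):
    t = next(it, None)
    if t is not None:
        feeds.append((t, i, it))  # i breaks ties so iterators never compare
heapq.heapify(feeds)

merged = []
while feeds:
    pos, i, it = heapq.heappop(feeds)
    merged.append(pos)
    t = next(it, None)
    if t is not None:
        heapq.heappush(feeds, (t, i, it))

print(merged)  # -> [(1, 3), (5, 8), (10, 14)]
```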
diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py
new file mode 100644
index 0000000..01300d2
--- /dev/null
+++ b/pymathics/natlang/textual_analysis.py
@@ -0,0 +1,341 @@
+# -*- coding: utf-8 -*-
+"""
+Text Analysis
+
+See the corresponding :WMA:
+https://reference.wolfram.com/language/guide/TextAnalysis.html guide.
+"""
+
+# This module uses enchant, nltk, and spacy. Maybe we want to split it further.
+
+from typing import Optional
+
+import enchant
+import nltk
+import spacy
+from mathics.builtin.base import Builtin
+from mathics.core.atoms import Integer, Real, String
+from mathics.core.evaluation import Evaluation
+from mathics.core.expression import Expression
+from mathics.core.list import ListExpression
+from mathics.core.symbols import SymbolList, SymbolTrue
+from mathics.eval.nevaluator import eval_N
+
+from pymathics.natlang.spacy import _SpacyBuiltin
+from pymathics.natlang.util import merge_dictionaries
+
+sort_order = "Text Analysis"
+
+
+class Containing(Builtin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/Containing.html
+
+
+ - 'Containing[$outer$, $inner$]'
+
+ represents an object of type $outer$ containing objects\
+ of type $inner$.
+
+ 'Containing' can be used as the second parameter in 'TextCases' and 'TextPosition'.
+
+ Supported $outer$ strings are in {"Word", "Sentence", "Paragraph", "Line", "URL", "EmailAddress"}.
+
+ Supported $inner$ strings are in {"Person", "Company", "Quantity", "Number", "CurrencyAmount",
+ "Country", "City"}.
+
+ The implementation of this symbol is based on `spacy`.
+
+ >> TextCases["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]]
+ = {This is another pencil from England.}
+ >> TextPosition["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]]
+ = {{19, 54}}
+
+ """
+
+ # This is implemented in ``pymathics.natlang.spacy._containing``
+ summary_text = "specify a container for matching"
+
+
+class SpellingCorrectionList(Builtin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/SpellingCorrectionList.html
+
+
+ - 'SpellingCorrectionList[$word$]'
+
+ returns a list of suggestions for spelling corrected versions of $word$.
+
+
+ Results may differ depending on which dictionaries can be found by enchant.
+
+ >> SpellingCorrectionList["hipopotamus"]
+ = {hippopotamus...}
+ """
+
+ options = {
+ "Language": '"English"',
+ }
+
+ messages = {
+ "lang": "SpellingCorrectionList does not support `1` as a language.",
+ }
+
+ _languages = {
+ "English": "en_US", # en_GB, en_AU
+ "German": "de_DE",
+ "French": "fr_FR",
+ }
+
+ _dictionaries = {}
+
+ summary_text = "look for spelling correction candidates of a word"
+
+ def eval(
+ self, word: String, evaluation: Evaluation, options: dict
+ ) -> Optional[ListExpression]:
+ "SpellingCorrectionList[word_String, OptionsPattern[SpellingCorrectionList]]"
+
+ language_name = self.get_option(options, "Language", evaluation)
+ if not isinstance(language_name, String):
+ return
+ language_code = SpellingCorrectionList._languages.get(language_name.value, None)
+ if not language_code:
+ evaluation.message("SpellingCorrectionList", "lang", language_name)
+ return
+
+ d = SpellingCorrectionList._dictionaries.get(language_code, None)
+ if not d:
+ d = enchant.Dict(language_code)
+ SpellingCorrectionList._dictionaries[language_code] = d
+
+ py_word = word.value
+
+ if d.check(py_word):
+ return ListExpression(word)
+ else:
+ return ListExpression(*(String(word) for word in d.suggest(py_word)))
+
+
+class WordCount(_SpacyBuiltin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/WordCount.html
+
+
+ - 'WordCount[$string$]'
+
+ returns the number of words in $string$.
+
+
+ >> WordCount["A long time ago"]
+ = 4
+ """
+
+ summary_text = "count words in a text"
+
+ def eval(self, text, evaluation: Evaluation, options: dict):
+ "WordCount[text_String, OptionsPattern[WordCount]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if doc:
+ punctuation = spacy.parts_of_speech.PUNCT
+ return Integer(sum(1 for word in doc if word.pos != punctuation))
+
+
+class WordFrequency(_SpacyBuiltin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/WordFrequency.html
+
+
+ - 'WordFrequency[$text$, $word$]'
+
+ returns the relative frequency of $word$ in $text$.
+
+
+ $word$ may also specify multiple words using $a$ | $b$ | ...
+
+ ## Problem with import for certain characters in the text.
+ ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"];
+ >> text = "I have a dairy cow, it's not just any cow. She gives me milkshake, oh what a salty cow. She is the best cow in the county.";
+
+ >> WordFrequency[text, "a" | "the"]
+ = 0.121212
+
+ >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True]
+ = 0.5
+ """
+
+    # Merge into a new dict; updating _SpacyBuiltin.options in place would
+    # leak IgnoreCase into every other spacy-based builtin.
+    options = merge_dictionaries(_SpacyBuiltin.options, {"IgnoreCase": "False"})
+ summary_text = "retrieve the frequency of a word in a text"
+
+ def eval(
+ self, text: String, word, evaluation: Evaluation, options: dict
+ ) -> Optional[Expression]:
+ "WordFrequency[text_String, word_, OptionsPattern[WordFrequency]]"
+ doc = self._nlp(text.value, evaluation, options)
+ if not doc:
+ return
+ if isinstance(word, String):
+ words = set([word.value])
+ elif word.get_head_name() == "System`Alternatives":
+ if not all(isinstance(a, String) for a in word.elements):
+ return # error
+ words = set(a.value for a in word.elements)
+ else:
+ return # error
+
+ ignore_case = self.get_option(options, "IgnoreCase", evaluation) is SymbolTrue
+ if ignore_case:
+ words = [w.lower() for w in words]
+ n = 0
+ for token in doc:
+ token_text = token.text
+ if ignore_case:
+ token_text = token_text.lower()
+ if token_text in words:
+ n += 1
+ return eval_N(Integer(n) / Integer(len(doc)), evaluation)
+
+
+class WordSimilarity(_SpacyBuiltin):
+ """
+
+ :WMA link:
+ https://reference.wolfram.com/language/ref/WordSimilarity.html
+
+
+ - 'WordSimilarity[$text1$, $text2$]'
+
+ returns a real-valued measure of semantic similarity of two texts or words.
+
+
+ - 'WordSimilarity[{$text1$, $i1$}, {$text2$, $j1$}]'
+
+ returns a measure of similarity of two words within two texts.
+
+
+ - 'WordSimilarity[{$text1$, {$i1$, $i2$, ...}}, {$text2$, {$j1$, $j2$, ...}}]'
+
+ returns a measure of similarity of multiple words within two texts.
+
+
+ >> NumberForm[WordSimilarity["car", "train"], 3]
+ = 0.439
+
+ >> NumberForm[WordSimilarity["car", "hedgehog"], 3]
+ = 0.195
+
+ >> NumberForm[WordSimilarity[{"An ocean full of water.", {2, 2}}, { "A desert full of sand.", {2, 5}}], 3]
+ = {0.505, 0.481}
+ """
+
+ messages = merge_dictionaries(
+ _SpacyBuiltin.messages,
+ {
+ "txtidx": "Index `1` in position `2` must be between 1 and `3`.",
+ "idxfmt": "Indices must be integers or lists of integers of the same length.",
+ },
+ )
+ summary_text = "measure similarity of two texts"
+
+ def eval(
+ self, text1: String, text2: String, evaluation: Evaluation, options: dict
+ ) -> Optional[Real]:
+ "WordSimilarity[text1_String, text2_String, OptionsPattern[WordSimilarity]]"
+ doc1 = self._nlp(text1.value, evaluation, options)
+ if doc1:
+ doc2 = self._nlp(text2.value, evaluation, options)
+ if doc2:
+ return Real(doc1.similarity(doc2))
+
+ def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict):
+ "WordSimilarity[{text1_String, i1_}, {text2_String, i2_}, OptionsPattern[WordSimilarity]]"
+ doc1 = self._nlp(text1.value, evaluation, options)
+ if doc1:
+ if text2.value == text1.value:
+ doc2 = doc1
+ else:
+ doc2 = self._nlp(text2.value, evaluation, options)
+ if doc2:
+ if i1.get_head() is SymbolList and i2.get_head() is SymbolList:
+ if len(i1.elements) != len(i2.elements):
+ evaluation.message("TextSimilarity", "idxfmt")
+ return
+ if any(
+ not all(isinstance(i, Integer) for i in li.elements)
+ for li in (i1, i2)
+ ):
+ evaluation.message("TextSimilarity", "idxfmt")
+ return
+ indices1 = [i.value for i in i1.elements]
+ indices2 = [i.value for i in i2.elements]
+ multiple = True
+ elif isinstance(i1, Integer) and isinstance(i2, Integer):
+ indices1 = [i1.value]
+ indices2 = [i2.value]
+ multiple = False
+ else:
+ evaluation.message("TextSimilarity", "idxfmt")
+ return
+
+ for index1, index2 in zip(indices1, indices2):
+ for i, pos, doc in zip((index1, index2), (1, 2), (doc1, doc2)):
+ if i < 1 or i > len(doc):
+                            evaluation.message(
+                                "WordSimilarity", "txtidx", i, pos, len(doc)
+                            )
+ return
+
+ result = [
+ Real(doc1[j1 - 1].similarity(doc2[j2 - 1]))
+ for j1, j2 in zip(indices1, indices2)
+ ]
+
+ if multiple:
+ return ListExpression(*result)
+ else:
+ return result[0]
+
+
+class WordStem(Builtin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/WordStem.html
+
+
+ - 'WordStem[$word$]'
+
+ returns a stemmed form of $word$, thereby reducing an inflected form to its root.
+
+
+ - 'WordStem[{$word1$, $word2$, ...}]'
+
+ returns the stemmed form of each word in the list, thereby reducing inflected forms to their roots.
+
+
+ >> WordStem["towers"]
+ = tower
+
+ >> WordStem[{"heroes", "roses", "knights", "queens"}]
+ = {hero, rose, knight, queen}
+ """
+
+ _stemmer = None
+
+ requires = ("nltk",)
+ summary_text = "retrieve the stem of a word"
+
+ @staticmethod
+ def _get_porter_stemmer():
+ if WordStem._stemmer is None:
+ WordStem._stemmer = nltk.stem.porter.PorterStemmer()
+ return WordStem._stemmer
+
+ @staticmethod
+ def porter(w):
+ return WordStem._get_porter_stemmer().stem(w)
+
+ def eval(self, word: String, evaluation: Evaluation) -> String:
+ "WordStem[word_String]"
+ stemmer = self._get_porter_stemmer()
+ return String(stemmer.stem(word.value))
+
+ def eval_list(self, words, evaluation: Evaluation) -> Optional[ListExpression]:
+ "WordStem[words_List]"
+ if all(isinstance(w, String) for w in words.elements):
+ stemmer = self._get_porter_stemmer()
+ return ListExpression(
+ *[String(stemmer.stem(w.value)) for w in words.elements]
+ )
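A quick sketch of the Porter stemmer that `WordStem` caches in `_get_porter_stemmer`, assuming nltk is installed (the expected outputs match the doctests above):

```python
import nltk

# nltk's PorterStemmer reduces inflected forms to their stems.
stemmer = nltk.stem.porter.PorterStemmer()
print(stemmer.stem("towers"))  # -> tower
print([stemmer.stem(w) for w in ["heroes", "roses"]])  # -> ['hero', 'rose']
```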
diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py
new file mode 100644
index 0000000..a3aecd1
--- /dev/null
+++ b/pymathics/natlang/translation.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+
+"""
+Language Translation
+
+
+"""
+
+# This is under Text Normalization in WR, but also under Natural Language
+# Processing and Linguistic Data. It lives here because this is the only
+# module that uses the langid and pycountry modules.
+#
+# TODO: WordTranslation, TextTranslation
+
+from typing import Union
+
+import langid # see https://github.com/saffsd/langid.py
+import pycountry
+from mathics.builtin.base import Builtin
+from mathics.core.atoms import String
+from mathics.core.evaluation import Evaluation
+from mathics.core.symbols import Symbol
+from mathics.core.systemsymbols import SymbolFailed
+
+sort_order = "Language Translation"
+
+
+class LanguageIdentify(Builtin):
+ """
+ :WMA link:
+ https://reference.wolfram.com/language/ref/LanguageIdentify.html
+
+
+ - 'LanguageIdentify[$text$]'
+
+ returns the name of the language used in $text$.
+
+
+ >> LanguageIdentify["eins zwei drei"]
+ = German
+ """
+
+ summary_text = "determine the predominant human language in a string"
+
+ def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]:
+ "LanguageIdentify[text_String]"
+
+ # an alternative: https://github.com/Mimino666/langdetect
+
+ code, _ = langid.classify(text.value)
+ language = pycountry.languages.get(alpha_2=code)
+ if language is None:
+ return SymbolFailed
+ return String(language.name)
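A sketch of the two-step lookup `LanguageIdentify` performs, assuming the langid and pycountry packages are available:

```python
import langid  # classifier returning an (ISO 639-1 code, score) pair
import pycountry

code, _score = langid.classify("eins zwei drei")
# Map the two-letter code to a human-readable language name.
language = pycountry.languages.get(alpha_2=code)
print(language.name if language else "unknown")  # -> German
```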
diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py
new file mode 100644
index 0000000..383c55c
--- /dev/null
+++ b/pymathics/natlang/util.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+"""
+utils
+"""
+
+# Don't consider this for user documentation
+no_doc = True
+
+
+def merge_dictionaries(a, b):
+ c = a.copy()
+ c.update(b)
+ return c
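`merge_dictionaries` returns a fresh dict rather than mutating either input, which is what makes it safe for extending shared class-level option dicts, as `WordFrequency` does above. A usage sketch:

```python
from pymathics.natlang.util import merge_dictionaries

base = {"Language": '"English"'}
extended = merge_dictionaries(base, {"IgnoreCase": "False"})
print(extended)              # contains both keys
print("IgnoreCase" in base)  # -> False: base is untouched
```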
diff --git a/setup.py b/setup.py
index 57d7097..ef44e46 100644
--- a/setup.py
+++ b/setup.py
@@ -1,11 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-import sys
-import platform
import os
import os.path as osp
-from setuptools import setup, find_namespace_packages
+import platform
+import sys
+
+from setuptools import find_namespace_packages, setup
# Ensure user has the correct Python version
if sys.version_info < (3, 6):
diff --git a/test/consistency-and-style/test_summary_text.py b/test/consistency-and-style/test_summary_text.py
index dd99c95..05deaa4 100644
--- a/test/consistency-and-style/test_summary_text.py
+++ b/test/consistency-and-style/test_summary_text.py
@@ -5,12 +5,11 @@
import pkgutil
import pytest
-
-from pymathics.natlang import __file__ as module_initfile_path
from mathics.builtin import name_is_builtin_symbol
from mathics.builtin.base import Builtin
from mathics.doc.common_doc import skip_doc
+from pymathics.natlang import __file__ as module_initfile_path
# Get file system path name for mathics.builtin
module_path = osp.dirname(module_initfile_path)
diff --git a/test/test_natlang.py b/test/test_natlang.py
index a64b31e..e2adc86 100644
--- a/test/test_natlang.py
+++ b/test/test_natlang.py
@@ -16,6 +16,11 @@ def test_natlang():
"4",
"WordCount",
),
+ (
+ "Length[WordList[]]>10000",
+ "True",
+ "WordList",
+ ),
(
'TextWords["Hickory, dickory, dock! The mouse ran up the clock."]',
'{"Hickory", "dickory", "dock", "The", "mouse", "ran", "up", "the", "clock"}',