From f840599e1d1e0db4e23940a8848a4abe52aedaeb Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 17:49:56 -0300 Subject: [PATCH 01/14] split main in modules --- Makefile | 5 +- pymathics/natlang/__init__.py | 25 +- pymathics/natlang/linguistic_data.py | 350 ++++ pymathics/natlang/main.py | 1506 ----------------- pymathics/natlang/normalization.py | 301 ++++ pymathics/natlang/spacy.py | 249 +++ pymathics/natlang/textual_analysis.py | 427 +++++ pymathics/natlang/translation.py | 45 + pymathics/natlang/util.py | 328 ++++ setup.py | 7 +- .../test_summary_text.py | 3 +- 11 files changed, 1722 insertions(+), 1524 deletions(-) create mode 100644 pymathics/natlang/linguistic_data.py delete mode 100644 pymathics/natlang/main.py create mode 100644 pymathics/natlang/normalization.py create mode 100644 pymathics/natlang/spacy.py create mode 100644 pymathics/natlang/textual_analysis.py create mode 100644 pymathics/natlang/translation.py create mode 100644 pymathics/natlang/util.py diff --git a/Makefile b/Makefile index a4ac90b..6de236d 100644 --- a/Makefile +++ b/Makefile @@ -72,7 +72,7 @@ pytest: doctest: - MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline -l pymathics.natlang -c "Natural Language Processing" $o + MATHICS_CHARACTER_ENCODING="ASCII" $(PYTHON) -m mathics.docpipeline -l pymathics.natlang -c 'Natural Language Processing' $o # #: Make Mathics PDF manual @@ -89,5 +89,4 @@ ChangeLog: rmChangeLog #: Run pytest consistency and style checks check-consistency-and-style: - # MATHICS_LINT=t $(PYTHON) -m pytest test/consistency-and-style - echo "check-consistency-and-style deactivated. Activate me later. " + MATHICS_LINT=t $(PYTHON) -m pytest test/consistency-and-style diff --git a/pymathics/natlang/__init__.py b/pymathics/natlang/__init__.py index f3b883e..c0d76ae 100644 --- a/pymathics/natlang/__init__.py +++ b/pymathics/natlang/__init__.py @@ -38,28 +38,32 @@ = Old Man Apulia, conduct peculiar """ - -from pymathics.natlang.main import ( - DeleteStopwords, - DictionaryLookup, - DictionaryWordQ, - LanguageIdentify, +from pymathics.natlang.linguistic_data import ( Pluralize, RandomWord, - SpellingCorrectionList, + WordData, + WordDefinition, + WordList, +) +from pymathics.natlang.normalization import ( + DeleteStopwords, TextCases, TextPosition, TextSentences, TextStructure, TextWords, +) +from pymathics.natlang.textual_analysis import ( + Containing, + DictionaryLookup, + DictionaryWordQ, + SpellingCorrectionList, WordCount, - WordData, - WordDefinition, WordFrequency, - WordList, WordSimilarity, WordStem, ) +from pymathics.natlang.translation import LanguageIdentify from pymathics.natlang.version import __version__ pymathics_version_data = { @@ -70,6 +74,7 @@ } __all__ = [ + "Containing", "DeleteStopwords", "DictionaryLookup", "DictionaryWordQ", diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py new file mode 100644 index 0000000..942ff48 --- /dev/null +++ b/pymathics/natlang/linguistic_data.py @@ -0,0 +1,350 @@ +# -*- coding: utf-8 -*- +""" +Linguistic Data + +See :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. 
+ +""" + + +# TODO: Complete me + +# WordFrequencyData — data on typical current and historical word frequencies +# Synonyms — synonyms for a word +# Antonyms — antonyms for a word +# PartOfSpeech — possible parts of speech for a word + + +from typing import Optional + +from mathics.builtin.base import Builtin, MessageException + +# from mathics.builtin.codetables import iso639_3 +from mathics.builtin.numbers.randomnumbers import RandomEnv +from mathics.core.atoms import String +from mathics.core.convert.expression import Expression, to_expression +from mathics.core.evaluation import Evaluation +from mathics.core.list import ListExpression +from mathics.core.symbols import Symbol, SymbolList +from mathics.core.systemsymbols import SymbolMissing, SymbolRule, SymbolStringExpression +from pattern.en import pluralize + +from pymathics.natlang.textual_analysis import WordStem +from pymathics.natlang.util import ( + WordProperty, + _WordListBuiltin, + _wordnet_pos_to_type, + _WordNetBuiltin, + merge_dictionaries, +) + +SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup") +StringNotAvailable = String("NotAvailable") + + +class Pluralize(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/Pluralize.html + +
+
'Pluralize[$word$]' +
returns the plural form of $word$. +
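+
Pluralization is done with the English inflection rules of the 'pattern' library, which must be installed.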
+ + >> Pluralize["potato"] + = potatoes + """ + + requires = ("pattern",) + summary_text = "Retrieve the pluralized form of a word" + + def eval(self, word, evaluation): + "Pluralize[word_String]" + + return String(pluralize(word.value)) + + +class RandomWord(_WordListBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/RandomWord.html + +
+
'RandomWord[]' +
returns a random word. + +
'RandomWord[$type$]' +
returns a random word of the given $type$, e.g. "Noun" or "Adverb".
'RandomWord[$type$, $n$]' +
returns $n$ random words of the given $type$. + + >> RandomWord["Noun"] + = ... + >> RandomWord["Noun", 3] + = {..., ..., ...} + +
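+    The available word types are the WordNet parts of speech: "Noun", "Verb",
+    "Adjective" and "Adverb"; "All" draws from every part of speech.
+
+    >> RandomWord["Adverb", 2]
+    = {..., ...}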
+ """ + + summary_text = "generate a random word of a given kind" + + def _random_words(self, type, n, evaluation: Evaluation, options: dict): + words = self._words(self._language_name(evaluation, options), type, evaluation) + if words is not None: + with RandomEnv(evaluation) as rand: + return [ + String(words[rand.randint(0, len(words) - 1)].replace("_", " ")) + for _ in range(n) + ] + + def eval(self, evaluation: Evaluation, options: dict): + "RandomWord[OptionsPattern[RandomWord]]" + words = self._random_words("All", 1, evaluation, options) + if words: + return words[0] + + def eval_type(self, type, evaluation: Evaluation, options: dict): + "RandomWord[type_String, OptionsPattern[RandomWord]]" + words = self._random_words(type.value, 1, evaluation, options) + if words: + return words[0] + + def eval_type_n(self, type, n, evaluation: Evaluation, options: dict): + "RandomWord[type_String, n_Integer, OptionsPattern[RandomWord]]" + words = self._random_words(type.value, n.value, evaluation, options) + if words: + return ListExpression(*words) + + +class WordData(_WordListBuiltin): + """ + + :WMA: + https://reference.wolfram.com/language/ref/WordData.html + +
+
'WordData[$word$]' +
returns a list of the possible senses of $word$.
'WordData[$word$, $property$]' +
returns detailed information about $word$ regarding $property$, e.g. "Definitions" or "Examples".
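+
'WordData[$word$, $property$, $form$]'
returns the information about $property$ formatted as $form$, which may be "Rules", "ShortRules" or "List".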
+ + The following are valid properties: + + + >> WordData["riverside", "Definitions"] + = {{riverside, Noun, Bank} -> the bank of a river} + + >> WordData[{"fish", "Verb", "Angle"}, "Examples"] + = {{fish, Verb, Angle} -> {fish for compliments}} + """ + + messages = merge_dictionaries( + _WordNetBuiltin.messages, + { + "notprop": "WordData[] does not recognize `1` as a valid property.", + }, + ) + summary_text = "retrieve an association with properties of a word" + + def _parse_word(self, word): + if isinstance(word, String): + return word.value.lower() + elif word.get_head_name() == "System`List": + if len(word.elements) == 3 and all( + isinstance(s, String) for s in word.elements + ): + return tuple(s.value for s in word.elements) + + def _standard_property( + self, py_word, py_form, py_property, wordnet, language_code, evaluation + ): + senses = self._senses(py_word, wordnet, language_code) + if not senses: + return Expression(SymbolMissing, StringNotAvailable) + elif py_form == "List": + word_property = WordProperty(self._short_syn_form, wordnet, language_code) + property_getter = getattr( + word_property, "%s" % self._underscore(py_property), None + ) + if property_getter: + return ListExpression( + *[property_getter(syn, desc) for syn, desc in senses] + ) + elif py_form in ("Rules", "ShortRules"): + syn_form = (lambda s: s) if py_form == "Rules" else (lambda s: s[0]) + word_property = WordProperty(syn_form, wordnet, language_code) + property_getter = getattr( + word_property, self._underscore(py_property), None + ) + if property_getter: + list_expr_elements = [ + to_expression(SymbolRule, desc, property_getter(syn, desc)) + for syn, desc in senses + ] + return ListExpression(*list_expr_elements) + evaluation.message(self.get_name(), "notprop", property) + + def _parts_of_speech(self, py_word, wordnet, language_code): + parts = set( + syn.pos() for syn, _ in self._senses(py_word, wordnet, language_code) + ) + if not parts: + return Expression(SymbolMissing, StringNotAvailable) + else: + return ListExpression( + *[String(s) for s in sorted([_wordnet_pos_to_type[p] for p in parts])] + ) + + def _property( + self, word, py_property, py_form, evaluation: Evaluation, options: dict + ): + if py_property == "PorterStem": + if isinstance(word, String): + return String(WordStem.porter(word.value)) + else: + return + + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if not wordnet: + return + + py_word = self._parse_word(word) + if not py_word: + return + + if py_property == "PartsOfSpeech": + return self._parts_of_speech(py_word, wordnet, language_code) + + try: + return self._standard_property( + py_word, py_form, py_property, wordnet, language_code, evaluation + ) + except MessageException as e: + e.message(evaluation) + + def eval(self, word, evaluation: Evaluation, options: dict) -> Optional[Expression]: + "WordData[word_, OptionsPattern[WordData]]" + if word.get_head() is SymbolStringExpression: + return Expression(SymbolDictionaryLookup, word) + elif isinstance(word, String) or word.get_head() is SymbolList: + pass + else: + return + + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if not wordnet: + return + + py_word = self._parse_word(word) + if not py_word: + return + + senses = self._senses(py_word, wordnet, language_code) + if senses is not None: + return ListExpression(*[[String(s) for s in desc] for syn, desc in senses]) + + def eval_property(self, word, property, 
evaluation: Evaluation, options: dict):
+        "WordData[word_, property_String, OptionsPattern[WordData]]"
+        # get_head is a method; without the call the comparison below is
+        # always False and StringExpression lookups never dispatch.
+        if word.get_head() is SymbolStringExpression:
+            if property.get_string_value() == "Lookup":
+                return Expression(SymbolDictionaryLookup, word)
+        elif isinstance(word, String) or word.get_head() is SymbolList:
+            return self._property(
+                word, property.get_string_value(), "ShortRules", evaluation, options
+            )
+
+    def eval_property_form(
+        self, word, property, form, evaluation: Evaluation, options: dict
+    ):
+        "WordData[word_, property_String, form_String, OptionsPattern[WordData]]"
+        if isinstance(word, String) or word.get_head() is SymbolList:
+            return self._property(
+                word,
+                property.value,
+                form.value,
+                evaluation,
+                options,
+            )
+
+
+class WordDefinition(_WordNetBuiltin):
+    """
+    :WMA:
+    https://reference.wolfram.com/language/ref/WordDefinition.html
+
+
'WordDefinition[$word$]' +
returns a definition of $word$, or Missing["NotAvailable"] if $word$ is not known.
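+
If $word$ has more than one sense in WordNet, one definition is returned for each sense.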
+ + >> WordDefinition["gram"] + = {a metric unit of weight equal to one thousandth of a kilogram} + """ + + summary_text = "retrieve the definition of a word" + + def eval(self, word, evaluation: Evaluation, options: dict): + "WordDefinition[word_String, OptionsPattern[WordDefinition]]" + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if wordnet: + senses = self._senses(word.value.lower(), wordnet, language_code) + if senses: + return ListExpression(*[String(syn.definition()) for syn, _ in senses]) + else: + return Expression(SymbolMissing, StringNotAvailable) + + +class WordList(_WordListBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordList.html + +
+
'WordList[]' +
returns a list of common words. + +
'WordList[$type$]' +
returns a list of common words of type $type$. +
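+
$type$ accepts the same names as 'RandomWord': "Noun", "Verb", "Adjective" or "Adverb".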
+ + >> N[Mean[StringLength /@ WordList["Adjective"]], 2] + = 9.3 + """ + + summary_text = "retrieve a list of common words" + + def eval(self, evaluation: Evaluation, options: dict): + "WordList[OptionsPattern[]]" + words = self._words(self._language_name(evaluation, options), "All", evaluation) + if words is not None: + return ListExpression(*(String(word) for word in words)) + + def eval_type(self, wordtype, evaluation: Evaluation, options: dict): + "WordList[wordtype_String, OptionsPattern[]]" + words = self._words( + self._language_name(evaluation, options), + wordtype.value, + evaluation, + ) + if words is not None: + return ListExpression(*(String(word) for word in words)) diff --git a/pymathics/natlang/main.py b/pymathics/natlang/main.py deleted file mode 100644 index 4d6dbe6..0000000 --- a/pymathics/natlang/main.py +++ /dev/null @@ -1,1506 +0,0 @@ -# -*- coding: utf-8 -*- -# FIXME: split this up into smaller pieces - -""" -Natural Language Functions - -""" - -import heapq -import itertools -import math - -# import os -import re -from itertools import chain -from typing import Optional, Union - -import enchant -import langid # see https://github.com/saffsd/langid.py -import pycountry -import spacy -from mathics.builtin.atomic.strings import anchor_pattern, to_regex -from mathics.builtin.base import Builtin, MessageException -from mathics.builtin.codetables import iso639_3 -from mathics.builtin.numbers.randomnumbers import RandomEnv -from mathics.core.atoms import Integer, Real, String -from mathics.core.convert.expression import ( - ListExpression, - to_expression, - to_mathics_list, -) -from mathics.core.evaluation import Evaluation -from mathics.core.expression import Expression -from mathics.core.symbols import ( - Symbol, - SymbolFalse, - SymbolList, - SymbolTrue, - strip_context, -) -from mathics.core.systemsymbols import ( - SymbolFailed, - SymbolMissing, - SymbolRule, - SymbolStringExpression, -) -from mathics.eval.nevaluator import eval_N -from pattern.en import pluralize - -SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup") - -StringNotAvailable = String("NotAvailable") - - -def _parse_nltk_lookup_error(e): - m = re.search(r"Resource '([^']+)' not found\.", str(e)) - if m: - return m.group(1) - else: - return "unknown" - - -def _make_forms(): - forms = { - "Word": lambda doc: (token for token in doc), - "Sentence": lambda doc: (sent for sent in doc.sents), - "Paragraph": lambda doc: _fragments(doc, re.compile(r"^[\n][\n]+$")), - "Line": lambda doc: _fragments(doc, re.compile(r"^[\n]$")), - "URL": lambda doc: (token for token in doc if token.orth_.like_url()), - "EmailAddress": lambda doc: ( - token for token in doc if token.orth_.like_email() - ), - } - - def filter_named_entity(label): - def generator(doc): - for ent in doc.ents: - if ent.label == label: - yield ent - - return generator - - def filter_pos(pos): - def generator(doc): - for token in doc: - if token.pos == pos: - yield token - - return generator - - for name, symbol in _symbols.items(): - forms[name] = filter_named_entity(symbol) - - for tag, names in _pos_tags.items(): - name, phrase_name = names - forms[name] = filter_pos(tag) - - return forms - - -# the following two may only be accessed after_WordNetBuiltin._load_wordnet has -# been called. 
- -_wordnet_pos_to_type = {} -_wordnet_type_to_pos = {} - -import nltk - - -def _init_nltk_maps(): - _wordnet_pos_to_type.update( - { - nltk.corpus.wordnet.VERB: "Verb", - nltk.corpus.wordnet.NOUN: "Noun", - nltk.corpus.wordnet.ADJ: "Adjective", - nltk.corpus.wordnet.ADJ_SAT: "Adjective", - nltk.corpus.wordnet.ADV: "Adverb", - } - ) - _wordnet_type_to_pos.update( - { - "Verb": [nltk.corpus.wordnet.VERB], - "Noun": [nltk.corpus.wordnet.NOUN], - "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], - "Adverb": [nltk.corpus.wordnet.ADV], - } - ) - - -from spacy.tokens import Span - -# Part of speech tags and their public interface names in Mathics -# see http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf -_pos_tags = { - spacy.parts_of_speech.ADJ: ("Adjective", ""), - spacy.parts_of_speech.ADP: ("Preposition", "Prepositional Phrase"), - spacy.parts_of_speech.ADV: ("Adverb", ""), - spacy.parts_of_speech.CONJ: ("Conjunct", ""), - spacy.parts_of_speech.DET: ("Determiner", ""), - spacy.parts_of_speech.INTJ: ("Interjection", ""), - spacy.parts_of_speech.NOUN: ("Noun", "Noun Phrase"), - spacy.parts_of_speech.NUM: ("Number", ""), - spacy.parts_of_speech.PART: ("Particle", ""), - spacy.parts_of_speech.PRON: ("Pronoun", ""), - spacy.parts_of_speech.PROPN: ("Proposition", ""), - spacy.parts_of_speech.PUNCT: ("Punctuation", ""), - spacy.parts_of_speech.SCONJ: ("Sconj", ""), - spacy.parts_of_speech.SYM: ("Symbol", ""), - spacy.parts_of_speech.VERB: ("Verb", "Verb Phrase"), - spacy.parts_of_speech.X: ("X", ""), - spacy.parts_of_speech.EOL: ("EOL", ""), - spacy.parts_of_speech.SPACE: ("Space", ""), -} - -# Mathics3 named entitiy names and their corresponding constants in spacy. -_symbols = { - "Person": spacy.symbols.PERSON, - "Company": spacy.symbols.ORG, - "Quantity": spacy.symbols.QUANTITY, - "Number": spacy.symbols.CARDINAL, - "CurrencyAmount": spacy.symbols.MONEY, - "Country": spacy.symbols.GPE, # also includes cities and states - "City": spacy.symbols.GPE, # also includes countries and states -} - -# forms are everything one can use in TextCases[] or TextPosition[]. 
-_forms = _make_forms() - - -def _merge_dictionaries(a, b): - c = a.copy() - c.update(b) - return c - - -def _position(t): - if isinstance(t, Span): - i = t.doc[t.start] - r = t.doc[t.end - 1] - return 1 + i.idx, r.idx + len(r.text) - else: - return 1 + t.idx, t.idx + len(t.text) - - -def _fragments(doc, sep): - start = 0 - for i, token in enumerate(doc): - if sep.match(token.text): - yield Span(doc, start, i) - start = i + 1 - end = len(doc) - if start < end: - yield Span(doc, start, end) - - -class _SpacyBuiltin(Builtin): - requires = ("spacy",) - - options = { - "Language": '"English"', - } - - messages = { - "runtime": "Spacy gave the following error: ``", - "lang": 'Language "`1`" is currently not supported with `2`[].', - } - - _language_codes = { - "English": "en", - "German": "de", - } - - _spacy_instances = {} - - def _load_spacy(self, evaluation: Evaluation, options: dict): - language_code = None - language_name = self.get_option(options, "Language", evaluation) - if language_name is None: - language_name = String("Undefined") - if isinstance(language_name, String): - language_code = _SpacyBuiltin._language_codes.get(language_name.value) - if not language_code: - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None - - instance = _SpacyBuiltin._spacy_instances.get(language_code) - if instance: - return instance - - try: - instance = spacy.load(f"{language_code}_core_web_md") - - # "via" parameter no longer exists. This was used in MATHICS3_SPACY_DATA - # if "MATHICS3_SPACY_DATA" in os.environ: - # instance = spacy.load( - # language_code, via=os.environ["MATHICS3_SPACY_DATA"] - # ) - # else: - # instance = spacy.load(f"{language_code}_core_web_md") - - _SpacyBuiltin._spacy_instances[language_code] = instance - return instance - except RuntimeError as e: - evaluation.message(self.get_name(), "runtime", str(e)) - return None - - def _nlp(self, text, evaluation, options) -> Optional[spacy.tokens.doc.Doc]: - nlp = self._load_spacy(evaluation, options) - if not nlp: - return None - return nlp(text) - - def _is_stop_lambda(self, evaluation: Evaluation, options: dict): - nlp = self._load_spacy(evaluation, options) - if not nlp: - return None - - vocab = nlp.vocab - - def is_stop(word): - return vocab[word].is_stop - - return is_stop - - -class WordCount(_SpacyBuiltin): - """ -
-
'WordCount[$string$]' -
returns the number of words in $string$. -
- - >> WordCount["A long time ago"] - = 4 - """ - - def eval(self, text, evaluation: Evaluation, options: dict): - "WordCount[text_String, OptionsPattern[WordCount]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - punctuation = spacy.parts_of_speech.PUNCT - return Integer(sum(1 for word in doc if word.pos != punctuation)) - - -class TextWords(_SpacyBuiltin): - """ -
-
'TextWords[$string$]' -
returns the words in $string$. - -
'TextWords[$string$, $n$]' -
returns the first $n$ words in $string$ -
- - >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."] - = {Hickory, dickory, dock, The, mouse, ran, up, the, clock} - """ - - def eval( - self, text: String, evaluation: Evaluation, options: dict - ) -> Optional[ListExpression]: - "TextWords[text_String, OptionsPattern[WordCount]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - punctuation = spacy.parts_of_speech.PUNCT - return ListExpression( - *[String(word.text) for word in doc if word.pos != punctuation], - ) - - def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): - "TextWords[text_String, n_Integer, OptionsPattern[TextWords]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - punctuation = spacy.parts_of_speech.PUNCT - return ListExpression( - *itertools.islice( - (String(word.text) for word in doc if word.pos != punctuation), - n.value, - ), - ) - - -class TextSentences(_SpacyBuiltin): - """ -
-
'TextSentences[$string$]' -
returns the sentences in $string$. - -
'TextSentences[$string$, $n$]' -
returns the first $n$ sentences in $string$ -
- - >> TextSentences["Night and day. Day and night."] - = {Night and day., Day and night.} - - >> TextSentences["Night and day. Day and night.", 1] - = {Night and day.} - - >> TextSentences["Mr. Jones met Mrs. Jones."] - = {Mr. Jones met Mrs. Jones.} - """ - - def eval(self, text: String, evaluation: Evaluation, options: dict): - "TextSentences[text_String, OptionsPattern[TextSentences]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return ListExpression(*[String(sent.text) for sent in doc.sents]) - - def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): - "TextSentences[text_String, n_Integer, OptionsPattern[TextSentences]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return ListExpression( - *itertools.islice((String(sent.text) for sent in doc.sents), n.value), - ) - - -class DeleteStopwords(_SpacyBuiltin): - """ -
-
'DeleteStopwords[$list$]' -
returns the words in $list$ without stopwords. - -
'DeleteStopwords[$string$]' -
returns $string$ without stopwords. -
- - ## This has changed since old versions of natlang, and I am - ## not sure the old behavior was correct. - ## >> DeleteStopwords[{"Somewhere", "over", "the", "rainbow"}] - ## = {rainbow} - - >> DeleteStopwords["There was an Old Man of Apulia, whose conduct was very peculiar"] - = Old Man Apulia, conduct peculiar - """ - - def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression: - "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]" - is_stop = self._is_stop_lambda(evaluation, options) - - def filter_words(words): - for w in words: - s = w.get_string_value() - if s is not None: - yield String(s) - elif is_stop is not None and is_stop(s) is not None: - yield String(s) - - return ListExpression(*list(filter_words(li.elements))) - - def eval_string(self, s: String, evaluation: Evaluation, options: dict): - "DeleteStopwords[s_String, OptionsPattern[DeleteStopwords]]" - doc = self._nlp(s.value, evaluation, options) - if doc: - is_stop = self._is_stop_lambda(evaluation, options) - if is_stop: - - def tokens(): - for token in doc: - if not is_stop(token.text): - yield token.text_with_ws - else: - yield token.whitespace_.strip() - - return String("".join(tokens())) - - -class WordFrequency(_SpacyBuiltin): - """ -
-
'WordFrequency[$text$, $word$]' -
returns the relative frequency of $word$ in $text$. -
- - $word$ may also specify multiple words using $a$ | $b$ | ... - - ## Problem with import for certain characters in the text. - ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; - >> text = "I have a dairy cow, it's not just any cow. \ - She gives me milkshake, oh what a salty cow. She is the best\ - cow in the county."; - - >> WordFrequency[text, "a" | "the"] - = 0.114286 - - >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True] - = 0.5 - """ - - options = _SpacyBuiltin.options - options.update({"IgnoreCase": "False"}) - - def eval( - self, text: String, word, evaluation: Evaluation, options: dict - ) -> Optional[Expression]: - "WordFrequency[text_String, word_, OptionsPattern[WordFrequency]]" - doc = self._nlp(text.value, evaluation, options) - if not doc: - return - if isinstance(word, String): - words = set([word.value]) - elif word.get_head_name() == "System`Alternatives": - if not all(isinstance(a, String) for a in word.elements): - return # error - words = set(a.value for a in word.elements) - else: - return # error - - ignore_case = self.get_option(options, "IgnoreCase", evaluation) is SymbolTrue - if ignore_case: - words = [w.lower() for w in words] - n = 0 - for token in doc: - token_text = token.text - if ignore_case: - token_text = token_text.lower() - if token_text in words: - n += 1 - return eval_N(Integer(n) / Integer(len(doc)), evaluation) - - -class Containing(Builtin): - pass - - -def _cases(doc, form): - if isinstance(form, String): - generators = [_forms.get(form.value)] - elif form.get_head_name() == "System`Alternatives": - if not all(isinstance(f, String) for f in form.elements): - return # error - generators = [_forms.get(f.value) for f in form.elements] - elif form.get_head_name() == "PyMathics`Containing": - if len(form.elements) == 2: - for t in _containing(doc, *form.elements): - yield t - return - else: - return # error - else: - return # error - - def try_next(iterator): - try: - return next(iterator) - except StopIteration: - return None - - feeds = [] - for i, iterator in enumerate([iter(generator(doc)) for generator in generators]): - t = try_next(iterator) - if t: - feeds.append((_position(t), i, t, iterator)) - heapq.heapify(feeds) - while feeds: - pos, i, token, iterator = heapq.heappop(feeds) - yield token - t = try_next(iterator) - if t: - heapq.heappush(feeds, (_position(t), i, t, iterator)) - - -def _containing(doc, outer, inner): - if not isinstance(outer, String): - return # error - outer_generator = _forms.get(outer.value) - inner_iter = _cases(doc, inner) - inner_start = None - produce_t = False - try: - for t in outer_generator(doc): - start, end = _position(t) - if inner_start is not None and inner_start < end: - produce_t = True - if produce_t: - yield t - produce_t = False - while True: - inner_start, inner_end = _position(next(inner_iter)) - if inner_end > start: - break - if inner_start < end: - produce_t = True - except StopIteration: - pass - - -class TextCases(_SpacyBuiltin): - """ -
-
'TextCases[$text$, $form$]' -
returns all elements of type $form$ in $text$ in order of their appearance. -
- - >> TextCases["I was in London last year.", "Pronoun"] - = {I} - - >> TextCases["I was in London last year.", "City"] - = {London} - - ## >> TextCases[Import["ExampleData/EinsteinSzilLetter.txt"], "Person", 3][[2;;3]] - ## = {L. Szilard, Joliot} - - >> TextCases["Anne, Peter and Mr Johnes say hello.", "Person", 3][[2;;3]] - = {Peter, Johnes} - - """ - - def eval_string_form( - self, text: String, form, evaluation: Evaluation, options: dict - ): - "TextCases[text_String, form_, OptionsPattern[TextCases]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list(*[t.text for t in _cases(doc, form)]) - - def eval_string_form_n( - self, text: String, form, n: Integer, evaluation: Evaluation, options: dict - ): - "TextCases[text_String, form_, n_Integer, OptionsPattern[TextCases]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list( - *itertools.islice((t.text for t in _cases(doc, form)), n.value) - ) - - -class TextPosition(_SpacyBuiltin): - """ -
-
'TextPosition[$text$, $form$]' -
returns the positions of elements of type $form$ in $text$ in order of their appearance. -
- - >> TextPosition["Liverpool and London are two English cities.", "City"] - = {{1, 9}, {15, 20}} - """ - - def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): - "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list(*[_position(t) for t in _cases(doc, form)]) - - def eval_text_form_n( - self, text: String, form, n: Integer, evaluation: Evaluation, options: dict - ): - "TextPosition[text_String, form_, n_Integer, OptionsPattern[TextPosition]]" - doc = self._nlp(text.value, evaluation, options) - if doc: - return to_mathics_list( - *itertools.islice((_position(t) for t in _cases(doc, form)), n.value) - ) - - -class TextStructure(_SpacyBuiltin): - """ -
-
'TextStructure[$text$, $form$]' -
returns the grammatical structure of $text$ as $form$. -
- - >> TextStructure["The cat sat on the mat.", "ConstituentString"] - = {(Sentence, ((Verb Phrase, (Noun Phrase, (Determiner, The), (Noun, cat)), (Verb, sat), (Prepositional Phrase, (Preposition, on), (Noun Phrase, (Determiner, the), (Noun, mat))), (Punctuation, .))))} - """ - - _root_pos = set(i for i, names in _pos_tags.items() if names[1]) - - def _to_constituent_string(self, node): - token, children = node - name, phrase_name = _pos_tags.get(token.pos, ("Unknown", "Unknown Phrase")) - if not children: - return "(%s, %s)" % (name, token.text) - else: - sub = ", ".join( - self._to_constituent_string(next_node) for next_node in children - ) - return "(%s, %s)" % (phrase_name, sub) - - def _to_tree(self, tokens, path=[]): - roots = [] - i = 0 - while i < len(tokens): - token = tokens[i] - - if token in path: - roots.append((token, None)) - i += 1 - else: - root = token - while root.head != root and root.head not in path: - root = root.head - - sub = list(root.subtree) - - if root.pos not in self._root_pos: - roots.extend(self._to_tree(sub, path + [root])) - else: - roots.append((root, self._to_tree(sub, path + [root]))) - - i += len(sub) - - return roots - - def eval(self, text, evaluation: Evaluation, options: dict): - 'TextStructure[text_String, "ConstituentString", OptionsPattern[TextStructure]]' - doc = self._nlp(text.value, evaluation, options) - if doc: - tree = self._to_tree(list(doc)) - sents = ["(Sentence, (%s))" % self._to_constituent_string(x) for x in tree] - return to_mathics_list(*sents, elements_conversion_fn=String) - - -class WordSimilarity(_SpacyBuiltin): - """ -
-
'WordSimilarity[$text1$, $text2$]' -
returns a real-valued measure of semantic similarity of two texts or words. - -
'WordSimilarity[{$text1$, $i1$}, {$text2$, $j1$}]' -
returns a measure of similarity of two words within two texts. - -
'WordSimilarity[{$text1$, {$i1$, $i2$, ...}}, {$text2$, {$j1$, $j2$, ...}}]' -
returns a measure of similarity of multiple words within two texts. -
- - >> NumberForm[WordSimilarity["car", "train"], 3] - = 0.439 - - >> NumberForm[WordSimilarity["car", "hedgehog"], 3] - = 0.195 - - >> NumberForm[WordSimilarity[{"An ocean full of water.", {2, 2}}, { "A desert full of sand.", {2, 5}}], 3] - = {0.505, 0.481} - """ - - messages = _merge_dictionaries( - _SpacyBuiltin.messages, - { - "txtidx": "Index `1` in position `2` must be between 1 and `3`.", - "idxfmt": "Indices must be integers or lists of integers of the same length.", - }, - ) - - def eval( - self, text1: String, text2: String, evaluation: Evaluation, options: dict - ) -> Optional[Real]: - "WordSimilarity[text1_String, text2_String, OptionsPattern[WordSimilarity]]" - doc1 = self._nlp(text1.value, evaluation, options) - if doc1: - doc2 = self._nlp(text2.value, evaluation, options) - if doc2: - return Real(doc1.similarity(doc2)) - - def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict): - "WordSimilarity[{text1_String, i1_}, {text2_String, i2_}, OptionsPattern[WordSimilarity]]" - doc1 = self._nlp(text1.value, evaluation, options) - if doc1: - if text2.value == text1.value: - doc2 = doc1 - else: - doc2 = self._nlp(text2.value, evaluation, options) - if doc2: - if i1.get_head() is SymbolList and i2.get_head() is SymbolList: - if len(i1.elements) != len(i2.elements): - evaluation.message("TextSimilarity", "idxfmt") - return - if any( - not all(isinstance(i, Integer) for i in li.elements) - for li in (i1, i2) - ): - evaluation.message("TextSimilarity", "idxfmt") - return - indices1 = [i.value for i in i1.elements] - indices2 = [i.value for i in i2.elements] - multiple = True - elif isinstance(i1, Integer) and isinstance(i2, Integer): - indices1 = [i1.value] - indices2 = [i2.value] - multiple = False - else: - evaluation.message("TextSimilarity", "idxfmt") - return - - for index1, index2 in zip(indices1, indices2): - for i, pos, doc in zip((index1, index2), (1, 2), (doc1, doc2)): - if i < 1 or i > len(doc): - evaluation.message( - "TextSimilarity", "txtidx", i, pos, len(doc) - ) - return - - result = [ - Real(doc1[j1 - 1].similarity(doc2[j2 - 1])) - for j1, j2 in zip(indices1, indices2) - ] - - if multiple: - return ListExpression(*result) - else: - return result[0] - - -class WordStem(Builtin): - """ -
-
'WordStem[$word$]' -
returns a stemmed form of $word$, thereby reducing an inflected form to its root. - -
'WordStem[{$word1$, $word2$, ...}]' -
returns a stemmed form for list of $word$, thereby reducing an inflected form to its root. -
- - >> WordStem["towers"] - = tower - - >> WordStem[{"heroes", "roses", "knights", "queens"}] - = {hero, rose, knight, queen} - """ - - requires = ("nltk",) - - _stemmer = None - - @staticmethod - def _get_porter_stemmer(): - if WordStem._stemmer is None: - WordStem._stemmer = nltk.stem.porter.PorterStemmer() - return WordStem._stemmer - - @staticmethod - def porter(w): - return WordStem._get_porter_stemmer().stem(w) - - def eval(self, word: String, evaluation: Evaluation) -> String: - "WordStem[word_String]" - stemmer = self._get_porter_stemmer() - return String(stemmer.stem(word.value)) - - def eval_list(self, words, evaluation: Evaluation) -> Optional[ListExpression]: - "WordStem[words_List]" - if all(isinstance(w, String) for w in words.elements): - stemmer = self._get_porter_stemmer() - return ListExpression( - *[String(stemmer.stem(w.value)) for w in words.elements] - ) - - -class _WordNetBuiltin(Builtin): - requires = ("nltk",) - - options = { - "Language": '"English"', - } - - messages = { - "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", - "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', - # 'load': 'Loading `1` word data. Please wait.', - "wordnet": "WordNet returned the following error: ``", - } - - _wordnet_instances = {} - - def _language_name(self, evaluation: Evaluation, options: dict): - return self.get_option(options, "Language", evaluation) - - def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): - try: - wordnet_resource = nltk.data.find("corpora/wordnet2022") - _init_nltk_maps() - except LookupError: - evaluation.message(self.get_name(), "package", "wordnet2022") - return None - - try: - omw = nltk.corpus.util.LazyCorpusLoader( - "omw", - nltk.corpus.reader.CorpusReader, - r".*/wn-data-.*\.tab", - encoding="utf8", - ) - except LookupError: - evaluation.message(self.get_name(), "package", "omw") - return None - - wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) - - if language_code not in wordnet.langs(): - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None - - return wordnet - - def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: - language_code = None - if isinstance(language_name, String): - language_code = iso639_3.get(language_name.value) - if not language_code: - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None, None - - wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) - if not wordnet: - try: - wordnet = self._init_wordnet(evaluation, language_name, language_code) - except LookupError as e: - evaluation.message( - self.get_name(), "package", _parse_nltk_lookup_error(e) - ) - return None, None - - _WordNetBuiltin._wordnet_instances[language_code] = wordnet - - return wordnet, language_code - - @staticmethod - def _decode_synset(syn): - what, pos, nr = (syn.name().split(".") + ["01"])[:3] - return what.replace("_", " "), pos, nr - - @staticmethod - def _capitalize(s) -> str: - return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) - - @staticmethod - def _underscore(s) -> str: - return re.sub( - r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s - ).lower() - - @staticmethod - def _list_syn_form(syn): - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - - def containers(): - for name in syn.lemma_names(): - if name != what: 
- yield name - - for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): - container, _, _ = _WordNetBuiltin._decode_synset(s) - yield container - - for lemma in WordProperty._synonymous_lemmas(syn): - yield lemma.name() - - return what, _wordnet_pos_to_type[pos], containers - - @staticmethod - def syn(syn, wordnet, language_code) -> tuple: - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): - if s == syn: - return form - return what, pos, "Unknown" - - @staticmethod - def _iterate_senses(word, wordnet, language_code): - if not word: - return - - used = set() - output_word = word.replace("_", " ") - - for syn in wordnet.synsets(word, None, language_code): - if syn.lexname() in ("noun.location", "noun.person"): - continue # ignore - - what, pos, containers = _WordNetBuiltin._list_syn_form(syn) - - for container in containers(): - container = container.replace("_", " ") - if container != word: - if container not in used: - used.add(container) - yield syn, ( - output_word, - pos, - _WordNetBuiltin._capitalize(container), - ) - break - - def _senses(self, word, wordnet, language_code): - if isinstance(word, tuple): # find forms like ["tree", "Noun", "WoodyPlant"] - for syn, form in _WordNetBuiltin._iterate_senses( - word[0], wordnet, language_code - ): - if form == word: - return [[syn, form]] - else: # find word given as strings, e.g. "tree" - word = wordnet.morphy(word) # base form, e.g. trees -> tree - return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) - - -class WordDefinition(_WordNetBuiltin): - """ -
-
'WordDefinition[$word$]' -
returns a definition of $word$ or Missing["Available"] if $word$ is not known. -
- - >> WordDefinition["gram"] - = {a metric unit of weight equal to one thousandth of a kilogram} - """ - - def eval(self, word, evaluation: Evaluation, options: dict): - "WordDefinition[word_String, OptionsPattern[WordDefinition]]" - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if wordnet: - senses = self._senses(word.value.lower(), wordnet, language_code) - if senses: - return ListExpression(*[String(syn.definition()) for syn, _ in senses]) - else: - return Expression(SymbolMissing, StringNotAvailable) - - -class WordProperty: - def __init__(self, syn_form, wordnet, language_code): - self.syn_form = syn_form - self.wordnet = wordnet - self.language_code = language_code - - def syn(self, syn): - return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) - - @staticmethod - def _synonymous_lemmas(syn): - first_lemma = syn.name().split(".")[0] - return (s for s in syn.lemmas() if s.name() != first_lemma) - - @staticmethod - def _antonymous_lemmas(syn): - return (s for lemma in syn.lemmas() for s in lemma.antonyms()) - - def definitions(self, syn, desc): - return syn.definition() - - def examples(self, syn, desc): - return syn.examples() - - def synonyms(self, syn, desc): - _, pos, container = desc - return [ - self.syn_form((s.name().replace("_", " "), pos, container)) - for s in WordProperty._synonymous_lemmas(syn) - ] - - def antonyms(self, syn, desc): - return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] - - def broader_terms(self, syn, desc): - return [self.syn(s) for s in syn.hypernyms()] - - def narrower_terms(self, syn, desc): - return [self.syn(s) for s in syn.hyponyms()] - - def usage_field(self, syn, desc): - return syn.usage_domains() - - def whole_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_holonyms()] - - def part_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_meronyms()] - - def material_terms(self, syn, desc): - return [self.syn(s) for s in syn.substance_meronyms()] - - def word_net_id(self, syn, desc): - return syn.offset() - - def entailed_terms(self, syn, desc): # e.g. fall to condense - return [self.syn(s) for s in syn.entailments()] - - def causes_terms(self, syn, desc): # e.g. 
ignite to burn - return [self.syn(s) for s in syn.causes()] - - def inflected_forms(self, syn, desc): - try: - word, pos, _ = desc - if pos == "Verb": - from pattern.en import lexeme - - return [w for w in reversed(lexeme(word)) if w != word] - elif pos == "Noun": - from pattern.en import pluralize - - return [pluralize(word)] - elif pos == "Adjective": - from pattern.en import comparative, superlative - - return [comparative(word), superlative(word)] - else: - return [] - except ImportError: - raise MessageException( - "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" - ) - - -class _WordListBuiltin(_WordNetBuiltin): - _dictionary = {} - - def _words(self, language_name, ilk, evaluation): - wordnet, language_code = self._load_wordnet(evaluation, language_name) - - if not wordnet: - return - - key = "%s.%s" % (language_code, ilk) - words = self._dictionary.get(key) - if not words: - try: - if ilk == "All": - filtered_pos = [None] - else: - try: - filtered_pos = _wordnet_type_to_pos[ilk] - except KeyError: - evaluation.message( - self.get_name(), - "wordnet", - "type: %s is should be in %s" - % (ilk._wordnet_type_to_pos.keys()), - ) - return - - words = [] - for pos in filtered_pos: - words.extend(list(wordnet.all_lemma_names(pos, language_code))) - words.sort() - self._dictionary[key] = words - except nltk.corpus.reader.wordnet.WordNetError as err: - evaluation.message(self.get_name(), "wordnet", str(err)) - return - - return words - - -class WordData(_WordListBuiltin): - """ -
-
'WordData[$word$]' -
returns a list of possible senses of a word. - -
'WordData[$word$, $property$]' -
returns detailed information about a word regarding $property$, e.g. "Definitions" or "Examples". -
- - The following are valid properties: - - - >> WordData["riverside", "Definitions"] - = {{riverside, Noun, Bank} -> the bank of a river} - - >> WordData[{"fish", "Verb", "Angle"}, "Examples"] - = {{fish, Verb, Angle} -> {fish for compliments}} - """ - - messages = _merge_dictionaries( - _WordNetBuiltin.messages, - { - "notprop": "WordData[] does not recognize `1` as a valid property.", - }, - ) - - def _parse_word(self, word): - if isinstance(word, String): - return word.value.lower() - elif word.get_head_name() == "System`List": - if len(word.elements) == 3 and all( - isinstance(s, String) for s in word.elements - ): - return tuple(s.value for s in word.elements) - - def _standard_property( - self, py_word, py_form, py_property, wordnet, language_code, evaluation - ): - senses = self._senses(py_word, wordnet, language_code) - if not senses: - return Expression(SymbolMissing, StringNotAvailable) - elif py_form == "List": - word_property = WordProperty(self._short_syn_form, wordnet, language_code) - property_getter = getattr( - word_property, "%s" % self._underscore(py_property), None - ) - if property_getter: - return to_mathics_list( - *[property_getter(syn, desc) for syn, desc in senses] - ) - elif py_form in ("Rules", "ShortRules"): - syn_form = (lambda s: s) if py_form == "Rules" else (lambda s: s[0]) - word_property = WordProperty(syn_form, wordnet, language_code) - property_getter = getattr( - word_property, self._underscore(py_property), None - ) - if property_getter: - list_expr_elements = [ - to_expression(SymbolRule, desc, property_getter(syn, desc)) - for syn, desc in senses - ] - return to_mathics_list(*list_expr_elements) - evaluation.message(self.get_name(), "notprop", property) - - def _parts_of_speech(self, py_word, wordnet, language_code): - parts = set( - syn.pos() for syn, _ in self._senses(py_word, wordnet, language_code) - ) - if not parts: - return Expression(SymbolMissing, StringNotAvailable) - else: - return ListExpression( - *[String(s) for s in sorted([_wordnet_pos_to_type[p] for p in parts])] - ) - - def _property( - self, word, py_property, py_form, evaluation: Evaluation, options: dict - ): - if py_property == "PorterStem": - if isinstance(word, String): - return String(WordStem.porter(word.value)) - else: - return - - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if not wordnet: - return - - py_word = self._parse_word(word) - if not py_word: - return - - if py_property == "PartsOfSpeech": - return self._parts_of_speech(py_word, wordnet, language_code) - - try: - return self._standard_property( - py_word, py_form, py_property, wordnet, language_code, evaluation - ) - except MessageException as e: - e.message(evaluation) - - def eval(self, word, evaluation: Evaluation, options: dict) -> Optional[Expression]: - "WordData[word_, OptionsPattern[WordData]]" - if word.get_head() is SymbolStringExpression: - return Expression(SymbolDictionaryLookup, word) - elif isinstance(word, String) or word.get_head() is SymbolList: - pass - else: - return - - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if not wordnet: - return - - py_word = self._parse_word(word) - if not py_word: - return - - senses = self._senses(py_word, wordnet, language_code) - if senses is not None: - return ListExpression(*[[String(s) for s in desc] for syn, desc in senses]) - - def eval_property(self, word, property, evaluation: Evaluation, options: dict): - "WordData[word_, 
property_String, OptionsPattern[WordData]]" - if word.get_head is SymbolStringExpression: - if property.get_string_value() == "Lookup": - return Expression(SymbolDictionaryLookup, word) - elif isinstance(word, String) or word.get_head() is SymbolList: - return self._property( - word, property.get_string_value(), "ShortRules", evaluation, options - ) - - def eval_property_form( - self, word, property, form, evaluation: Evaluation, options: dict - ): - "WordData[word_, property_String, form_String, OptionsPattern[WordData]]" - if isinstance(word, String) or word.get_head() is SymbolList: - return self._property( - word, - property.value, - form.value, - evaluation, - options, - ) - - -class DictionaryWordQ(_WordNetBuiltin): - """ -
-
'DictionaryWordQ[$word$]' -
returns True if $word$ is a word usually found in dictionaries, and False otherwise. -
- - >> DictionaryWordQ["couch"] - = True - - >> DictionaryWordQ["meep-meep"] - = False - """ - - def eval(self, word, evaluation: Evaluation, options: dict): - "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" - if not isinstance(word, String): - return False - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if wordnet: - if list(wordnet.synsets(word.value.lower(), None, language_code)): - return SymbolTrue - else: - return SymbolFalse - - -class DictionaryLookup(_WordListBuiltin): - """ -
-
'DictionaryLookup[$word$]' -
lookup words that match the given $word$ or pattern. - -
'DictionaryLookup[$word$, $n$]' -
lookup first $n$ words that match the given $word$ or pattern. -
- - >> DictionaryLookup["bake" ~~ ___, 3] - = {bake, bakeapple, baked} - """ - - def compile(self, pattern, evaluation): - re_patt = to_regex(pattern, evaluation) - if re_patt is None: - evaluation.message( - "StringExpression", - "invld", - pattern, - Expression(SymbolStringExpression, pattern), - ) - return - re_patt = anchor_pattern(re_patt) - - return re.compile(re_patt, flags=re.IGNORECASE) - - def search(self, dictionary_words, pattern): - for dictionary_word in dictionary_words: - if pattern.match(dictionary_word): - yield dictionary_word.replace("_", " ") - - def lookup(self, language_name, word, n, evaluation): - pattern = self.compile(word, evaluation) - if pattern: - dictionary_words = self._words(language_name, "All", evaluation) - if dictionary_words is not None: - matches = self.search(dictionary_words, pattern) - if n is not None: - matches = itertools.islice(matches, 0, n) - return ListExpression(*(String(match) for match in sorted(matches))) - - def eval_english(self, word, evaluation): - "DictionaryLookup[word_]" - return self.lookup(String("English"), word, None, evaluation) - - def eval_language(self, language, word, evaluation): - "DictionaryLookup[{language_String, word_}]" - return self.lookup(language, word, None, evaluation) - - def eval_english_n(self, word, n, evaluation): - "DictionaryLookup[word_, n_Integer]" - return self.lookup(String("English"), word, n.value, evaluation) - - def eval_language_n(self, language, word, n, evaluation): - "DictionaryLookup[{language_String, word_}, n_Integer]" - return self.lookup(language, word, n.value, evaluation) - - -class WordList(_WordListBuiltin): - """ -
-
'WordList[]' -
returns a list of common words. - -
'WordList[$type$]' -
returns a list of common words of type $type$. -
- - >> N[Mean[StringLength /@ WordList["Adjective"]], 2] - = 9.3 - """ - - def eval(self, evaluation: Evaluation, options: dict): - "WordList[OptionsPattern[WordList]]" - words = self._words(self._language_name(evaluation, options), "All", evaluation) - if words is not None: - return to_mathics_list(*words, elements_conversion_fn=String) - - def eval_type(self, wordtype, evaluation: Evaluation, options: dict): - "WordList[wordtype_String, OptionsPattern[WordList]]" - words = self._words( - self._language_name(evaluation, options), - wordtype.value, - evaluation, - ) - if words is not None: - return to_mathics_list(*words, elements_conversion_fn=String) - - -class RandomWord(_WordListBuiltin): - """ -
-
'RandomWord[]' -
returns a random word. - -
'RandomWord[$type$]' -
returns a random word of the given $type$, e.g. of type "Noun" or "Adverb". - -
'RandomWord[$type$, $n$]' -
returns $n$ random words of the given $type$. -
- """ - - def _random_words(self, type, n, evaluation: Evaluation, options: dict): - words = self._words(self._language_name(evaluation, options), type, evaluation) - if words is not None: - with RandomEnv(evaluation) as rand: - return [ - String(words[rand.randint(0, len(words) - 1)].replace("_", " ")) - for _ in range(n) - ] - - def eval(self, evaluation: Evaluation, options: dict): - "RandomWord[OptionsPattern[RandomWord]]" - words = self._random_words("All", 1, evaluation, options) - if words: - return words[0] - - def eval_type(self, type, evaluation: Evaluation, options: dict): - "RandomWord[type_String, OptionsPattern[RandomWord]]" - words = self._random_words(type.value, 1, evaluation, options) - if words: - return words[0] - - def eval_type_n(self, type, n, evaluation: Evaluation, options: dict): - "RandomWord[type_String, n_Integer, OptionsPattern[RandomWord]]" - words = self._random_words(type.value, n.value, evaluation, options) - if words: - return ListExpression(*words) - - -class LanguageIdentify(Builtin): - """ -
-
'LanguageIdentify[$text$]' -
returns the name of the language used in $text$. -
- - >> LanguageIdentify["eins zwei drei"] - = German - """ - - def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: - "LanguageIdentify[text_String]" - - # an alternative: https://github.com/Mimino666/langdetect - - code, _ = langid.classify(text.value) - language = pycountry.languages.get(alpha_2=code) - if language is None: - return SymbolFailed - return String(language.name) - - -class Pluralize(Builtin): - """ -
-
'Pluralize[$word$]' -
returns the plural form of $word$. -
- - >> Pluralize["potato"] - = potatoes - """ - - requires = ("pattern",) - - def eval(self, word, evaluation): - "Pluralize[word_String]" - - return String(pluralize(word.value)) - - -class SpellingCorrectionList(Builtin): - """ -
-
'SpellingCorrectionList[$word$]' -
returns a list of suggestions for spelling corrected versions of $word$. -
- - Results may differ depending on which dictionaries can be found by enchant. - - >> SpellingCorrectionList["hipopotamus"] - = {hippopotamus...} - """ - - options = { - "Language": '"English"', - } - - messages = { - "lang": "SpellingCorrectionList does not support `1` as a language.", - } - - _languages = { - "English": "en_US", # en_GB, en_AU - "German": "de_DE", - "French": "fr_FR", - } - - _dictionaries = {} - - def eval( - self, word: String, evaluation: Evaluation, options: dict - ) -> Optional[ListExpression]: - "SpellingCorrectionList[word_String, OptionsPattern[SpellingCorrectionList]]" - - language_name = self.get_option(options, "Language", evaluation) - if not isinstance(language_name, String): - return - language_code = SpellingCorrectionList._languages.get(language_name.value, None) - if not language_code: - evaluation.message("SpellingCorrectionList", "lang", language_name) - return - - d = SpellingCorrectionList._dictionaries.get(language_code, None) - if not d: - d = enchant.Dict(language_code) - SpellingCorrectionList._dictionaries[language_code] = d - - py_word = word.value - - if d.check(py_word): - return ListExpression(word) - else: - return to_mathics_list(*d.suggest(py_word), elements_conversion_fn=String) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py new file mode 100644 index 0000000..17d7e31 --- /dev/null +++ b/pymathics/natlang/normalization.py @@ -0,0 +1,301 @@ +""" + +Text normalization + +""" +import itertools +from itertools import islice +from typing import Optional + +import spacy +from mathics.core.atoms import Integer, String +from mathics.core.convert.python import from_python +from mathics.core.evaluation import Evaluation +from mathics.core.list import ListExpression + +from pymathics.natlang.spacy import _cases, _pos_tags, _position, _SpacyBuiltin + + +class DeleteStopwords(_SpacyBuiltin): + """ + Delete :stop words:https://en.wikipedia.org/wiki/Stop_word(\ + :WMA: + https://reference.wolfram.com/language/ref/DeleteStopwords.html\ + ) + +
+
'DeleteStopwords[$list$]' +
returns the words in $list$ without stopwords. + +
'DeleteStopwords[$string$]' +
returns $string$ without stopwords. +
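+
The stopword list comes from the loaded spaCy language model, so results depend on the model and on the 'Language' option.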
+ + ## This has changed since old versions of natlang, and I am + ## not sure the old behavior was correct. + >> DeleteStopwords[{"Somewhere", "over", "the", "rainbow"}] + = ... + ## = {rainbow} + + >> DeleteStopwords["There was an Old Man of Apulia, whose conduct was very peculiar"] + = Old Man Apulia, conduct peculiar + """ + + summary_text = "Remove stopwords from a text" + + def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression: + "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]" + is_stop = self._is_stop_lambda(evaluation, options) + + def filter_words(words): + for w in words: + s = w.get_string_value() + if s is not None: + yield String(s) + elif is_stop is not None and is_stop(s) is not None: + yield String(s) + + return ListExpression(*list(filter_words(li.elements))) + + def eval_string(self, s: String, evaluation: Evaluation, options: dict): + "DeleteStopwords[s_String, OptionsPattern[DeleteStopwords]]" + doc = self._nlp(s.value, evaluation, options) + if doc: + is_stop = self._is_stop_lambda(evaluation, options) + if is_stop: + + def tokens(): + for token in doc: + if not is_stop(token.text): + yield token.text_with_ws + else: + yield token.whitespace_.strip() + + return String("".join(tokens())) + + +class TextCases(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextCases.html + +
+
'TextCases[$text$, $form$]' +
returns all elements of type $form$ in $text$ in order of their appearance. +
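+
'TextCases[$text$, $form$, $n$]'
returns the first $n$ elements of type $form$ in $text$.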
+ + >> TextCases["I was in London last year.", "Pronoun"] + = {I} + + >> TextCases["I was in London last year.", "City"] + = {London} + + ## >> TextCases[Import["ExampleData/EinsteinSzilLetter.txt"], "Person", 3][[2;;3]] + ## = {L. Szilard, Joliot} + + >> TextCases["Anne, Peter and Mr Johnes say hello.", "Person", 3][[2;;3]] + = {Peter, Johnes} + + """ + + summary_text = "List the cases of words of a certain form in a text" + + def eval_string_form( + self, text: String, form, evaluation: Evaluation, options: dict + ): + "TextCases[text_String, form_, OptionsPattern[TextCases]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression(*[String(t.text) for t in _cases(doc, form)]) + + def eval_string_form_n( + self, text: String, form, n: Integer, evaluation: Evaluation, options: dict + ): + "TextCases[text_String, form_, n_Integer, OptionsPattern[TextCases]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + items = islice((t.text for t in _cases(doc, form)), n.value) + return ListExpression(*(from_python(item) for item in items)) + + +class TextPosition(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextPosition.html + +
+    <dl>
+      <dt>'TextPosition[$text$, $form$]'
+      <dd>returns the positions of elements of type $form$ in $text$ in order of their appearance.
+    </dl>
+ + >> TextPosition["Liverpool and London are two English cities.", "City"] + = {{1, 9}, {15, 20}} + """ + + summary_text = "List the position of words of a given form in a text" + + def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): + "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression( + *[from_python(_position(t)) for t in _cases(doc, form)] + ) + + def eval_text_form_n( + self, text: String, form, n: Integer, evaluation: Evaluation, options: dict + ): + "TextPosition[text_String, form_, n_Integer, OptionsPattern[TextPosition]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + items = islice((_position(t) for t in _cases(doc, form)), n.value) + return ListExpression(*(from_python(item) for item in items)) + + +class TextSentences(_SpacyBuiltin): + """ + :Sentences:https://en.wikipedia.org/wiki/Sentence_(linguistics)\ + in a text (\ + :WMA: + https://reference.wolfram.com/language/ref/TextSentences.html\ + ) + + +
+    <dl>
+      <dt>'TextSentences[$string$]'
+      <dd>returns the sentences in $string$.
+
+      <dt>'TextSentences[$string$, $n$]'
+      <dd>returns the first $n$ sentences in $string$.
+    </dl>
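Sentence splitting is delegated to spaCy's doc.sents iterator; a sketch, again assuming the en_core_web_md model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("Night and day. Day and night.")
    print([sent.text for sent in doc.sents])
    # ['Night and day.', 'Day and night.']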
+ + >> TextSentences["Night and day. Day and night."] + = {Night and day., Day and night.} + + >> TextSentences["Night and day. Day and night.", 1] + = {Night and day.} + + >> TextSentences["Mr. Jones met Mrs. Jones."] + = {Mr. Jones met Mrs. Jones.} + """ + + summary_text = "list the sentences in a text" + + def eval(self, text: String, evaluation: Evaluation, options: dict): + "TextSentences[text_String, OptionsPattern[TextSentences]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression(*[String(sent.text) for sent in doc.sents]) + + def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): + "TextSentences[text_String, n_Integer, OptionsPattern[TextSentences]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + return ListExpression( + *itertools.islice((String(sent.text) for sent in doc.sents), n.value), + ) + + +class TextStructure(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextStructure.html + +
+    <dl>
+      <dt>'TextStructure[$text$, $form$]'
+      <dd>returns the grammatical structure of $text$ as $form$.
+    </dl>
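The constituent string is assembled by climbing spaCy's dependency heads (see _to_tree below). The raw material it works from looks like this, assuming the usual English model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    for token in nlp("The cat sat on the mat."):
        # _to_tree groups each token under the head it hangs off.
        print(token.text, token.pos_, "<- head:", token.head.text)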
+ + >> TextStructure["The cat sat on the mat.", "ConstituentString"] + = {(Sentence, ((Verb Phrase, (Noun Phrase, (Determiner, The), (Noun, cat)), (Verb, sat), (Prepositional Phrase, (Preposition, on), (Noun Phrase, (Determiner, the), (Noun, mat))), (Punctuation, .))))} + """ + + _root_pos = set(i for i, names in _pos_tags.items() if names[1]) + summary_text = "Retrieve the grammatical structure of a text" + + def _to_constituent_string(self, node): + token, children = node + name, phrase_name = _pos_tags.get(token.pos, ("Unknown", "Unknown Phrase")) + if not children: + return "(%s, %s)" % (name, token.text) + else: + sub = ", ".join( + self._to_constituent_string(next_node) for next_node in children + ) + return "(%s, %s)" % (phrase_name, sub) + + def _to_tree(self, tokens, path=[]): + roots = [] + i = 0 + while i < len(tokens): + token = tokens[i] + + if token in path: + roots.append((token, None)) + i += 1 + else: + root = token + while root.head != root and root.head not in path: + root = root.head + + sub = list(root.subtree) + + if root.pos not in self._root_pos: + roots.extend(self._to_tree(sub, path + [root])) + else: + roots.append((root, self._to_tree(sub, path + [root]))) + + i += len(sub) + + return roots + + def eval(self, text, evaluation: Evaluation, options: dict): + 'TextStructure[text_String, "ConstituentString", OptionsPattern[TextStructure]]' + doc = self._nlp(text.value, evaluation, options) + if doc: + tree = self._to_tree(list(doc)) + sents = ["(Sentence, (%s))" % self._to_constituent_string(x) for x in tree] + return ListExpression(*(String(sent) for sent in sents)) + + +class TextWords(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/TextWords.html + +
+    <dl>
+      <dt>'TextWords[$string$]'
+      <dd>returns the words in $string$.
+
+      <dt>'TextWords[$string$, $n$]'
+      <dd>returns the first $n$ words in $string$.
+    </dl>
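Word extraction is token iteration with punctuation filtered out, using the same spacy.parts_of_speech.PUNCT constant as the eval methods below; 'WordCount' further down is simply the length of this list. A sketch, assuming the en_core_web_md model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("Hickory, dickory, dock!")
    words = [t.text for t in doc if t.pos != spacy.parts_of_speech.PUNCT]
    print(words, len(words))  # ['Hickory', 'dickory', 'dock'] 3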
+ + >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."] + = {Hickory, dickory, dock, The, mouse, ran, up, the, clock} + """ + + summary_text = "list the words in a string" + + def eval( + self, text: String, evaluation: Evaluation, options: dict + ) -> Optional[ListExpression]: + "TextWords[text_String, OptionsPattern[WordCount]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + punctuation = spacy.parts_of_speech.PUNCT + return ListExpression( + *[String(word.text) for word in doc if word.pos != punctuation], + ) + + def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): + "TextWords[text_String, n_Integer, OptionsPattern[TextWords]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + punctuation = spacy.parts_of_speech.PUNCT + return ListExpression( + *itertools.islice( + (String(word.text) for word in doc if word.pos != punctuation), + n.value, + ), + ) diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py new file mode 100644 index 0000000..851d13f --- /dev/null +++ b/pymathics/natlang/spacy.py @@ -0,0 +1,249 @@ +# -*- coding: utf-8 -*- +# FIXME: split this up into smaller pieces + +""" +Spacy tools + +""" +import heapq +import re +from typing import Optional + +import spacy +from mathics.builtin.base import Builtin +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import strip_context +from spacy.tokens import Span + +no_doc = True + +# Mathics3 named entitiy names and their corresponding constants in spacy. +symbols = { + "Person": spacy.symbols.PERSON, + "Company": spacy.symbols.ORG, + "Quantity": spacy.symbols.QUANTITY, + "Number": spacy.symbols.CARDINAL, + "CurrencyAmount": spacy.symbols.MONEY, + "Country": spacy.symbols.GPE, # also includes cities and states + "City": spacy.symbols.GPE, # also includes countries and states +} + +# Part of speech tags and their public interface names in Mathics +# see http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf +_pos_tags = { + spacy.parts_of_speech.ADJ: ("Adjective", ""), + spacy.parts_of_speech.ADP: ("Preposition", "Prepositional Phrase"), + spacy.parts_of_speech.ADV: ("Adverb", ""), + spacy.parts_of_speech.CONJ: ("Conjunct", ""), + spacy.parts_of_speech.DET: ("Determiner", ""), + spacy.parts_of_speech.INTJ: ("Interjection", ""), + spacy.parts_of_speech.NOUN: ("Noun", "Noun Phrase"), + spacy.parts_of_speech.NUM: ("Number", ""), + spacy.parts_of_speech.PART: ("Particle", ""), + spacy.parts_of_speech.PRON: ("Pronoun", ""), + spacy.parts_of_speech.PROPN: ("Proposition", ""), + spacy.parts_of_speech.PUNCT: ("Punctuation", ""), + spacy.parts_of_speech.SCONJ: ("Sconj", ""), + spacy.parts_of_speech.SYM: ("Symbol", ""), + spacy.parts_of_speech.VERB: ("Verb", "Verb Phrase"), + spacy.parts_of_speech.X: ("X", ""), + spacy.parts_of_speech.EOL: ("EOL", ""), + spacy.parts_of_speech.SPACE: ("Space", ""), +} + + +def _cases(doc, form): + if isinstance(form, String): + generators = [_forms.get(form.value)] + elif form.get_head_name() == "System`Alternatives": + if not all(isinstance(f, String) for f in form.elements): + return # error + generators = [_forms.get(f.value) for f in form.elements] + elif form.get_head_name() == "PyMathics`Containing": + if len(form.elements) == 2: + for t in _containing(doc, *form.elements): + yield t + return + else: + return # error + else: + return # error + + def try_next(iterator): + try: + return next(iterator) + except StopIteration: + return None + + feeds = 
[] + for i, iterator in enumerate([iter(generator(doc)) for generator in generators]): + t = try_next(iterator) + if t: + feeds.append((_position(t), i, t, iterator)) + heapq.heapify(feeds) + while feeds: + pos, i, token, iterator = heapq.heappop(feeds) + yield token + t = try_next(iterator) + if t: + heapq.heappush(feeds, (_position(t), i, t, iterator)) + + +def _containing(doc, outer, inner): + if not isinstance(outer, String): + return # error + outer_generator = _forms.get(outer.value) + inner_iter = _cases(doc, inner) + inner_start = None + produce_t = False + try: + for t in outer_generator(doc): + start, end = _position(t) + if inner_start is not None and inner_start < end: + produce_t = True + if produce_t: + yield t + produce_t = False + while True: + inner_start, inner_end = _position(next(inner_iter)) + if inner_end > start: + break + if inner_start < end: + produce_t = True + except StopIteration: + pass + + +def _fragments(doc, sep): + start = 0 + for i, token in enumerate(doc): + if sep.match(token.text): + yield Span(doc, start, i) + start = i + 1 + end = len(doc) + if start < end: + yield Span(doc, start, end) + + +def _make_forms(): + forms = { + "Word": lambda doc: (token for token in doc), + "Sentence": lambda doc: (sent for sent in doc.sents), + "Paragraph": lambda doc: _fragments(doc, re.compile(r"^[\n][\n]+$")), + "Line": lambda doc: _fragments(doc, re.compile(r"^[\n]$")), + "URL": lambda doc: (token for token in doc if token.orth_.like_url()), + "EmailAddress": lambda doc: ( + token for token in doc if token.orth_.like_email() + ), + } + + def filter_named_entity(label): + def generator(doc): + for ent in doc.ents: + if ent.label == label: + yield ent + + return generator + + def filter_pos(pos): + def generator(doc): + for token in doc: + if token.pos == pos: + yield token + + return generator + + for name, symbol in symbols.items(): + forms[name] = filter_named_entity(symbol) + + for tag, names in _pos_tags.items(): + name, phrase_name = names + forms[name] = filter_pos(tag) + + return forms + + +# forms are everything one can use in TextCases[] or TextPosition[]. +_forms = _make_forms() + + +def _position(t): + if isinstance(t, Span): + i = t.doc[t.start] + r = t.doc[t.end - 1] + return 1 + i.idx, r.idx + len(r.text) + else: + return 1 + t.idx, t.idx + len(t.text) + + +class _SpacyBuiltin(Builtin): + requires = ("spacy",) + + options = { + "Language": '"English"', + } + + messages = { + "runtime": "Spacy gave the following error: ``", + "lang": 'Language "`1`" is currently not supported with `2`[].', + } + + _language_codes = { + "English": "en", + "German": "de", + } + + _spacy_instances = {} + + def _load_spacy(self, evaluation: Evaluation, options: dict): + language_code = None + language_name = self.get_option(options, "Language", evaluation) + if language_name is None: + language_name = String("Undefined") + if isinstance(language_name, String): + language_code = _SpacyBuiltin._language_codes.get(language_name.value) + if not language_code: + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None + + instance = _SpacyBuiltin._spacy_instances.get(language_code) + if instance: + return instance + + try: + instance = spacy.load(f"{language_code}_core_web_md") + + # "via" parameter no longer exists. 
This was used in MATHICS3_SPACY_DATA + # if "MATHICS3_SPACY_DATA" in os.environ: + # instance = spacy.load( + # language_code, via=os.environ["MATHICS3_SPACY_DATA"] + # ) + # else: + # instance = spacy.load(f"{language_code}_core_web_md") + + _SpacyBuiltin._spacy_instances[language_code] = instance + return instance + except RuntimeError as e: + evaluation.message(self.get_name(), "runtime", str(e)) + return None + + def _nlp(self, text, evaluation, options) -> Optional[spacy.tokens.doc.Doc]: + nlp = self._load_spacy(evaluation, options) + if not nlp: + return None + return nlp(text) + + def _is_stop_lambda(self, evaluation: Evaluation, options: dict): + nlp = self._load_spacy(evaluation, options) + if not nlp: + return None + + vocab = nlp.vocab + + def is_stop(word): + return vocab[word].is_stop + + return is_stop diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py new file mode 100644 index 0000000..87c7f57 --- /dev/null +++ b/pymathics/natlang/textual_analysis.py @@ -0,0 +1,427 @@ +# -*- coding: utf-8 -*- +""" +Text analysis functions + +:See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html + +""" + +import re +from itertools import islice +from typing import Optional + +import enchant +import nltk +import spacy +from mathics.builtin.atomic.strings import anchor_pattern, to_regex +from mathics.builtin.base import Builtin +from mathics.core.atoms import Integer, Real, String +from mathics.core.evaluation import Evaluation +from mathics.core.expression import Expression +from mathics.core.list import ListExpression +from mathics.core.symbols import SymbolFalse, SymbolList, SymbolTrue +from mathics.core.systemsymbols import SymbolStringExpression +from mathics.eval.nevaluator import eval_N + +from pymathics.natlang.spacy import _SpacyBuiltin +from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries + + +class Containing(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/Containing.html + +
+    <dl>
+      <dt>'Containing[$outer$, $inner$]'
+      <dd>represents an object of the type $outer$ containing objects\
+      of type $inner$.
+    </dl>
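_containing in spacy.py implements this by interleaving outer and inner matches on their character positions; a simplified equivalent of that walk, assuming the usual English model:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("This is a pencil. This is another pencil from England.")
    # Keep each outer span (sentence) that overlaps an inner match (GPE entity):
    inner = [(e.start_char, e.end_char) for e in doc.ents if e.label_ == "GPE"]
    print([s.text for s in doc.sents
           if any(s.start_char <= a < s.end_char for a, _ in inner)])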
+ + """ + + summary_text = "Specify a container for matching" + + +class DictionaryLookup(_WordListBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/DictionaryLookup.html + +
+    <dl>
+      <dt>'DictionaryLookup[$word$]'
+      <dd>looks up words that match the given $word$ or pattern.
+
+      <dt>'DictionaryLookup[$word$, $n$]'
+      <dd>looks up the first $n$ words that match the given $word$ or pattern.
+    </dl>
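The lookup compiles the string pattern to an anchored, case-insensitive regex and scans WordNet's lemma names, where underscores stand in for spaces. Approximately, assuming the NLTK wordnet corpus has been downloaded via nltk.download():

    import re
    from nltk.corpus import wordnet

    pattern = re.compile(r"bake.*\Z", re.IGNORECASE)
    matches = sorted(name.replace("_", " ")
                     for name in wordnet.all_lemma_names()
                     if pattern.match(name))
    print(matches[:3])  # e.g. ['bake', 'bakeapple', 'baked']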
+ + >> DictionaryLookup["bake" ~~ ___, 3] + = {bake, bakeapple, baked} + """ + + summary_text = "Lookup words matching a pattern in a dictionary" + + def compile(self, pattern, evaluation): + re_patt = to_regex(pattern, evaluation) + if re_patt is None: + evaluation.message( + "StringExpression", + "invld", + pattern, + Expression(SymbolStringExpression, pattern), + ) + return + re_patt = anchor_pattern(re_patt) + + return re.compile(re_patt, flags=re.IGNORECASE) + + def search(self, dictionary_words, pattern): + for dictionary_word in dictionary_words: + if pattern.match(dictionary_word): + yield dictionary_word.replace("_", " ") + + def lookup(self, language_name, word, n, evaluation): + pattern = self.compile(word, evaluation) + if pattern: + dictionary_words = self._words(language_name, "All", evaluation) + if dictionary_words is not None: + matches = self.search(dictionary_words, pattern) + if n is not None: + matches = islice(matches, 0, n) + return ListExpression(*(String(match) for match in sorted(matches))) + + def eval_english(self, word, evaluation): + "DictionaryLookup[word_]" + return self.lookup(String("English"), word, None, evaluation) + + def eval_language(self, language, word, evaluation): + "DictionaryLookup[{language_String, word_}]" + return self.lookup(language, word, None, evaluation) + + def eval_english_n(self, word, n, evaluation): + "DictionaryLookup[word_, n_Integer]" + return self.lookup(String("English"), word, n.value, evaluation) + + def eval_language_n(self, language, word, n, evaluation): + "DictionaryLookup[{language_String, word_}, n_Integer]" + return self.lookup(language, word, n.value, evaluation) + + +class DictionaryWordQ(_WordNetBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/DictionaryWordQ.html + +
+    <dl>
+      <dt>'DictionaryWordQ[$word$]'
+      <dd>returns 'True' if $word$ is a word usually found in dictionaries, and 'False' otherwise.
+    </dl>
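Stripped of the Language option handling done by _WordNetBuiltin, the test is just "does WordNet know any synset for this word?"; dictionary_word_q is an illustrative helper, not part of the module:

    from nltk.corpus import wordnet  # assumes the wordnet corpus is installed

    def dictionary_word_q(word: str) -> bool:
        return bool(wordnet.synsets(word.lower()))

    print(dictionary_word_q("couch"), dictionary_word_q("meep-meep"))  # True False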
+ + >> DictionaryWordQ["couch"] + = True + + >> DictionaryWordQ["meep-meep"] + = False + """ + + summary_text = "Check if a word is in the dictionary" + + def eval(self, word, evaluation: Evaluation, options: dict): + "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" + if not isinstance(word, String): + return False + wordnet, language_code = self._load_wordnet( + evaluation, self._language_name(evaluation, options) + ) + if wordnet: + if list(wordnet.synsets(word.value.lower(), None, language_code)): + return SymbolTrue + else: + return SymbolFalse + + +class SpellingCorrectionList(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/SpellingCorrectionList.html + +
+    <dl>
+      <dt>'SpellingCorrectionList[$word$]'
+      <dd>returns a list of suggestions for spelling-corrected versions of $word$.
+    </dl>
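In plain pyenchant, the eval method below roughly amounts to:

    import enchant

    d = enchant.Dict("en_US")  # language codes as in the _languages table below
    word = "hipopotamus"
    print([word] if d.check(word) else d.suggest(word))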
+ + Results may differ depending on which dictionaries can be found by enchant. + + >> SpellingCorrectionList["hipopotamus"] + = {hippopotamus...} + """ + + options = { + "Language": '"English"', + } + + messages = { + "lang": "SpellingCorrectionList does not support `1` as a language.", + } + + _languages = { + "English": "en_US", # en_GB, en_AU + "German": "de_DE", + "French": "fr_FR", + } + + _dictionaries = {} + + summary_text = "Look for spelling correction candidates of a word" + + def eval( + self, word: String, evaluation: Evaluation, options: dict + ) -> Optional[ListExpression]: + "SpellingCorrectionList[word_String, OptionsPattern[SpellingCorrectionList]]" + + language_name = self.get_option(options, "Language", evaluation) + if not isinstance(language_name, String): + return + language_code = SpellingCorrectionList._languages.get(language_name.value, None) + if not language_code: + evaluation.message("SpellingCorrectionList", "lang", language_name) + return + + d = SpellingCorrectionList._dictionaries.get(language_code, None) + if not d: + d = enchant.Dict(language_code) + SpellingCorrectionList._dictionaries[language_code] = d + + py_word = word.value + + if d.check(py_word): + return ListExpression(word) + else: + return ListExpression(*(String(word) for word in d.suggest(py_word))) + + +class WordCount(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordCount.html + +
+    <dl>
+      <dt>'WordCount[$string$]'
+      <dd>returns the number of words in $string$.
+    </dl>
+ + >> WordCount["A long time ago"] + = 4 + """ + + summary_text = "Count the words in a text" + + def eval(self, text, evaluation: Evaluation, options: dict): + "WordCount[text_String, OptionsPattern[WordCount]]" + doc = self._nlp(text.value, evaluation, options) + if doc: + punctuation = spacy.parts_of_speech.PUNCT + return Integer(sum(1 for word in doc if word.pos != punctuation)) + + +class WordFrequency(_SpacyBuiltin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordFrequency.html + +
+    <dl>
+      <dt>'WordFrequency[$text$, $word$]'
+      <dd>returns the relative frequency of $word$ in $text$.
+    </dl>
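The frequency is a case-normalized token count divided by the total token count; a sketch under the same model assumption as the other spaCy examples:

    import spacy

    nlp = spacy.load("en_core_web_md")
    doc = nlp("Apple tree. Apple.")
    hits = sum(1 for t in doc if t.text.lower() == "apple")
    print(hits / len(doc))  # matches divided by all tokens, punctuation included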
+ + $word$ may also specify multiple words using $a$ | $b$ | ... + + ## Problem with import for certain characters in the text. + ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; + >> text = "I have a dairy cow, it's not just any cow. \ + She gives me milkshake, oh what a salty cow. She is the best\ + cow in the county."; + + >> WordFrequency[text, "a" | "the"] + = 0.114286 + + >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True] + = 0.5 + """ + + options = _SpacyBuiltin.options + options.update({"IgnoreCase": "False"}) + summary_text = "Retrieve the frequency of a word in a text" + + def eval( + self, text: String, word, evaluation: Evaluation, options: dict + ) -> Optional[Expression]: + "WordFrequency[text_String, word_, OptionsPattern[WordFrequency]]" + doc = self._nlp(text.value, evaluation, options) + if not doc: + return + if isinstance(word, String): + words = set([word.value]) + elif word.get_head_name() == "System`Alternatives": + if not all(isinstance(a, String) for a in word.elements): + return # error + words = set(a.value for a in word.elements) + else: + return # error + + ignore_case = self.get_option(options, "IgnoreCase", evaluation) is SymbolTrue + if ignore_case: + words = [w.lower() for w in words] + n = 0 + for token in doc: + token_text = token.text + if ignore_case: + token_text = token_text.lower() + if token_text in words: + n += 1 + return eval_N(Integer(n) / Integer(len(doc)), evaluation) + + +class WordSimilarity(_SpacyBuiltin): + """ + + :WMA: + https://reference.wolfram.com/language/ref/WordSimilarity.html + +
+    <dl>
+      <dt>'WordSimilarity[$text1$, $text2$]'
+      <dd>returns a real-valued measure of semantic similarity of two texts or words.
+
+      <dt>'WordSimilarity[{$text1$, $i1$}, {$text2$, $j1$}]'
+      <dd>returns a measure of similarity of two words within two texts.
+
+      <dt>'WordSimilarity[{$text1$, {$i1$, $i2$, ...}}, {$text2$, {$j1$, $j2$, ...}}]'
+      <dd>returns a measure of similarity of multiple words within two texts.
+    </dl>
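The measure is spaCy's document-vector cosine, which needs a model shipped with word vectors (the _md models qualify); in plain spaCy:

    import spacy

    nlp = spacy.load("en_core_web_md")
    print(nlp("car").similarity(nlp("train")))  # roughly 0.44 with this model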
+ + >> NumberForm[WordSimilarity["car", "train"], 3] + = 0.439 + + >> NumberForm[WordSimilarity["car", "hedgehog"], 3] + = 0.195 + + >> NumberForm[WordSimilarity[{"An ocean full of water.", {2, 2}}, { "A desert full of sand.", {2, 5}}], 3] + = {0.505, 0.481} + """ + + messages = merge_dictionaries( + _SpacyBuiltin.messages, + { + "txtidx": "Index `1` in position `2` must be between 1 and `3`.", + "idxfmt": "Indices must be integers or lists of integers of the same length.", + }, + ) + summary_text = "Measure the similarity of two texts" + + def eval( + self, text1: String, text2: String, evaluation: Evaluation, options: dict + ) -> Optional[Real]: + "WordSimilarity[text1_String, text2_String, OptionsPattern[WordSimilarity]]" + doc1 = self._nlp(text1.value, evaluation, options) + if doc1: + doc2 = self._nlp(text2.value, evaluation, options) + if doc2: + return Real(doc1.similarity(doc2)) + + def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict): + "WordSimilarity[{text1_String, i1_}, {text2_String, i2_}, OptionsPattern[WordSimilarity]]" + doc1 = self._nlp(text1.value, evaluation, options) + if doc1: + if text2.value == text1.value: + doc2 = doc1 + else: + doc2 = self._nlp(text2.value, evaluation, options) + if doc2: + if i1.get_head() is SymbolList and i2.get_head() is SymbolList: + if len(i1.elements) != len(i2.elements): + evaluation.message("TextSimilarity", "idxfmt") + return + if any( + not all(isinstance(i, Integer) for i in li.elements) + for li in (i1, i2) + ): + evaluation.message("TextSimilarity", "idxfmt") + return + indices1 = [i.value for i in i1.elements] + indices2 = [i.value for i in i2.elements] + multiple = True + elif isinstance(i1, Integer) and isinstance(i2, Integer): + indices1 = [i1.value] + indices2 = [i2.value] + multiple = False + else: + evaluation.message("TextSimilarity", "idxfmt") + return + + for index1, index2 in zip(indices1, indices2): + for i, pos, doc in zip((index1, index2), (1, 2), (doc1, doc2)): + if i < 1 or i > len(doc): + evaluation.message( + "TextSimilarity", "txtidx", i, pos, len(doc) + ) + return + + result = [ + Real(doc1[j1 - 1].similarity(doc2[j2 - 1])) + for j1, j2 in zip(indices1, indices2) + ] + + if multiple: + return ListExpression(*result) + else: + return result[0] + + +class WordStem(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/WordStem.html + +
+    <dl>
+      <dt>'WordStem[$word$]'
+      <dd>returns a stemmed form of $word$, thereby reducing an inflected form to its root.
+
+      <dt>'WordStem[{$word1$, $word2$, ...}]'
+      <dd>returns the stemmed form of each word in the list, thereby reducing inflected forms to their roots.
+    </dl>
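'WordStem' wraps NLTK's Porter stemmer directly; the equivalent plain call is:

    from nltk.stem.porter import PorterStemmer

    stemmer = PorterStemmer()
    print([stemmer.stem(w) for w in ["heroes", "roses", "knights", "queens"]])
    # ['hero', 'rose', 'knight', 'queen']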
+ + >> WordStem["towers"] + = tower + + >> WordStem[{"heroes", "roses", "knights", "queens"}] + = {hero, rose, knight, queen} + """ + + _stemmer = None + + requires = ("nltk",) + summary_text = "Retrieve the stem of a word" + + @staticmethod + def _get_porter_stemmer(): + if WordStem._stemmer is None: + WordStem._stemmer = nltk.stem.porter.PorterStemmer() + return WordStem._stemmer + + @staticmethod + def porter(w): + return WordStem._get_porter_stemmer().stem(w) + + def eval(self, word: String, evaluation: Evaluation) -> String: + "WordStem[word_String]" + stemmer = self._get_porter_stemmer() + return String(stemmer.stem(word.value)) + + def eval_list(self, words, evaluation: Evaluation) -> Optional[ListExpression]: + "WordStem[words_List]" + if all(isinstance(w, String) for w in words.elements): + stemmer = self._get_porter_stemmer() + return ListExpression( + *[String(stemmer.stem(w.value)) for w in words.elements] + ) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py new file mode 100644 index 0000000..4bb0d73 --- /dev/null +++ b/pymathics/natlang/translation.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- + + +""" +Language translation + +""" + +from typing import Union + +import langid # see https://github.com/saffsd/langid.py +import pycountry +from mathics.builtin.base import Builtin +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import Symbol +from mathics.core.systemsymbols import SymbolFailed + + +class LanguageIdentify(Builtin): + """ + :WMA: + https://reference.wolfram.com/language/ref/LanguageIdentify.html + +
+    <dl>
+      <dt>'LanguageIdentify[$text$]'
+      <dd>returns the name of the language used in $text$.
+    </dl>
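The implementation pairs langid's classifier with pycountry's ISO-code lookup; essentially:

    import langid
    import pycountry

    code, _score = langid.classify("eins zwei drei")  # e.g. ("de", ...)
    language = pycountry.languages.get(alpha_2=code)
    print(language.name if language else "unknown")   # German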
+ + >> LanguageIdentify["eins zwei drei"] + = German + """ + + summary_text = "determine the predominant human language in a string" + + def eval(self, text: String, evaluation: Evaluation) -> Union[Symbol, String]: + "LanguageIdentify[text_String]" + + # an alternative: https://github.com/Mimino666/langdetect + + code, _ = langid.classify(text.value) + language = pycountry.languages.get(alpha_2=code) + if language is None: + return SymbolFailed + return String(language.name) diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py new file mode 100644 index 0000000..2c3b33c --- /dev/null +++ b/pymathics/natlang/util.py @@ -0,0 +1,328 @@ +# -*- coding: utf-8 -*- + +""" +utils +""" +import re +from itertools import chain + +import nltk +from mathics.builtin.base import Builtin, MessageException +from mathics.builtin.codetables import iso639_3 +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import strip_context + +no_doc = True + + +_wordnet_pos_to_type = {} +_wordnet_type_to_pos = {} + + +def _init_nltk_maps(): + _wordnet_pos_to_type.update( + { + nltk.corpus.wordnet.VERB: "Verb", + nltk.corpus.wordnet.NOUN: "Noun", + nltk.corpus.wordnet.ADJ: "Adjective", + nltk.corpus.wordnet.ADJ_SAT: "Adjective", + nltk.corpus.wordnet.ADV: "Adverb", + } + ) + _wordnet_type_to_pos.update( + { + "Verb": [nltk.corpus.wordnet.VERB], + "Noun": [nltk.corpus.wordnet.NOUN], + "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], + "Adverb": [nltk.corpus.wordnet.ADV], + } + ) + + +def _parse_nltk_lookup_error(e): + m = re.search(r"Resource '([^']+)' not found\.", str(e)) + if m: + return m.group(1) + else: + return "unknown" + + +def merge_dictionaries(a, b): + c = a.copy() + c.update(b) + return c + + +class _WordNetBuiltin(Builtin): + requires = ("nltk",) + + options = { + "Language": '"English"', + } + + messages = { + "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", + "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', + # 'load': 'Loading `1` word data. 
Please wait.', + "wordnet": "WordNet returned the following error: ``", + } + + _wordnet_instances = {} + + def _language_name(self, evaluation: Evaluation, options: dict): + return self.get_option(options, "Language", evaluation) + + def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): + try: + wordnet_resource = nltk.data.find("corpora/wordnet2022") + _init_nltk_maps() + except LookupError: + evaluation.message(self.get_name(), "package", "wordnet2022") + return None + + try: + omw = nltk.corpus.util.LazyCorpusLoader( + "omw", + nltk.corpus.reader.CorpusReader, + r".*/wn-data-.*\.tab", + encoding="utf8", + ) + except LookupError: + evaluation.message(self.get_name(), "package", "omw") + return None + + wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) + + if language_code not in wordnet.langs(): + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None + + return wordnet + + def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: + language_code = None + if isinstance(language_name, String): + language_code = iso639_3.get(language_name.value) + if not language_code: + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None, None + + wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) + if not wordnet: + try: + wordnet = self._init_wordnet(evaluation, language_name, language_code) + except LookupError as e: + evaluation.message( + self.get_name(), "package", _parse_nltk_lookup_error(e) + ) + return None, None + + _WordNetBuiltin._wordnet_instances[language_code] = wordnet + + return wordnet, language_code + + @staticmethod + def _decode_synset(syn): + what, pos, nr = (syn.name().split(".") + ["01"])[:3] + return what.replace("_", " "), pos, nr + + @staticmethod + def _capitalize(s) -> str: + return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) + + @staticmethod + def _underscore(s) -> str: + return re.sub( + r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s + ).lower() + + @staticmethod + def _list_syn_form(syn): + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + + def containers(): + for name in syn.lemma_names(): + if name != what: + yield name + + for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): + container, _, _ = _WordNetBuiltin._decode_synset(s) + yield container + + for lemma in WordProperty._synonymous_lemmas(syn): + yield lemma.name() + + return what, _wordnet_pos_to_type[pos], containers + + @staticmethod + def syn(syn, wordnet, language_code) -> tuple: + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): + if s == syn: + return form + return what, pos, "Unknown" + + @staticmethod + def _iterate_senses(word, wordnet, language_code): + if not word: + return + + used = set() + output_word = word.replace("_", " ") + + for syn in wordnet.synsets(word, None, language_code): + if syn.lexname() in ("noun.location", "noun.person"): + continue # ignore + + what, pos, containers = _WordNetBuiltin._list_syn_form(syn) + + for container in containers(): + container = container.replace("_", " ") + if container != word: + if container not in used: + used.add(container) + yield syn, ( + output_word, + pos, + _WordNetBuiltin._capitalize(container), + ) + break + + def _senses(self, word, wordnet, language_code): + if isinstance(word, tuple): # find forms 
like ["tree", "Noun", "WoodyPlant"] + for syn, form in _WordNetBuiltin._iterate_senses( + word[0], wordnet, language_code + ): + if form == word: + return [[syn, form]] + else: # find word given as strings, e.g. "tree" + word = wordnet.morphy(word) # base form, e.g. trees -> tree + return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) + + +class _WordListBuiltin(_WordNetBuiltin): + _dictionary = {} + + def _words(self, language_name, ilk, evaluation): + wordnet, language_code = self._load_wordnet(evaluation, language_name) + + if not wordnet: + return + + key = "%s.%s" % (language_code, ilk) + words = self._dictionary.get(key) + if not words: + try: + if ilk == "All": + filtered_pos = [None] + else: + try: + filtered_pos = _wordnet_type_to_pos[ilk] + except KeyError: + evaluation.message( + self.get_name(), + "wordnet", + "type: %s is should be in %s" + % (ilk._wordnet_type_to_pos.keys()), + ) + return + + words = [] + for pos in filtered_pos: + words.extend(list(wordnet.all_lemma_names(pos, language_code))) + words.sort() + self._dictionary[key] = words + except nltk.corpus.reader.wordnet.WordNetError as err: + evaluation.message(self.get_name(), "wordnet", str(err)) + return + + return words + + +class WordProperty: + def __init__(self, syn_form, wordnet, language_code): + self.syn_form = syn_form + self.wordnet = wordnet + self.language_code = language_code + + def syn(self, syn): + return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) + + @staticmethod + def _synonymous_lemmas(syn): + first_lemma = syn.name().split(".")[0] + return (s for s in syn.lemmas() if s.name() != first_lemma) + + @staticmethod + def _antonymous_lemmas(syn): + return (s for lemma in syn.lemmas() for s in lemma.antonyms()) + + def definitions(self, syn, desc): + return syn.definition() + + def examples(self, syn, desc): + return syn.examples() + + def synonyms(self, syn, desc): + _, pos, container = desc + return [ + self.syn_form((s.name().replace("_", " "), pos, container)) + for s in WordProperty._synonymous_lemmas(syn) + ] + + def antonyms(self, syn, desc): + return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] + + def broader_terms(self, syn, desc): + return [self.syn(s) for s in syn.hypernyms()] + + def narrower_terms(self, syn, desc): + return [self.syn(s) for s in syn.hyponyms()] + + def usage_field(self, syn, desc): + return syn.usage_domains() + + def whole_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_holonyms()] + + def part_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_meronyms()] + + def material_terms(self, syn, desc): + return [self.syn(s) for s in syn.substance_meronyms()] + + def word_net_id(self, syn, desc): + return syn.offset() + + def entailed_terms(self, syn, desc): # e.g. fall to condense + return [self.syn(s) for s in syn.entailments()] + + def causes_terms(self, syn, desc): # e.g. 
ignite to burn + return [self.syn(s) for s in syn.causes()] + + def inflected_forms(self, syn, desc): + try: + word, pos, _ = desc + if pos == "Verb": + from pattern.en import lexeme + + return [w for w in reversed(lexeme(word)) if w != word] + elif pos == "Noun": + from pattern.en import pluralize + + return [pluralize(word)] + elif pos == "Adjective": + from pattern.en import comparative, superlative + + return [comparative(word), superlative(word)] + else: + return [] + except ImportError: + raise MessageException( + "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" + ) diff --git a/setup.py b/setup.py index 57d7097..ef44e46 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import sys -import platform import os import os.path as osp -from setuptools import setup, find_namespace_packages +import platform +import sys + +from setuptools import find_namespace_packages, setup # Ensure user has the correct Python version if sys.version_info < (3, 6): diff --git a/test/consistency-and-style/test_summary_text.py b/test/consistency-and-style/test_summary_text.py index dd99c95..05deaa4 100644 --- a/test/consistency-and-style/test_summary_text.py +++ b/test/consistency-and-style/test_summary_text.py @@ -5,12 +5,11 @@ import pkgutil import pytest - -from pymathics.natlang import __file__ as module_initfile_path from mathics.builtin import name_is_builtin_symbol from mathics.builtin.base import Builtin from mathics.doc.common_doc import skip_doc +from pymathics.natlang import __file__ as module_initfile_path # Get file system path name for mathics.builtin module_path = osp.dirname(module_initfile_path) From f5a4282dab89caca0ee188aaec2ca5a3d13ab488 Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 18:09:04 -0300 Subject: [PATCH 02/14] adding comments --- pymathics/natlang/linguistic_data.py | 1 + pymathics/natlang/normalization.py | 4 ++++ pymathics/natlang/textual_analysis.py | 3 ++- pymathics/natlang/translation.py | 7 +++++++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 942ff48..e7b7d40 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -5,6 +5,7 @@ See :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ +# This module uses both nltk and spacy. Maybe it makes sense to split this further. # TODO: Complete me diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 17d7e31..aaadd50 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -2,6 +2,10 @@ Text normalization +See :WMA: https://reference.wolfram.com/language/guide/TextNormalization.html guide. + + +This module uses spacy as a backend. """ import itertools from itertools import islice diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 87c7f57..98657e4 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -3,9 +3,10 @@ Text analysis functions :See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html - """ +# This module uses both enchant, nltk and spacy. Maybe we want to split this further. 
+ import re from itertools import islice from typing import Optional diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 4bb0d73..96b73d8 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -4,8 +4,15 @@ """ Language translation + """ +# This is under Text Normalization in WR. But also in Natural Language Processing, +# and Linguistic Data. I put here because is the only module tuat uses langid and pycountry +# modules. +# +# TODO: WordTranslation, TextTranslation + from typing import Union import langid # see https://github.com/saffsd/langid.py From 63e72dd7e0a035b8ed89ad58406a8a263a50dc39 Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 19:18:16 -0300 Subject: [PATCH 03/14] black --- pymathics/natlang/translation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 96b73d8..1bca23e 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -7,7 +7,7 @@ """ -# This is under Text Normalization in WR. But also in Natural Language Processing, +# This is under Text Normalization in WR. But also in Natural Language Processing, # and Linguistic Data. I put here because is the only module tuat uses langid and pycountry # modules. # From 29add1a19aa5be2bc1ac7afed90a6bd596182e76 Mon Sep 17 00:00:00 2001 From: mmatera Date: Mon, 20 Feb 2023 20:02:21 -0300 Subject: [PATCH 04/14] fix summaries --- pymathics/natlang/linguistic_data.py | 4 ++-- pymathics/natlang/normalization.py | 6 +++--- pymathics/natlang/spacy.py | 4 +++- pymathics/natlang/textual_analysis.py | 4 ++-- pymathics/natlang/translation.py | 2 +- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index e7b7d40..e2d27ee 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -58,7 +58,7 @@ class Pluralize(Builtin): """ requires = ("pattern",) - summary_text = "Retrieve the pluralized form of a word" + summary_text = "retrieve the pluralized form of a word" def eval(self, word, evaluation): "Pluralize[word_String]" @@ -332,7 +332,7 @@ class WordList(_WordListBuiltin): = 9.3 """ - summary_text = "retrieve a list of common words" + summary_text = "list of common words" def eval(self, evaluation: Evaluation, options: dict): "WordList[OptionsPattern[]]" diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index aaadd50..aeca510 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -102,7 +102,7 @@ class TextCases(_SpacyBuiltin): """ - summary_text = "List the cases of words of a certain form in a text" + summary_text = "list the cases of words of a certain form in a text" def eval_string_form( self, text: String, form, evaluation: Evaluation, options: dict @@ -136,7 +136,7 @@ class TextPosition(_SpacyBuiltin): = {{1, 9}, {15, 20}} """ - summary_text = "List the position of words of a given form in a text" + summary_text = "list the position of words of a given form in a text" def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" @@ -215,7 +215,7 @@ class TextStructure(_SpacyBuiltin): """ _root_pos = set(i for i, names in _pos_tags.items() if names[1]) - summary_text = "Retrieve the grammatical structure of a text" + summary_text = "retrieve the grammatical 
structure of a text" def _to_constituent_string(self, node): token, children = node diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py index 851d13f..78c38b6 100644 --- a/pymathics/natlang/spacy.py +++ b/pymathics/natlang/spacy.py @@ -1,10 +1,12 @@ # -*- coding: utf-8 -*- -# FIXME: split this up into smaller pieces """ Spacy tools """ + +# TODO: move here low-level implementation depending on spacy + import heapq import re from typing import Optional diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 98657e4..c1f74d2 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -253,7 +253,7 @@ class WordFrequency(_SpacyBuiltin): options = _SpacyBuiltin.options options.update({"IgnoreCase": "False"}) - summary_text = "Retrieve the frequency of a word in a text" + summary_text = "retrieve the frequency of a word in a text" def eval( self, text: String, word, evaluation: Evaluation, options: dict @@ -402,7 +402,7 @@ class WordStem(Builtin): _stemmer = None requires = ("nltk",) - summary_text = "Retrieve the stem of a word" + summary_text = "retrieve the stem of a word" @staticmethod def _get_porter_stemmer(): diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 1bca23e..0da39ad 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -8,7 +8,7 @@ """ # This is under Text Normalization in WR. But also in Natural Language Processing, -# and Linguistic Data. I put here because is the only module tuat uses langid and pycountry +# and Linguistic Data. I put here because is the only module that uses langid and pycountry # modules. # # TODO: WordTranslation, TextTranslation From 2b10a3cd6ef8282d01be1da7efdbc4d86e8ac152 Mon Sep 17 00:00:00 2001 From: mmatera Date: Tue, 21 Feb 2023 23:30:19 -0300 Subject: [PATCH 05/14] rocky's comments fixed --- pymathics/natlang/linguistic_data.py | 21 +++++++++++++-------- pymathics/natlang/normalization.py | 7 ++++--- pymathics/natlang/textual_analysis.py | 20 +++++++++++--------- pymathics/natlang/translation.py | 3 ++- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index e2d27ee..73d444c 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -2,7 +2,7 @@ """ Linguistic Data -See :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. +See :WMA link:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ # This module uses both nltk and spacy. Maybe it makes sense to split this further. @@ -23,6 +23,7 @@ # from mathics.builtin.codetables import iso639_3 from mathics.builtin.numbers.randomnumbers import RandomEnv from mathics.core.atoms import String +from mathics.core.element import ElementsProperties from mathics.core.convert.expression import Expression, to_expression from mathics.core.evaluation import Evaluation from mathics.core.list import ListExpression @@ -39,13 +40,15 @@ merge_dictionaries, ) +sort_order = "Linguistic Data" + SymbolDictionaryLookup = Symbol("Pymathics`Natlang`DictionaryLookup") StringNotAvailable = String("NotAvailable") class Pluralize(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/Pluralize.html
@@ -68,7 +71,7 @@ def eval(self, word, evaluation): class RandomWord(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/RandomWord.html
@@ -122,7 +125,7 @@ def eval_type_n(self, type, n, evaluation: Evaluation, options: dict): class WordData(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordData.html
@@ -288,7 +291,7 @@ def eval_property_form( class WordDefinition(_WordNetBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordDefinition.html
@@ -317,7 +320,7 @@ def eval(self, word, evaluation: Evaluation, options: dict): class WordList(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordList.html
@@ -338,7 +341,9 @@ def eval(self, evaluation: Evaluation, options: dict): "WordList[OptionsPattern[]]" words = self._words(self._language_name(evaluation, options), "All", evaluation) if words is not None: - return ListExpression(*(String(word) for word in words)) + words_mathics = (String(word) for word in words) + result = ListExpression(*words_mathics, elements_properties=ElementsProperties(False, False, True)) + return result def eval_type(self, wordtype, evaluation: Evaluation, options: dict): "WordList[wordtype_String, OptionsPattern[]]" @@ -348,4 +353,4 @@ def eval_type(self, wordtype, evaluation: Evaluation, options: dict): evaluation, ) if words is not None: - return ListExpression(*(String(word) for word in words)) + return ListExpression(*(String(word) for word in words), elements_properties=ElementsProperties(False, False, True)) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index aeca510..6066cf0 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -1,6 +1,6 @@ """ -Text normalization +Text Normalization See :WMA: https://reference.wolfram.com/language/guide/TextNormalization.html guide. @@ -19,6 +19,7 @@ from pymathics.natlang.spacy import _cases, _pos_tags, _position, _SpacyBuiltin +sort_order = "Text Normalization" class DeleteStopwords(_SpacyBuiltin): """ @@ -284,7 +285,7 @@ class TextWords(_SpacyBuiltin): def eval( self, text: String, evaluation: Evaluation, options: dict ) -> Optional[ListExpression]: - "TextWords[text_String, OptionsPattern[WordCount]]" + "TextWords[text_String, OptionsPattern[]]" doc = self._nlp(text.value, evaluation, options) if doc: punctuation = spacy.parts_of_speech.PUNCT @@ -293,7 +294,7 @@ def eval( ) def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict): - "TextWords[text_String, n_Integer, OptionsPattern[TextWords]]" + "TextWords[text_String, n_Integer, OptionsPattern[]]" doc = self._nlp(text.value, evaluation, options) if doc: punctuation = spacy.parts_of_speech.PUNCT diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index c1f74d2..622efe6 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- """ -Text analysis functions +Text Analysis :See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html """ @@ -28,9 +28,11 @@ from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries +sort_order = "Text Analysis" + class Containing(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/Containing.html
@@ -46,7 +48,7 @@ class Containing(Builtin): class DictionaryLookup(_WordListBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/DictionaryLookup.html
@@ -111,7 +113,7 @@ def eval_language_n(self, language, word, n, evaluation): class DictionaryWordQ(_WordNetBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/DictionaryWordQ.html
@@ -144,7 +146,7 @@ def eval(self, word, evaluation: Evaluation, options: dict): class SpellingCorrectionList(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/SpellingCorrectionList.html
@@ -204,7 +206,7 @@ def eval( class WordCount(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordCount.html
@@ -228,7 +230,7 @@ def eval(self, text, evaluation: Evaluation, options: dict): class WordFrequency(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordFrequency.html
@@ -287,7 +289,7 @@ def eval( class WordSimilarity(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordSimilarity.html
@@ -381,7 +383,7 @@ def eval_pair(self, text1, i1, text2, i2, evaluation: Evaluation, options: dict) class WordStem(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/WordStem.html
diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index 0da39ad..a08fd03 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -2,7 +2,7 @@ """ -Language translation +Language Translation """ @@ -23,6 +23,7 @@ from mathics.core.symbols import Symbol from mathics.core.systemsymbols import SymbolFailed +sort_order = "Language Translation" class LanguageIdentify(Builtin): """ From 042dfe37bddc9d2b278896663a21bcbe746f948d Mon Sep 17 00:00:00 2001 From: mmatera Date: Tue, 21 Feb 2023 23:31:13 -0300 Subject: [PATCH 06/14] black --- pymathics/natlang/linguistic_data.py | 12 +++++++++--- pymathics/natlang/normalization.py | 1 + pymathics/natlang/textual_analysis.py | 2 +- pymathics/natlang/translation.py | 1 + 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 73d444c..f8210be 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -23,8 +23,8 @@ # from mathics.builtin.codetables import iso639_3 from mathics.builtin.numbers.randomnumbers import RandomEnv from mathics.core.atoms import String -from mathics.core.element import ElementsProperties from mathics.core.convert.expression import Expression, to_expression +from mathics.core.element import ElementsProperties from mathics.core.evaluation import Evaluation from mathics.core.list import ListExpression from mathics.core.symbols import Symbol, SymbolList @@ -342,7 +342,10 @@ def eval(self, evaluation: Evaluation, options: dict): words = self._words(self._language_name(evaluation, options), "All", evaluation) if words is not None: words_mathics = (String(word) for word in words) - result = ListExpression(*words_mathics, elements_properties=ElementsProperties(False, False, True)) + result = ListExpression( + *words_mathics, + elements_properties=ElementsProperties(False, False, True) + ) return result def eval_type(self, wordtype, evaluation: Evaluation, options: dict): @@ -353,4 +356,7 @@ def eval_type(self, wordtype, evaluation: Evaluation, options: dict): evaluation, ) if words is not None: - return ListExpression(*(String(word) for word in words), elements_properties=ElementsProperties(False, False, True)) + return ListExpression( + *(String(word) for word in words), + elements_properties=ElementsProperties(False, False, True) + ) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 6066cf0..d9eea94 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -21,6 +21,7 @@ sort_order = "Text Normalization" + class DeleteStopwords(_SpacyBuiltin): """ Delete :stop words:https://en.wikipedia.org/wiki/Stop_word(\ diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 622efe6..6bcd2f3 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -27,9 +27,9 @@ from pymathics.natlang.spacy import _SpacyBuiltin from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries - sort_order = "Text Analysis" + class Containing(Builtin): """ :WMA link: diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index a08fd03..f5624ef 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -25,6 +25,7 @@ sort_order = "Language Translation" + class LanguageIdentify(Builtin): """ :WMA: From 899d0bfa73e2cd81703b2b5f052f8153602a65c4 Mon Sep 17 
00:00:00 2001 From: mmatera Date: Tue, 21 Feb 2023 23:46:49 -0300 Subject: [PATCH 07/14] test for wordlist --- pymathics/natlang/linguistic_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index f8210be..b6d7fb7 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -331,6 +331,8 @@ class WordList(_WordListBuiltin):
returns a list of common words of type $type$.
+ >> Length[WordList[]] > 10000 + = True >> N[Mean[StringLength /@ WordList["Adjective"]], 2] = 9.3 """ From da506b0d51092c04cdf38abfc51cc5ef99f702e6 Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 07:33:21 -0300 Subject: [PATCH 08/14] easy fixes --- pymathics/natlang/linguistic_data.py | 8 +++++--- pymathics/natlang/normalization.py | 10 +++++----- pymathics/natlang/textual_analysis.py | 8 ++++---- pymathics/natlang/translation.py | 2 +- pymathics/natlang/util.py | 4 ++-- test/test_natlang.py | 5 +++++ 6 files changed, 22 insertions(+), 15 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index b6d7fb7..6332c1a 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -331,13 +331,15 @@ class WordList(_WordListBuiltin):
returns a list of common words of type $type$.
- >> Length[WordList[]] > 10000 - = True + Evaluate the average length over all the words in the dictionary: + >> N[Mean[StringLength /@ WordList[]], 3] + = 11.6 + Now, restricted to adjetives: >> N[Mean[StringLength /@ WordList["Adjective"]], 2] = 9.3 """ - summary_text = "list of common words" + summary_text = "list common words" def eval(self, evaluation: Evaluation, options: dict): "WordList[OptionsPattern[]]" diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index d9eea94..985809b 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -2,7 +2,7 @@ Text Normalization -See :WMA: https://reference.wolfram.com/language/guide/TextNormalization.html guide. +See :WMA link: https://reference.wolfram.com/language/guide/TextNormalization.html guide. This module uses spacy as a backend. @@ -82,7 +82,7 @@ def tokens(): class TextCases(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextCases.html
@@ -126,7 +126,7 @@ def eval_string_form_n( class TextPosition(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextPosition.html
@@ -204,7 +204,7 @@ def eval_n(self, text: String, n: Integer, evaluation: Evaluation, options: dict class TextStructure(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextStructure.html
@@ -266,7 +266,7 @@ def eval(self, text, evaluation: Evaluation, options: dict): class TextWords(_SpacyBuiltin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/TextWords.html
diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 6bcd2f3..dc4df24 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -43,7 +43,7 @@ class Containing(Builtin): """ - summary_text = "Specify a container for matching" + summary_text = "specify a container for matching" class DictionaryLookup(_WordListBuiltin): @@ -243,11 +243,11 @@ class WordFrequency(_SpacyBuiltin): ## Problem with import for certain characters in the text. ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; >> text = "I have a dairy cow, it's not just any cow. \ - She gives me milkshake, oh what a salty cow. She is the best\ - cow in the county."; +She gives me milkshake, oh what a salty cow. She is the best \ +cow in the county."; >> WordFrequency[text, "a" | "the"] - = 0.114286 + = 0.121212 >> WordFrequency["Apple Tree", "apple", IgnoreCase -> True] = 0.5 diff --git a/pymathics/natlang/translation.py b/pymathics/natlang/translation.py index f5624ef..a3aecd1 100644 --- a/pymathics/natlang/translation.py +++ b/pymathics/natlang/translation.py @@ -28,7 +28,7 @@ class LanguageIdentify(Builtin): """ - :WMA: + :WMA link: https://reference.wolfram.com/language/ref/LanguageIdentify.html
diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py index 2c3b33c..c5a223c 100644 --- a/pymathics/natlang/util.py +++ b/pymathics/natlang/util.py @@ -227,8 +227,8 @@ def _words(self, language_name, ilk, evaluation): evaluation.message( self.get_name(), "wordnet", - "type: %s is should be in %s" - % (ilk._wordnet_type_to_pos.keys()), + "type: %s should be in %s" + % (ilk, _wordnet_type_to_pos.keys()), ) return diff --git a/test/test_natlang.py b/test/test_natlang.py index a64b31e..4951f30 100644 --- a/test/test_natlang.py +++ b/test/test_natlang.py @@ -16,6 +16,11 @@ def test_natlang(): "4", "WordCount", ), + ( + 'Length[WordList[]]>10000', + "True", + "WordList", + ), ( 'TextWords["Hickory, dickory, dock! The mouse ran up the clock."]', '{"Hickory", "dickory", "dock", "The", "mouse", "ran", "up", "the", "clock"}', From ef95fa51f4d04f6df1f16cd3c715c7b5b4eee1be Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:04:16 -0300 Subject: [PATCH 09/14] complete doctests --- pymathics/natlang/normalization.py | 4 ++++ pymathics/natlang/textual_analysis.py | 7 +++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 985809b..51738c5 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -279,6 +279,10 @@ class TextWords(_SpacyBuiltin): >> TextWords["Hickory, dickory, dock! The mouse ran up the clock."] = {Hickory, dickory, dock, The, mouse, ran, up, the, clock} + + >> TextWords["Bruder Jakob, Schläfst du noch?", 2] + = {Bruder, Jakob} + """ summary_text = "list the words in a string" diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index dc4df24..b9e5d5d 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -59,8 +59,11 @@ class DictionaryLookup(_WordListBuiltin):
lookup first $n$ words that match the given $word$ or pattern.
- >> DictionaryLookup["bake" ~~ ___, 3] - = {bake, bakeapple, baked} + >> DictionaryLookup["baker" ~~ ___] + = {baker, baker's dozen, baker's eczema, baker's yeast, bakersfield, bakery} + + >> DictionaryLookup["baker" ~~ ___, 3] + = {baker, baker's dozen, baker's eczema} """ summary_text = "Lookup words matching a pattern in a dictionary" From 3ed1adac6e8bb0d953e680bacc460d4214c31691 Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:38:26 -0300 Subject: [PATCH 10/14] fix Containing. Adding test cases --- pymathics/natlang/spacy.py | 17 +++++++---------- pymathics/natlang/textual_analysis.py | 15 ++++++++++++++- test/test_natlang.py | 2 +- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py index 78c38b6..1a8e301 100644 --- a/pymathics/natlang/spacy.py +++ b/pymathics/natlang/spacy.py @@ -16,6 +16,7 @@ from mathics.core.atoms import String from mathics.core.evaluation import Evaluation from mathics.core.symbols import strip_context +from mathics.core.systemsymbols import SymbolAlternatives from spacy.tokens import Span no_doc = True @@ -58,17 +59,14 @@ def _cases(doc, form): if isinstance(form, String): generators = [_forms.get(form.value)] - elif form.get_head_name() == "System`Alternatives": + elif form.get_head() is SymbolAlternatives: if not all(isinstance(f, String) for f in form.elements): return # error generators = [_forms.get(f.value) for f in form.elements] - elif form.get_head_name() == "PyMathics`Containing": - if len(form.elements) == 2: - for t in _containing(doc, *form.elements): - yield t - return - else: - return # error + elif form.has_form("Pymathics`Containing", 2): + for t in _containing(doc, *form.elements): + yield t + return else: return # error @@ -79,7 +77,7 @@ def try_next(iterator): return None feeds = [] - for i, iterator in enumerate([iter(generator(doc)) for generator in generators]): + for i, iterator in enumerate([iter(generator(doc)) for generator in generators if generator]): t = try_next(iterator) if t: feeds.append((_position(t), i, t, iterator)) @@ -169,7 +167,6 @@ def generator(doc): # forms are everything one can use in TextCases[] or TextPosition[]. _forms = _make_forms() - def _position(t): if isinstance(t, Span): i = t.doc[t.start] diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index b9e5d5d..6da5bd2 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -40,9 +40,22 @@ class Containing(Builtin):
represents an object of the type outer containing objects\ of type inner.
+ 'Containing' can be used as the second parameter in 'TextCases' and 'TextPosition'. + + Supported $outer$ strings are in {"Word", "Sentence", "Paragraph", "Line", "URL", "EmailAddress"}. - """ + Supported $inner$ strings are in {"Person", "Company", "Quantity", "Number", "CurrencyAmount", + "Country", "City"}. + + The implementation of this symbol is based on `spacy`. + + >> TextCases["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]] + = {This is another pencil from England.} + >> TextPosition["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]] + = {{19, 54}} + """ + # This is implemented in ``pymathics.natlang.spacy._containing`` summary_text = "specify a container for matching" diff --git a/test/test_natlang.py b/test/test_natlang.py index 4951f30..e2adc86 100644 --- a/test/test_natlang.py +++ b/test/test_natlang.py @@ -17,7 +17,7 @@ def test_natlang(): "WordCount", ), ( - 'Length[WordList[]]>10000', + "Length[WordList[]]>10000", "True", "WordList", ), From c01cb22daca4cf7b32343335a19dc2b2a97d6189 Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:39:32 -0300 Subject: [PATCH 11/14] black --- pymathics/natlang/spacy.py | 5 ++++- pymathics/natlang/textual_analysis.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pymathics/natlang/spacy.py b/pymathics/natlang/spacy.py index 1a8e301..b87b814 100644 --- a/pymathics/natlang/spacy.py +++ b/pymathics/natlang/spacy.py @@ -77,7 +77,9 @@ def try_next(iterator): return None feeds = [] - for i, iterator in enumerate([iter(generator(doc)) for generator in generators if generator]): + for i, iterator in enumerate( + [iter(generator(doc)) for generator in generators if generator] + ): t = try_next(iterator) if t: feeds.append((_position(t), i, t, iterator)) @@ -167,6 +169,7 @@ def generator(doc): # forms are everything one can use in TextCases[] or TextPosition[]. 
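# As an illustrative sketch (assuming some spacy model such as
# "en_core_web_md" is installed; the model name is only an example), the
# "Country"-style generators above reduce to spacy's standard entity API:
#
#     import spacy
#     nlp = spacy.load("en_core_web_md")
#     doc = nlp("This is another pencil from England.")
#     [ent.text for ent in doc.ents if ent.label_ == "GPE"]  # -> ["England"]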
_forms = _make_forms() + def _position(t): if isinstance(t, Span): i = t.doc[t.start] diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 6da5bd2..0b437ad 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -55,6 +55,7 @@ class Containing(Builtin): = {{19, 54}} """ + # This is implemented in ``pymathics.natlang.spacy._containing`` summary_text = "specify a container for matching" From 37f1b91a02c748a54858829315a10972ab6288dd Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 08:45:22 -0300 Subject: [PATCH 12/14] trailing typos --- pymathics/natlang/normalization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index 51738c5..f4cf07d 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -47,7 +47,7 @@ class DeleteStopwords(_SpacyBuiltin): = Old Man Apulia, conduct peculiar """ - summary_text = "Remove stopwords from a text" + summary_text = "remove stopwords from a text" def eval_list(self, li, evaluation: Evaluation, options: dict) -> ListExpression: "DeleteStopwords[li_List, OptionsPattern[DeleteStopwords]]" @@ -104,7 +104,7 @@ class TextCases(_SpacyBuiltin): """ - summary_text = "list the cases of words of a certain form in a text" + summary_text = "list cases of words of a certain form in a text" def eval_string_form( self, text: String, form, evaluation: Evaluation, options: dict @@ -138,7 +138,7 @@ class TextPosition(_SpacyBuiltin): = {{1, 9}, {15, 20}} """ - summary_text = "list the position of words of a given form in a text" + summary_text = "list the positions of words of a given form in a text" def eval_text_form(self, text: String, form, evaluation: Evaluation, options: dict): "TextPosition[text_String, form_, OptionsPattern[TextPosition]]" From b249fe1bb4aa9ede63ad48e028ad849548bbcabb Mon Sep 17 00:00:00 2001 From: mmatera Date: Wed, 22 Feb 2023 09:24:58 -0300 Subject: [PATCH 13/14] split nltk from utils --- pymathics/natlang/__init__.py | 6 +- pymathics/natlang/linguistic_data.py | 118 ++++++++-- pymathics/natlang/manipulate.py | 36 +++ pymathics/natlang/nltk.py | 322 ++++++++++++++++++++++++++ pymathics/natlang/textual_analysis.py | 113 +-------- pymathics/natlang/util.py | 317 ------------------------- 6 files changed, 463 insertions(+), 449 deletions(-) create mode 100644 pymathics/natlang/manipulate.py create mode 100644 pymathics/natlang/nltk.py diff --git a/pymathics/natlang/__init__.py b/pymathics/natlang/__init__.py index c0d76ae..7b0498c 100644 --- a/pymathics/natlang/__init__.py +++ b/pymathics/natlang/__init__.py @@ -39,12 +39,14 @@ """ from pymathics.natlang.linguistic_data import ( - Pluralize, + DictionaryLookup, + DictionaryWordQ, RandomWord, WordData, WordDefinition, WordList, ) +from pymathics.natlang.manipulate import Pluralize from pymathics.natlang.normalization import ( DeleteStopwords, TextCases, @@ -55,8 +57,6 @@ ) from pymathics.natlang.textual_analysis import ( Containing, - DictionaryLookup, - DictionaryWordQ, SpellingCorrectionList, WordCount, WordFrequency, diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 6332c1a..033c179 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -5,7 +5,7 @@ See :WMA link:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ -# This module uses both nltk and spacy. 
Maybe it makes sense to split this further. +# This module uses nltk. # TODO: Complete me @@ -16,29 +16,29 @@ # PartOfSpeech — possible parts of speech for a word +import re +from itertools import islice from typing import Optional -from mathics.builtin.base import Builtin, MessageException - -# from mathics.builtin.codetables import iso639_3 +from mathics.builtin.atomic.strings import anchor_pattern, to_regex +from mathics.builtin.base import MessageException from mathics.builtin.numbers.randomnumbers import RandomEnv from mathics.core.atoms import String from mathics.core.convert.expression import Expression, to_expression from mathics.core.element import ElementsProperties from mathics.core.evaluation import Evaluation from mathics.core.list import ListExpression -from mathics.core.symbols import Symbol, SymbolList +from mathics.core.symbols import Symbol, SymbolFalse, SymbolList, SymbolTrue from mathics.core.systemsymbols import SymbolMissing, SymbolRule, SymbolStringExpression -from pattern.en import pluralize -from pymathics.natlang.textual_analysis import WordStem -from pymathics.natlang.util import ( +from pymathics.natlang.nltk import ( WordProperty, _WordListBuiltin, _wordnet_pos_to_type, _WordNetBuiltin, - merge_dictionaries, ) +from pymathics.natlang.textual_analysis import WordStem +from pymathics.natlang.util import merge_dictionaries sort_order = "Linguistic Data" @@ -46,27 +46,105 @@ StringNotAvailable = String("NotAvailable") -class Pluralize(Builtin): +class DictionaryLookup(_WordListBuiltin): """ :WMA link: - https://reference.wolfram.com/language/ref/Pluralize.html + https://reference.wolfram.com/language/ref/DictionaryLookup.html
-
'Pluralize[$word$]' -
returns the plural form of $word$. +
'DictionaryLookup[$word$]' +
looks up words that match the given $word$ or pattern.
+
+
'DictionaryLookup[$word$, $n$]' +
looks up the first $n$ words that match the given $word$ or pattern. 
- >> Pluralize["potato"] - = potatoes + >> DictionaryLookup["baker" ~~ ___] + = {baker, baker's dozen, baker's eczema, baker's yeast, bakersfield, bakery} + + >> DictionaryLookup["baker" ~~ ___, 3] + = {baker, baker's dozen, baker's eczema} """ - requires = ("pattern",) - summary_text = "retrieve the pluralized form of a word" + summary_text = "Lookup words matching a pattern in a dictionary" + + def compile(self, pattern, evaluation): + re_patt = to_regex(pattern, evaluation) + if re_patt is None: + evaluation.message( + "StringExpression", + "invld", + pattern, + Expression(SymbolStringExpression, pattern), + ) + return + re_patt = anchor_pattern(re_patt) + + return re.compile(re_patt, flags=re.IGNORECASE) + + def search(self, dictionary_words, pattern): + for dictionary_word in dictionary_words: + if pattern.match(dictionary_word): + yield dictionary_word.replace("_", " ") + + def lookup(self, language_name, word, n, evaluation): + pattern = self.compile(word, evaluation) + if pattern: + dictionary_words = self._words(language_name, "All", evaluation) + if dictionary_words is not None: + matches = self.search(dictionary_words, pattern) + if n is not None: + matches = islice(matches, 0, n) + return ListExpression(*(String(match) for match in sorted(matches))) + + def eval_english(self, word, evaluation): + "DictionaryLookup[word_]" + return self.lookup(String("English"), word, None, evaluation) - def eval(self, word, evaluation): - "Pluralize[word_String]" + def eval_language(self, language, word, evaluation): + "DictionaryLookup[{language_String, word_}]" + return self.lookup(language, word, None, evaluation) - return String(pluralize(word.value)) + def eval_english_n(self, word, n, evaluation): + "DictionaryLookup[word_, n_Integer]" + return self.lookup(String("English"), word, n.value, evaluation) + + def eval_language_n(self, language, word, n, evaluation): + "DictionaryLookup[{language_String, word_}, n_Integer]" + return self.lookup(language, word, n.value, evaluation) + + +class DictionaryWordQ(_WordNetBuiltin): + """ + :WMA link: + https://reference.wolfram.com/language/ref/DictionaryWordQ.html + +
+
'DictionaryWordQ[$word$]' +
returns 'True' if $word$ is a word usually found in dictionaries, and 'False' otherwise.
+
+
+    >> DictionaryWordQ["couch"]
+    = True
+
+    >> DictionaryWordQ["meep-meep"]
+    = False
+    """
+
+    summary_text = "Check if a word is in the dictionary"
+
+    def eval(self, word, evaluation: Evaluation, options: dict):
+        "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]"
+        if not isinstance(word, String):
+            return None  # not a String: leave the expression unevaluated
+        wordnet, language_code = self._load_wordnet(
+            evaluation, self._language_name(evaluation, options)
+        )
+        if wordnet:
+            if list(wordnet.synsets(word.value.lower(), None, language_code)):
+                return SymbolTrue
+            else:
+                return SymbolFalse


 class RandomWord(_WordListBuiltin):
diff --git a/pymathics/natlang/manipulate.py b/pymathics/natlang/manipulate.py
new file mode 100644
index 0000000..7d1f7e9
--- /dev/null
+++ b/pymathics/natlang/manipulate.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+"""
+Word manipulation
+
+This module uses pattern.en to change the form of a word.
+
+"""
+from mathics.builtin.base import Builtin
+from mathics.core.atoms import String
+from mathics.core.evaluation import Evaluation
+from pattern.en import pluralize
+
+sort_order = "Word manipulation"
+
+
+class Pluralize(Builtin):
+    """
+    :WMA link:
+    https://reference.wolfram.com/language/ref/Pluralize.html
+
+
+
'Pluralize[$word$]' +
returns the plural form of $word$. +
+ + >> Pluralize["potato"] + = potatoes + """ + + requires = ("pattern",) + summary_text = "retrieve the pluralized form of a word" + + def eval(self, word: String, evaluation: Evaluation) -> String: + "Pluralize[word_String]" + + return String(pluralize(word.value)) diff --git a/pymathics/natlang/nltk.py b/pymathics/natlang/nltk.py new file mode 100644 index 0000000..ff04fc2 --- /dev/null +++ b/pymathics/natlang/nltk.py @@ -0,0 +1,322 @@ +# -*- coding: utf-8 -*- + +""" +nltk backend +""" +import re +from itertools import chain + +import nltk +from mathics.builtin.base import Builtin, MessageException +from mathics.builtin.codetables import iso639_3 +from mathics.core.atoms import String +from mathics.core.evaluation import Evaluation +from mathics.core.symbols import strip_context + +no_doc = True + + +_wordnet_pos_to_type = {} +_wordnet_type_to_pos = {} + + +def _init_nltk_maps(): + _wordnet_pos_to_type.update( + { + nltk.corpus.wordnet.VERB: "Verb", + nltk.corpus.wordnet.NOUN: "Noun", + nltk.corpus.wordnet.ADJ: "Adjective", + nltk.corpus.wordnet.ADJ_SAT: "Adjective", + nltk.corpus.wordnet.ADV: "Adverb", + } + ) + _wordnet_type_to_pos.update( + { + "Verb": [nltk.corpus.wordnet.VERB], + "Noun": [nltk.corpus.wordnet.NOUN], + "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], + "Adverb": [nltk.corpus.wordnet.ADV], + } + ) + + +def _parse_nltk_lookup_error(e): + m = re.search(r"Resource '([^']+)' not found\.", str(e)) + if m: + return m.group(1) + else: + return "unknown" + + +class _WordNetBuiltin(Builtin): + requires = ("nltk",) + + options = { + "Language": '"English"', + } + + messages = { + "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", + "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', + # 'load': 'Loading `1` word data. 
Please wait.', + "wordnet": "WordNet returned the following error: ``", + } + + _wordnet_instances = {} + + def _language_name(self, evaluation: Evaluation, options: dict): + return self.get_option(options, "Language", evaluation) + + def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): + try: + wordnet_resource = nltk.data.find("corpora/wordnet2022") + _init_nltk_maps() + except LookupError: + evaluation.message(self.get_name(), "package", "wordnet2022") + return None + + try: + omw = nltk.corpus.util.LazyCorpusLoader( + "omw", + nltk.corpus.reader.CorpusReader, + r".*/wn-data-.*\.tab", + encoding="utf8", + ) + except LookupError: + evaluation.message(self.get_name(), "package", "omw") + return None + + wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) + + if language_code not in wordnet.langs(): + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None + + return wordnet + + def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: + language_code = None + if isinstance(language_name, String): + language_code = iso639_3.get(language_name.value) + if not language_code: + evaluation.message( + self.get_name(), "lang", language_name, strip_context(self.get_name()) + ) + return None, None + + wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) + if not wordnet: + try: + wordnet = self._init_wordnet(evaluation, language_name, language_code) + except LookupError as e: + evaluation.message( + self.get_name(), "package", _parse_nltk_lookup_error(e) + ) + return None, None + + _WordNetBuiltin._wordnet_instances[language_code] = wordnet + + return wordnet, language_code + + @staticmethod + def _decode_synset(syn): + what, pos, nr = (syn.name().split(".") + ["01"])[:3] + return what.replace("_", " "), pos, nr + + @staticmethod + def _capitalize(s) -> str: + return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) + + @staticmethod + def _underscore(s) -> str: + return re.sub( + r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s + ).lower() + + @staticmethod + def _list_syn_form(syn): + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + + def containers(): + for name in syn.lemma_names(): + if name != what: + yield name + + for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): + container, _, _ = _WordNetBuiltin._decode_synset(s) + yield container + + for lemma in WordProperty._synonymous_lemmas(syn): + yield lemma.name() + + return what, _wordnet_pos_to_type[pos], containers + + @staticmethod + def syn(syn, wordnet, language_code) -> tuple: + what, pos, nr = _WordNetBuiltin._decode_synset(syn) + for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): + if s == syn: + return form + return what, pos, "Unknown" + + @staticmethod + def _iterate_senses(word, wordnet, language_code): + if not word: + return + + used = set() + output_word = word.replace("_", " ") + + for syn in wordnet.synsets(word, None, language_code): + if syn.lexname() in ("noun.location", "noun.person"): + continue # ignore + + what, pos, containers = _WordNetBuiltin._list_syn_form(syn) + + for container in containers(): + container = container.replace("_", " ") + if container != word: + if container not in used: + used.add(container) + yield syn, ( + output_word, + pos, + _WordNetBuiltin._capitalize(container), + ) + break + + def _senses(self, word, wordnet, language_code): + if isinstance(word, tuple): # find forms 
like ["tree", "Noun", "WoodyPlant"] + for syn, form in _WordNetBuiltin._iterate_senses( + word[0], wordnet, language_code + ): + if form == word: + return [[syn, form]] + else: # find word given as strings, e.g. "tree" + word = wordnet.morphy(word) # base form, e.g. trees -> tree + return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) + + +class _WordListBuiltin(_WordNetBuiltin): + _dictionary = {} + + def _words(self, language_name, ilk, evaluation): + wordnet, language_code = self._load_wordnet(evaluation, language_name) + + if not wordnet: + return + + key = "%s.%s" % (language_code, ilk) + words = self._dictionary.get(key) + if not words: + try: + if ilk == "All": + filtered_pos = [None] + else: + try: + filtered_pos = _wordnet_type_to_pos[ilk] + except KeyError: + evaluation.message( + self.get_name(), + "wordnet", + "type: %s should be in %s" + % (ilk, _wordnet_type_to_pos.keys()), + ) + return + + words = [] + for pos in filtered_pos: + words.extend(list(wordnet.all_lemma_names(pos, language_code))) + words.sort() + self._dictionary[key] = words + except nltk.corpus.reader.wordnet.WordNetError as err: + evaluation.message(self.get_name(), "wordnet", str(err)) + return + + return words + + +class WordProperty: + def __init__(self, syn_form, wordnet, language_code): + self.syn_form = syn_form + self.wordnet = wordnet + self.language_code = language_code + + def syn(self, syn): + return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) + + @staticmethod + def _synonymous_lemmas(syn): + first_lemma = syn.name().split(".")[0] + return (s for s in syn.lemmas() if s.name() != first_lemma) + + @staticmethod + def _antonymous_lemmas(syn): + return (s for lemma in syn.lemmas() for s in lemma.antonyms()) + + def definitions(self, syn, desc): + return syn.definition() + + def examples(self, syn, desc): + return syn.examples() + + def synonyms(self, syn, desc): + _, pos, container = desc + return [ + self.syn_form((s.name().replace("_", " "), pos, container)) + for s in WordProperty._synonymous_lemmas(syn) + ] + + def antonyms(self, syn, desc): + return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] + + def broader_terms(self, syn, desc): + return [self.syn(s) for s in syn.hypernyms()] + + def narrower_terms(self, syn, desc): + return [self.syn(s) for s in syn.hyponyms()] + + def usage_field(self, syn, desc): + return syn.usage_domains() + + def whole_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_holonyms()] + + def part_terms(self, syn, desc): + return [self.syn(s) for s in syn.part_meronyms()] + + def material_terms(self, syn, desc): + return [self.syn(s) for s in syn.substance_meronyms()] + + def word_net_id(self, syn, desc): + return syn.offset() + + def entailed_terms(self, syn, desc): # e.g. fall to condense + return [self.syn(s) for s in syn.entailments()] + + def causes_terms(self, syn, desc): # e.g. 
ignite to burn + return [self.syn(s) for s in syn.causes()] + + def inflected_forms(self, syn, desc): + try: + word, pos, _ = desc + if pos == "Verb": + from pattern.en import lexeme + + return [w for w in reversed(lexeme(word)) if w != word] + elif pos == "Noun": + from pattern.en import pluralize + + return [pluralize(word)] + elif pos == "Adjective": + from pattern.en import comparative, superlative + + return [comparative(word), superlative(word)] + else: + return [] + except ImportError: + raise MessageException( + "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" + ) diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index 0b437ad..de927e0 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -7,25 +7,21 @@ # This module uses both enchant, nltk and spacy. Maybe we want to split this further. -import re -from itertools import islice from typing import Optional import enchant import nltk import spacy -from mathics.builtin.atomic.strings import anchor_pattern, to_regex from mathics.builtin.base import Builtin from mathics.core.atoms import Integer, Real, String from mathics.core.evaluation import Evaluation from mathics.core.expression import Expression from mathics.core.list import ListExpression -from mathics.core.symbols import SymbolFalse, SymbolList, SymbolTrue -from mathics.core.systemsymbols import SymbolStringExpression +from mathics.core.symbols import SymbolList, SymbolTrue from mathics.eval.nevaluator import eval_N from pymathics.natlang.spacy import _SpacyBuiltin -from pymathics.natlang.util import _WordListBuiltin, _WordNetBuiltin, merge_dictionaries +from pymathics.natlang.util import merge_dictionaries sort_order = "Text Analysis" @@ -41,12 +37,12 @@ class Containing(Builtin): of type inner.
'Containing' can be used as the second parameter in 'TextCases' and 'TextPosition'. - + Supported $outer$ strings are in {"Word", "Sentence", "Paragraph", "Line", "URL", "EmailAddress"}. Supported $inner$ strings are in {"Person", "Company", "Quantity", "Number", "CurrencyAmount", "Country", "City"}. - + The implementation of this symbol is based on `spacy`. >> TextCases["This is a pencil. This is another pencil from England.", Containing["Sentence", "Country"]] @@ -60,107 +56,6 @@ class Containing(Builtin): summary_text = "specify a container for matching" -class DictionaryLookup(_WordListBuiltin): - """ - :WMA link: - https://reference.wolfram.com/language/ref/DictionaryLookup.html - -
-
'DictionaryLookup[$word$]' -
lookup words that match the given $word$ or pattern. - -
'DictionaryLookup[$word$, $n$]' -
lookup first $n$ words that match the given $word$ or pattern. -
- - >> DictionaryLookup["baker" ~~ ___] - = {baker, baker's dozen, baker's eczema, baker's yeast, bakersfield, bakery} - - >> DictionaryLookup["baker" ~~ ___, 3] - = {baker, baker's dozen, baker's eczema} - """ - - summary_text = "Lookup words matching a pattern in a dictionary" - - def compile(self, pattern, evaluation): - re_patt = to_regex(pattern, evaluation) - if re_patt is None: - evaluation.message( - "StringExpression", - "invld", - pattern, - Expression(SymbolStringExpression, pattern), - ) - return - re_patt = anchor_pattern(re_patt) - - return re.compile(re_patt, flags=re.IGNORECASE) - - def search(self, dictionary_words, pattern): - for dictionary_word in dictionary_words: - if pattern.match(dictionary_word): - yield dictionary_word.replace("_", " ") - - def lookup(self, language_name, word, n, evaluation): - pattern = self.compile(word, evaluation) - if pattern: - dictionary_words = self._words(language_name, "All", evaluation) - if dictionary_words is not None: - matches = self.search(dictionary_words, pattern) - if n is not None: - matches = islice(matches, 0, n) - return ListExpression(*(String(match) for match in sorted(matches))) - - def eval_english(self, word, evaluation): - "DictionaryLookup[word_]" - return self.lookup(String("English"), word, None, evaluation) - - def eval_language(self, language, word, evaluation): - "DictionaryLookup[{language_String, word_}]" - return self.lookup(language, word, None, evaluation) - - def eval_english_n(self, word, n, evaluation): - "DictionaryLookup[word_, n_Integer]" - return self.lookup(String("English"), word, n.value, evaluation) - - def eval_language_n(self, language, word, n, evaluation): - "DictionaryLookup[{language_String, word_}, n_Integer]" - return self.lookup(language, word, n.value, evaluation) - - -class DictionaryWordQ(_WordNetBuiltin): - """ - :WMA link: - https://reference.wolfram.com/language/ref/DictionaryWordQ.html - -
-
'DictionaryWordQ[$word$]' -
returns True if $word$ is a word usually found in dictionaries, and False otherwise. -
- - >> DictionaryWordQ["couch"] - = True - - >> DictionaryWordQ["meep-meep"] - = False - """ - - summary_text = "Check if a word is in the dictionary" - - def eval(self, word, evaluation: Evaluation, options: dict): - "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" - if not isinstance(word, String): - return False - wordnet, language_code = self._load_wordnet( - evaluation, self._language_name(evaluation, options) - ) - if wordnet: - if list(wordnet.synsets(word.value.lower(), None, language_code)): - return SymbolTrue - else: - return SymbolFalse - - class SpellingCorrectionList(Builtin): """ :WMA link: diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py index c5a223c..4b80076 100644 --- a/pymathics/natlang/util.py +++ b/pymathics/natlang/util.py @@ -3,326 +3,9 @@ """ utils """ -import re -from itertools import chain - -import nltk -from mathics.builtin.base import Builtin, MessageException -from mathics.builtin.codetables import iso639_3 -from mathics.core.atoms import String -from mathics.core.evaluation import Evaluation -from mathics.core.symbols import strip_context - -no_doc = True - - -_wordnet_pos_to_type = {} -_wordnet_type_to_pos = {} - - -def _init_nltk_maps(): - _wordnet_pos_to_type.update( - { - nltk.corpus.wordnet.VERB: "Verb", - nltk.corpus.wordnet.NOUN: "Noun", - nltk.corpus.wordnet.ADJ: "Adjective", - nltk.corpus.wordnet.ADJ_SAT: "Adjective", - nltk.corpus.wordnet.ADV: "Adverb", - } - ) - _wordnet_type_to_pos.update( - { - "Verb": [nltk.corpus.wordnet.VERB], - "Noun": [nltk.corpus.wordnet.NOUN], - "Adjective": [nltk.corpus.wordnet.ADJ, nltk.corpus.wordnet.ADJ_SAT], - "Adverb": [nltk.corpus.wordnet.ADV], - } - ) - - -def _parse_nltk_lookup_error(e): - m = re.search(r"Resource '([^']+)' not found\.", str(e)) - if m: - return m.group(1) - else: - return "unknown" def merge_dictionaries(a, b): c = a.copy() c.update(b) return c - - -class _WordNetBuiltin(Builtin): - requires = ("nltk",) - - options = { - "Language": '"English"', - } - - messages = { - "package": "NLTK's `` corpus is not installed. Please install it using nltk.download().", - "lang": 'Language "`1`" is currently not supported with `2`[]. Please install it manually.', - # 'load': 'Loading `1` word data. 
Please wait.', - "wordnet": "WordNet returned the following error: ``", - } - - _wordnet_instances = {} - - def _language_name(self, evaluation: Evaluation, options: dict): - return self.get_option(options, "Language", evaluation) - - def _init_wordnet(self, evaluation: Evaluation, language_name, language_code): - try: - wordnet_resource = nltk.data.find("corpora/wordnet2022") - _init_nltk_maps() - except LookupError: - evaluation.message(self.get_name(), "package", "wordnet2022") - return None - - try: - omw = nltk.corpus.util.LazyCorpusLoader( - "omw", - nltk.corpus.reader.CorpusReader, - r".*/wn-data-.*\.tab", - encoding="utf8", - ) - except LookupError: - evaluation.message(self.get_name(), "package", "omw") - return None - - wordnet = nltk.corpus.reader.wordnet.WordNetCorpusReader(wordnet_resource, omw) - - if language_code not in wordnet.langs(): - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None - - return wordnet - - def _load_wordnet(self, evaluation: Evaluation, language_name) -> tuple: - language_code = None - if isinstance(language_name, String): - language_code = iso639_3.get(language_name.value) - if not language_code: - evaluation.message( - self.get_name(), "lang", language_name, strip_context(self.get_name()) - ) - return None, None - - wordnet = _WordNetBuiltin._wordnet_instances.get(language_code) - if not wordnet: - try: - wordnet = self._init_wordnet(evaluation, language_name, language_code) - except LookupError as e: - evaluation.message( - self.get_name(), "package", _parse_nltk_lookup_error(e) - ) - return None, None - - _WordNetBuiltin._wordnet_instances[language_code] = wordnet - - return wordnet, language_code - - @staticmethod - def _decode_synset(syn): - what, pos, nr = (syn.name().split(".") + ["01"])[:3] - return what.replace("_", " "), pos, nr - - @staticmethod - def _capitalize(s) -> str: - return re.sub(r"^[a-z]|\s[a-z]", lambda m: m.group(0).upper().lstrip(" "), s) - - @staticmethod - def _underscore(s) -> str: - return re.sub( - r"[a-z][A-Z]", lambda m: m.group(0)[0] + "_" + m.group(0)[1].lower(), s - ).lower() - - @staticmethod - def _list_syn_form(syn): - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - - def containers(): - for name in syn.lemma_names(): - if name != what: - yield name - - for s in chain(syn.hypernyms(), syn.hyponyms(), syn.similar_tos()): - container, _, _ = _WordNetBuiltin._decode_synset(s) - yield container - - for lemma in WordProperty._synonymous_lemmas(syn): - yield lemma.name() - - return what, _wordnet_pos_to_type[pos], containers - - @staticmethod - def syn(syn, wordnet, language_code) -> tuple: - what, pos, nr = _WordNetBuiltin._decode_synset(syn) - for s, form in _WordNetBuiltin._iterate_senses(what, wordnet, language_code): - if s == syn: - return form - return what, pos, "Unknown" - - @staticmethod - def _iterate_senses(word, wordnet, language_code): - if not word: - return - - used = set() - output_word = word.replace("_", " ") - - for syn in wordnet.synsets(word, None, language_code): - if syn.lexname() in ("noun.location", "noun.person"): - continue # ignore - - what, pos, containers = _WordNetBuiltin._list_syn_form(syn) - - for container in containers(): - container = container.replace("_", " ") - if container != word: - if container not in used: - used.add(container) - yield syn, ( - output_word, - pos, - _WordNetBuiltin._capitalize(container), - ) - break - - def _senses(self, word, wordnet, language_code): - if isinstance(word, tuple): # find forms 
like ["tree", "Noun", "WoodyPlant"] - for syn, form in _WordNetBuiltin._iterate_senses( - word[0], wordnet, language_code - ): - if form == word: - return [[syn, form]] - else: # find word given as strings, e.g. "tree" - word = wordnet.morphy(word) # base form, e.g. trees -> tree - return list(_WordNetBuiltin._iterate_senses(word, wordnet, language_code)) - - -class _WordListBuiltin(_WordNetBuiltin): - _dictionary = {} - - def _words(self, language_name, ilk, evaluation): - wordnet, language_code = self._load_wordnet(evaluation, language_name) - - if not wordnet: - return - - key = "%s.%s" % (language_code, ilk) - words = self._dictionary.get(key) - if not words: - try: - if ilk == "All": - filtered_pos = [None] - else: - try: - filtered_pos = _wordnet_type_to_pos[ilk] - except KeyError: - evaluation.message( - self.get_name(), - "wordnet", - "type: %s should be in %s" - % (ilk, _wordnet_type_to_pos.keys()), - ) - return - - words = [] - for pos in filtered_pos: - words.extend(list(wordnet.all_lemma_names(pos, language_code))) - words.sort() - self._dictionary[key] = words - except nltk.corpus.reader.wordnet.WordNetError as err: - evaluation.message(self.get_name(), "wordnet", str(err)) - return - - return words - - -class WordProperty: - def __init__(self, syn_form, wordnet, language_code): - self.syn_form = syn_form - self.wordnet = wordnet - self.language_code = language_code - - def syn(self, syn): - return self.syn_form(_WordNetBuiltin.syn(syn, self.wordnet, self.language_code)) - - @staticmethod - def _synonymous_lemmas(syn): - first_lemma = syn.name().split(".")[0] - return (s for s in syn.lemmas() if s.name() != first_lemma) - - @staticmethod - def _antonymous_lemmas(syn): - return (s for lemma in syn.lemmas() for s in lemma.antonyms()) - - def definitions(self, syn, desc): - return syn.definition() - - def examples(self, syn, desc): - return syn.examples() - - def synonyms(self, syn, desc): - _, pos, container = desc - return [ - self.syn_form((s.name().replace("_", " "), pos, container)) - for s in WordProperty._synonymous_lemmas(syn) - ] - - def antonyms(self, syn, desc): - return [self.syn(s.synset()) for s in WordProperty._antonymous_lemmas(syn)] - - def broader_terms(self, syn, desc): - return [self.syn(s) for s in syn.hypernyms()] - - def narrower_terms(self, syn, desc): - return [self.syn(s) for s in syn.hyponyms()] - - def usage_field(self, syn, desc): - return syn.usage_domains() - - def whole_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_holonyms()] - - def part_terms(self, syn, desc): - return [self.syn(s) for s in syn.part_meronyms()] - - def material_terms(self, syn, desc): - return [self.syn(s) for s in syn.substance_meronyms()] - - def word_net_id(self, syn, desc): - return syn.offset() - - def entailed_terms(self, syn, desc): # e.g. fall to condense - return [self.syn(s) for s in syn.entailments()] - - def causes_terms(self, syn, desc): # e.g. 
ignite to burn - return [self.syn(s) for s in syn.causes()] - - def inflected_forms(self, syn, desc): - try: - word, pos, _ = desc - if pos == "Verb": - from pattern.en import lexeme - - return [w for w in reversed(lexeme(word)) if w != word] - elif pos == "Noun": - from pattern.en import pluralize - - return [pluralize(word)] - elif pos == "Adjective": - from pattern.en import comparative, superlative - - return [comparative(word), superlative(word)] - else: - return [] - except ImportError: - raise MessageException( - "General", "unavailable", 'WordData[_, "InflectedForms"]', "pattern" - ) From f3f8c413c983ce05a03f132fb6d703043938f818 Mon Sep 17 00:00:00 2001 From: "R. Bernstein" Date: Thu, 23 Feb 2023 17:50:45 -0500 Subject: [PATCH 14/14] Some small tweaks and conformance things (#15) --- pymathics/natlang/linguistic_data.py | 10 +++++----- pymathics/natlang/nltk.py | 1 + pymathics/natlang/normalization.py | 4 +++- pymathics/natlang/textual_analysis.py | 13 ++++++------- pymathics/natlang/util.py | 3 +++ 5 files changed, 18 insertions(+), 13 deletions(-) diff --git a/pymathics/natlang/linguistic_data.py b/pymathics/natlang/linguistic_data.py index 033c179..db62987 100644 --- a/pymathics/natlang/linguistic_data.py +++ b/pymathics/natlang/linguistic_data.py @@ -2,7 +2,7 @@ """ Linguistic Data -See :WMA link:https://reference.wolfram.com/language/guide/LinguisticData.html guide. +See the corresponding :WMA:https://reference.wolfram.com/language/guide/LinguisticData.html guide. """ # This module uses nltk. @@ -66,7 +66,7 @@ class DictionaryLookup(_WordListBuiltin): = {baker, baker's dozen, baker's eczema} """ - summary_text = "Lookup words matching a pattern in a dictionary" + summary_text = "lookup words matching a pattern in our word dictionary" def compile(self, pattern, evaluation): re_patt = to_regex(pattern, evaluation) @@ -131,7 +131,7 @@ class DictionaryWordQ(_WordNetBuiltin): = False """ - summary_text = "Check if a word is in the dictionary" + summary_text = "check if a word is in our word dictionary" def eval(self, word, evaluation: Evaluation, options: dict): "DictionaryWordQ[word_String, OptionsPattern[DictionaryWordQ]]" @@ -170,7 +170,7 @@ class RandomWord(_WordListBuiltin):
""" - summary_text = "generate a random word of a given kind" + summary_text = "generate a random word" def _random_words(self, type, n, evaluation: Evaluation, options: dict): words = self._words(self._language_name(evaluation, options), type, evaluation) @@ -412,7 +412,7 @@ class WordList(_WordListBuiltin): Evaluate the average length over all the words in the dictionary: >> N[Mean[StringLength /@ WordList[]], 3] = 11.6 - Now, restricted to adjetives: + Now, restricted to adjectives: >> N[Mean[StringLength /@ WordList["Adjective"]], 2] = 9.3 """ diff --git a/pymathics/natlang/nltk.py b/pymathics/natlang/nltk.py index ff04fc2..919f75c 100644 --- a/pymathics/natlang/nltk.py +++ b/pymathics/natlang/nltk.py @@ -13,6 +13,7 @@ from mathics.core.evaluation import Evaluation from mathics.core.symbols import strip_context +# Don't consider this for user documentation no_doc = True diff --git a/pymathics/natlang/normalization.py b/pymathics/natlang/normalization.py index f4cf07d..baf309c 100644 --- a/pymathics/natlang/normalization.py +++ b/pymathics/natlang/normalization.py @@ -2,7 +2,9 @@ Text Normalization -See :WMA link: https://reference.wolfram.com/language/guide/TextNormalization.html guide. +See the corresponding +:WMA: +https://reference.wolfram.com/language/guide/TextNormalization.html guide. This module uses spacy as a backend. diff --git a/pymathics/natlang/textual_analysis.py b/pymathics/natlang/textual_analysis.py index de927e0..01300d2 100644 --- a/pymathics/natlang/textual_analysis.py +++ b/pymathics/natlang/textual_analysis.py @@ -2,7 +2,8 @@ """ Text Analysis -:See WMA guide:https://reference.wolfram.com/language/guide/TextAnalysis.html +See the corresponding :WMA: +https://reference.wolfram.com/language/guide/TextAnalysis.html guide. """ # This module uses both enchant, nltk and spacy. Maybe we want to split this further. @@ -88,7 +89,7 @@ class SpellingCorrectionList(Builtin): _dictionaries = {} - summary_text = "Look for spelling correction candidates of a word" + summary_text = "look for spelling correction candidates of a word" def eval( self, word: String, evaluation: Evaluation, options: dict @@ -130,7 +131,7 @@ class WordCount(_SpacyBuiltin): = 4 """ - summary_text = "Count the words in a text" + summary_text = "count words in a text" def eval(self, text, evaluation: Evaluation, options: dict): "WordCount[text_String, OptionsPattern[WordCount]]" @@ -154,9 +155,7 @@ class WordFrequency(_SpacyBuiltin): ## Problem with import for certain characters in the text. ## >> text = Import["ExampleData/EinsteinSzilLetter.txt"]; - >> text = "I have a dairy cow, it's not just any cow. \ -She gives me milkshake, oh what a salty cow. She is the best \ -cow in the county."; + >> text = "I have a dairy cow, it's not just any cow. She gives me milkshake, oh what a salty cow. She is the best cow in the county."; >> WordFrequency[text, "a" | "the"] = 0.121212 @@ -232,7 +231,7 @@ class WordSimilarity(_SpacyBuiltin): "idxfmt": "Indices must be integers or lists of integers of the same length.", }, ) - summary_text = "Measure the similarity of two texts" + summary_text = "measure similarity of two texts" def eval( self, text1: String, text2: String, evaluation: Evaluation, options: dict diff --git a/pymathics/natlang/util.py b/pymathics/natlang/util.py index 4b80076..383c55c 100644 --- a/pymathics/natlang/util.py +++ b/pymathics/natlang/util.py @@ -4,6 +4,9 @@ utils """ +# Don't consider this for user documentation +no_doc = True + def merge_dictionaries(a, b): c = a.copy()