named entity recognition basics

KaratsubaLabs · Jun 22, 2022 · 6837cd3
1 parent f73c404
commit 6837cd3
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 2 deletions.
diff --git a/bin/download.py b/bin/download.py
@@ -3,6 +3,6 @@
 
 DOWNLOAD_DIR="./venv/nltk_data"
 
-resources = ["stopwords", "punkt"]
+resources = ["stopwords", "punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words"]
 for resource in resources:
     nltk.download(resource, download_dir=DOWNLOAD_DIR)
diff --git a/main.py b/main.py
@@ -3,6 +3,8 @@
 from pipeop import pipes
 
 from nn import NeuralNet
+import nltk
+
 import preprocess
 import config
 
@@ -36,6 +38,16 @@ def preprocess_query(query):
     return torch.from_numpy(x)
 
 
+@pipes
+def analyze_query(query):
+    tagged = (query
+        >> preprocess.tokenize
+        >> nltk.pos_tag
+    )
+    print(tagged)
+    chunked = nltk.ne_chunk(tagged)
+    print(chunked)
+
 preprocessed = preprocess_query(query)
 # TODO catch a tensor that is all zero
 print(preprocessed)
@@ -48,6 +60,7 @@ def preprocess_query(query):
 
 if prob.item() > config.CONFIDENCE_THRESHOLD:
     print(f"[prob={prob.item():.4f}] {tag}")
+    analyze_query(query)
 else:
     print("query not understood")
 

diff --git a/preprocess.py b/preprocess.py
@@ -4,7 +4,7 @@
 from nltk.corpus import stopwords
 from nltk.stem import PorterStemmer
 
-STOP_WORDS = set(stopwords.words("english"))
+STOP_WORDS = set(stopwords.words("english")).union(['{}'])
 stemmer = PorterStemmer()