Skip to content

Commit

Permalink
named entity recognition basics
Browse files: browse the repository at this point in the history
  • Loading branch information
MrPicklePinosaur committed Jun 22, 2022
1 parent f73c404 commit 6837cd3
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 2 deletions.
2 changes: 1 addition & 1 deletion bin/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@

DOWNLOAD_DIR="./venv/nltk_data"

resources = ["stopwords", "punkt"]
resources = ["stopwords", "punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words"]
for resource in resources:
nltk.download(resource, download_dir=DOWNLOAD_DIR)
13 changes: 13 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from pipeop import pipes

from nn import NeuralNet
import nltk

import preprocess
import config

Expand Down Expand Up @@ -36,6 +38,16 @@ def preprocess_query(query):
return torch.from_numpy(x)


@pipes
def analyze_query(query):
    """Tag a query with parts of speech and chunk it into named entities.

    The query is tokenized with ``preprocess.tokenize``, the tokens are
    tagged with ``nltk.pos_tag``, and the tagged tokens are chunked with
    ``nltk.ne_chunk``. Both intermediate results are printed.

    Args:
        query: The raw query, passed unchanged to ``preprocess.tokenize``.

    Returns:
        The result of ``nltk.ne_chunk`` for the tagged tokens, so callers
        can inspect the recognized entities instead of only seeing the
        printed output.
    """
    # Under ``@pipes`` the ``>>`` operator feeds the left-hand value into the
    # callable on the right, i.e. pos_tag(tokenize(query)).
    tagged = (query
        >> preprocess.tokenize
        >> nltk.pos_tag
    )
    print(tagged)

    chunked = nltk.ne_chunk(tagged)
    print(chunked)

    # Hand the chunk result back rather than discarding it; existing callers
    # that ignore the return value are unaffected.
    return chunked

preprocessed = preprocess_query(query)
# TODO catch a tensor that is all zero
print(preprocessed)
Expand All @@ -48,6 +60,7 @@ def preprocess_query(query):

if prob.item() > config.CONFIDENCE_THRESHOLD:
print(f"[prob={prob.item():.4f}] {tag}")
analyze_query(query)
else:
print("query not understood")

Expand Down
2 changes: 1 addition & 1 deletion preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

STOP_WORDS = set(stopwords.words("english"))
STOP_WORDS = set(stopwords.words("english")).union(['{}'])
stemmer = PorterStemmer()


Expand Down

0 comments on commit 6837cd3

Please sign in to comment.