From edaea7ff6e4a3418856db504790b6f9b9d0b6afc Mon Sep 17 00:00:00 2001 From: Cameron Toy Date: Mon, 2 Mar 2020 13:41:50 -0800 Subject: [PATCH 1/4] performance improvements --- database_wrapper.py | 3 +- flask_api.py | 2 + nimbus.py | 17 +++-- nimbus_nlp/NIMBUS_NLP.py | 104 ++++++++++++++---------------- nimbus_nlp/question_classifier.py | 15 +++-- 5 files changed, 75 insertions(+), 66 deletions(-) diff --git a/database_wrapper.py b/database_wrapper.py index b43d5ff..0e51b16 100755 --- a/database_wrapper.py +++ b/database_wrapper.py @@ -92,6 +92,7 @@ ] } + class BadDictionaryKeyError(Exception): """Raised when the given JSON/dict is missing some required fields. @@ -154,7 +155,7 @@ def __init__(self, message: str): def get_current_time(): """ - Useful for answering questions like "Is prof availible now/tomorrow?" + Useful for answering questions like "Is prof available now/tomorrow?" """ pass diff --git a/flask_api.py b/flask_api.py index d6478c5..4df2438 100755 --- a/flask_api.py +++ b/flask_api.py @@ -4,6 +4,7 @@ Contains all the handlers for the API. Also the main code to run Flask. 
""" import json +import requests from flask import Flask, jsonify, request from flask_cors import CORS @@ -275,3 +276,4 @@ def convert_to_mfcc(): app.run(host='0.0.0.0', debug=gunicorn_config.DEBUG_MODE, port=gunicorn_config.PORT) + diff --git a/nimbus.py b/nimbus.py index 2ef8f87..f37fe1a 100644 --- a/nimbus.py +++ b/nimbus.py @@ -1,5 +1,5 @@ from QA import create_qa_mapping, generate_fact_QA -from nimbus_nlp.NIMBUS_NLP import NIMBUS_NLP +from nimbus_nlp.NIMBUS_NLP import NimbusNLP class Nimbus: @@ -8,24 +8,33 @@ def __init__(self): self.qa_dict = create_qa_mapping( generate_fact_QA("q_a_pairs.csv") ) + self.nimbus_nlp = NimbusNLP() def answer_question(self, question): - ans_dict = NIMBUS_NLP.predict_question(question) + ans_dict = self.nimbus_nlp.predict_question(question) print(ans_dict) try: qa = self.qa_dict[ans_dict["question class"]] except KeyError: + # Printed if question isn't found. This occurs because the training set is broader + # than the answerable question set. return "I'm sorry, I don't understand. Please try another question." else: answer = qa.answer(ans_dict) if answer is None: + # Printed when a database query was made and a null value was returned. + # Should be handled in the QA class in the future. return("I'm sorry, I understand your question but was unable to find an answer. " "Please try another question.") else: return answer + if __name__ == "__main__": nimbus = Nimbus() + # print(nimbus.answer_question("What is Irene's phone number?")) + # print(nimbus.answer_question("What is Dr. 
Khosmood's email?")) + # print(nimbus.answer_question("What are the prerequisites for CPE 357?")) while True: - question = input("Enter a question: ") - print(nimbus.answer_question(question)) \ No newline at end of file + q = input("Enter a question: ") + print(nimbus.answer_question(q)) \ No newline at end of file diff --git a/nimbus_nlp/NIMBUS_NLP.py b/nimbus_nlp/NIMBUS_NLP.py index d525d8f..b93cdd7 100644 --- a/nimbus_nlp/NIMBUS_NLP.py +++ b/nimbus_nlp/NIMBUS_NLP.py @@ -1,16 +1,3 @@ - -import nltk -import numpy as np -import os -import pandas as pd -import re -import sklearn.neighbors -import spacy -import sys - -from google.api_core.client_options import ClientOptions -from google.cloud import automl_v1 -from google.cloud.automl_v1.proto import service_pb2 import os import json from google.api_core.client_options import ClientOptions @@ -19,41 +6,45 @@ # Temporary import for the classifier from nimbus_nlp.question_classifier import QuestionClassifier -class NIMBUS_NLP: - @staticmethod - def predict_question(input_question): - ''' +# Made this an instantiable class to prevent the overhead of instantiating +# a variable extractor and question classifier for every question. +# Consider: Does this even need to be a class? Its functionality could be +# moved to the Nimbus class of nimbus.py +class NimbusNLP: + + def __init__(self): + # Instantiate variable extractor and question classifier + self.variable_extractor = VariableExtractor() + self.classifier = QuestionClassifier() + # Load classifier model + self.classifier.load_latest_classifier() + + def predict_question(self, input_question): + """ Runs through variable extraction and the question classifier to predict the intended question. 
Args: input_question (string) - user input question to answer - Return: nlp_props (dict) - contains the user's input question, + Return: nlp_props (dict) - contains the user's input question, the variable extracted input question, the entity extracted, and the predicted answer - ''' - - # Instantiate the variable extraction class - variable_extraction = Variable_Extraction() + """ - # Obtain the properties from variable extraction - nlp_props = variable_extraction.extract_variables(input_question) + # Get dictionary of extracted variables + info from question + nlp_props = self.variable_extractor.extract_variables(input_question) - # Instantiate the question classifier class - classifier = QuestionClassifier() - classifier.load_latest_classifier() - - # Classify the question and add it to the nlp properties dictionary - nlp_props["question class"] = classifier.\ - classify_question(nlp_props["normalized question"]) + # Add classified question to nlp_props dictionary + nlp_props["question class"] = self.classifier.\ + classify_question(nlp_props["normalized question"]) return nlp_props -class Variable_Extraction: +class VariableExtractor: def __init__(self, config_file: str = "config.json"): @@ -68,34 +59,35 @@ def __init__(self, config_file: str = "config.json"): credential_path = os.getcwd() + "/auth.json" # TODO: consider does this even do anything useful? 
- os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credential_path - def inline_text_payload(self, sent): - ''' - Converts the input sentence into GCP's callable format + @staticmethod + def inline_text_payload(sent): + """ + Converts the input sentence into GCP's callable format Args: sent (string) - input sentence Return: (dict) - GCP NER input format - ''' + """ - return {'text_snippet': {'content': sent, 'mime_type': 'text/plain'} } + return {"text_snippet": {"content": sent, "mime_type": "text/plain"} } def get_prediction(self, sent): - ''' + """ Obtains the prediction from the input sentence and returns the normalized sentence Args: sent (string) - input sentence Return: request (PredictObject) - predictiton output - ''' + """ params = {} # Setup API - options = ClientOptions(api_endpoint='automl.googleapis.com') + options = ClientOptions(api_endpoint="automl.googleapis.com") # Create prediction object predictor = automl_v1.PredictionServiceClient(client_options=options) @@ -110,7 +102,7 @@ def get_prediction(self, sent): return request def extract_variables(self, sent): - ''' + """ Takes the prediction and replaces the entity with its corresponding tag Args: sent (string) - input sentence @@ -120,7 +112,7 @@ def extract_variables(self, sent): "normalized entity" - stripped entity "input question" - input question from the user "normalized question" - variable-replaced question - ''' + """ # Make the prediction request = self.get_prediction(sent) @@ -132,10 +124,10 @@ def extract_variables(self, sent): tag = request.payload[0].display_name # Removes excessive words from the entity - normalized_entity = Variable_Extraction.excess_word_removal(entity, tag) + normalized_entity = VariableExtractor.excess_word_removal(entity, tag) # Replaces the entity of input question with its corresponding tag - normalized_question = sent.replace(entity, '[' + tag + ']') + normalized_question = sent.replace(entity, 
"[" + tag + "]") return { "entity" : entity, @@ -147,31 +139,31 @@ def extract_variables(self, sent): @staticmethod def excess_word_removal(entity, tag): - ''' + """ Checks the tag and determines which excess word removal function to use Args: entity (string) - extracted entity from the input question Return: (string) - returns the normalized entity string - ''' + """ - if (tag == 'PROF'): - return Variable_Extraction.strip_titles(entity) + if tag == "PROF": + return VariableExtractor.strip_titles(entity) else: return entity @staticmethod def strip_titles(entity): - ''' + """ Strips titles from input entities Args: entity (string) - extracted entity from the input question Return: norm_entity (string) - the normalized, title-stripped entity - ''' + """ # list of titles for removal titles = {"professor", "dr.", "dr", "doctor", "prof", "instructor", "mrs.",\ @@ -189,12 +181,16 @@ def strip_titles(entity): # if there is no title in the word return entity + #TODO: Add the Question_Classifier code directly into this file +# Is this really necessary? Separation of dependencies might be good here. 
class Question_Classifier: pass -if __name__ == '__main__': + +if __name__ == "__main__": + nimbus_nlp = NimbusNLP() while True: question = input("Enter a question: ") - answer = NIMBUS_NLP.predict_question(question) - print(answer) + answer = nimbus_nlp.predict_question(question) + print(answer) \ No newline at end of file diff --git a/nimbus_nlp/question_classifier.py b/nimbus_nlp/question_classifier.py index e0c6882..b376f73 100644 --- a/nimbus_nlp/question_classifier.py +++ b/nimbus_nlp/question_classifier.py @@ -4,8 +4,6 @@ import numpy as np import sklearn.neighbors import pandas as pd -import sys -import json from nimbus_nlp.save_and_load_model import save_model, load_latest_model, PROJECT_DIR import json @@ -16,7 +14,13 @@ class QuestionClassifier: def __init__(self): - nltk.download('stopwords') + # Prevents classifier from attempting to download stopwords corpus every run + try: + from nltk.corpus import stopwords + except ImportError: + nltk.download('stopwords') + from nltk.corpus import stopwords + self.classifier = None self.nlp = spacy.load('en_core_web_sm') self.WH_WORDS = {'WDT', 'WP', 'WP$', 'WRB'} @@ -25,20 +29,17 @@ def __init__(self): def train_model(self): self.save_model = save_model - # REPLACE WITH API EVENTUALLY self.file_path = "question_set_clean.csv" # The possible WH word tags returned through NLTK part of speech tagging - self.classifier = self.build_question_classifier() save_model(self.classifier, "nlp-model") - def load_latest_classifier(self): self.classifier = load_latest_model() - with open(PROJECT_DIR+ '/models/features/overall_features.json', 'r') as fp: + with open(PROJECT_DIR + '/models/features/overall_features.json', 'r') as fp: self.overall_features = json.load(fp) def get_question_features(self, question): From c47f159c9522ae41ea52fbedbcc9c3ed0b76254b Mon Sep 17 00:00:00 2001 From: Cameron Toy Date: Mon, 2 Mar 2020 14:01:03 -0800 Subject: [PATCH 2/4] removed imports used for testing --- flask_api.py | 3 --- 1 file 
changed, 3 deletions(-) diff --git a/flask_api.py b/flask_api.py index 4df2438..ae2eeb3 100755 --- a/flask_api.py +++ b/flask_api.py @@ -3,9 +3,6 @@ Contains all the handlers for the API. Also the main code to run Flask. """ -import json -import requests - from flask import Flask, jsonify, request from flask_cors import CORS from pydrive.auth import GoogleAuth From 4ec6c808e039675bf10a61de406d64c546a14d57 Mon Sep 17 00:00:00 2001 From: Cameron Toy Date: Tue, 7 Apr 2020 18:45:43 -0700 Subject: [PATCH 3/4] Added Professor/Section view access --- Entity/ProfessorSectionView.py | 46 ++++++++++++++++++++++++++++++++++ database_wrapper.py | 37 +++++++++++++++++++++++++-- test_view_access.py | 8 ++++++ 3 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 Entity/ProfessorSectionView.py create mode 100644 test_view_access.py diff --git a/Entity/ProfessorSectionView.py b/Entity/ProfessorSectionView.py new file mode 100644 index 0000000..70d05aa --- /dev/null +++ b/Entity/ProfessorSectionView.py @@ -0,0 +1,46 @@ +from sqlalchemy import Column, Integer, String, Enum, Text +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.dialects.mysql import SET +import enum + +Base = declarative_base() + + +class SectionType(enum.Enum): + activity = Act = 0 + seminar = Sem = 1 + independent = Ind = 2 + lab = 3 + lecture = Lec = 4 + + +class ProfessorSectionView(Base): + __tablename__ = "Professor_Teaches_Section_2" + id = Column(Integer, primary_key=True) + id_sections = Column(Integer) + profAliasId = Column(Integer) + section_name = Column(String(255)) + instructor = Column(String(255)) + profEmailAlias = Column(String(255)) + title = Column(String(255)) + phone = Column(String(255)) + office = Column(String(255)) + type = Column(Enum(SectionType)) + days = Column(SET('M', 'T', 'W', 'R', 'F')) + start = Column(String(255)) + end = Column(String(255)) + location = Column(String(255)) + department = Column(String(255)) + firstName = Column(String(50)) 
+ lastName = Column(String(50)) + phoneNumber = Column(String(20)) + researchInterests = Column(Text) + email = Column(String(255)) + + def __repr__(self): + D = self.__dict__ + attributes = [ + f"{k}={D.get(k)}" for k in self.__dir__() if not k.startswith("_") + ] + attributes_string = ", ".join(attributes) + return f"{self.__class__.__name__}({attributes_string})" diff --git a/database_wrapper.py b/database_wrapper.py index f541ff9..e8beca0 100755 --- a/database_wrapper.py +++ b/database_wrapper.py @@ -27,7 +27,8 @@ from Entity.QuestionAnswerPair import QuestionAnswerPair, AnswerType from Entity.Professors import Professors, ProfessorsProperties from Entity.Clubs import Clubs -from Entity.Sections import Sections, SectionType +from Entity.Sections import Sections +from Entity.ProfessorSectionView import ProfessorSectionView from fuzzywuzzy import fuzz @@ -50,6 +51,7 @@ Professors: {"firstName", "lastName"}, Clubs: {"club_name"}, Sections: {"section_name"}, + ProfessorSectionView: {"firstName", "lastName"} } EXPECTED_KEYS_BY_ENTITY = { @@ -343,6 +345,7 @@ def __init__(self, config_file: str = "config.json") -> None: self.AudioSampleMetaData = AudioSampleMetaData self.Locations = Locations self.QuestionAnswerPair = QuestionAnswerPair + self.ProfessorSectionViews = ProfessorSectionView self.inspector = inspect(self.engine) self._create_database_session() print("initialized NimbusMySQLAlchemy") @@ -415,6 +418,7 @@ def __safe_create(SQLAlchemy_object): __safe_create(self.AudioSampleMetaData) __safe_create(self.Locations) __safe_create(self.QuestionAnswerPair) + __safe_create(self.ProfessorSectionViews) def _create_database_session(self): Session = sessionmaker(bind=self.engine) @@ -466,6 +470,35 @@ def get_property_from_entity( ) >>> ["foaad@calpoly.edu"] + Args: + prop: the relevant property value to retrieve from matching entities + entity: the type of entity we want to get the property from + identifier: a string that identifies the entity in some way (i.e., a 
professor's name) + tag_column_map: a dictionary mapping entity types to columns that identify the entities + ex: + {Professors: {"firstName", "lastName"}} + + Returns: + The closest value of `prop`, + such that the `entity` matches `identifier`. + """ + return self._get_property_from_entity( + prop, + entity, + identifier, + tag_column_map + )[-1][2] + + def _get_property_from_entity( + self, + prop: str, + entity: UNION_ENTITIES, + identifier: str, + tag_column_map: dict = default_tag_column_dict, + ): + """ + Returns a full list of matching entities. Used by get_property_from_entity() + Args: prop: the relevant property value to retrieve from matching entities entity: the type of entity we want to get the property from @@ -506,7 +539,7 @@ def get_property_from_entity( return None sorted_results = sorted(results, key=lambda pair: pair[0]) - return sorted_results[-1][2] + return sorted_results def get_course_properties( self, department: str, course_num: Union[str, int] diff --git a/test_view_access.py b/test_view_access.py new file mode 100644 index 0000000..5bb7458 --- /dev/null +++ b/test_view_access.py @@ -0,0 +1,8 @@ +from QA import db +from Entity.ProfessorSectionView import ProfessorSectionView + +print(db._get_property_from_entity( + "section_name", + ProfessorSectionView, + "Irene Humer" +)) \ No newline at end of file From 3c93013d6dca30b7e134274a363defd081dee2a5 Mon Sep 17 00:00:00 2001 From: Cameron Toy <38936057+cameron-toy@users.noreply.github.com> Date: Sat, 18 Apr 2020 22:39:56 -0700 Subject: [PATCH 4/4] Changed __tablename__ Changed from "Professor_Teaches_Section_2" to "Professor_Teaches_Section" --- Entity/ProfessorSectionView.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Entity/ProfessorSectionView.py b/Entity/ProfessorSectionView.py index 70d05aa..701a05e 100644 --- a/Entity/ProfessorSectionView.py +++ b/Entity/ProfessorSectionView.py @@ -15,7 +15,7 @@ class SectionType(enum.Enum): class ProfessorSectionView(Base): - 
__tablename__ = "Professor_Teaches_Section_2" + __tablename__ = "Professor_Teaches_Section" id = Column(Integer, primary_key=True) id_sections = Column(Integer) profAliasId = Column(Integer)