From 4d1d9f1c1921a498b68d8d5fbc63d65f3182ff8e Mon Sep 17 00:00:00 2001
From: Michael Fekadu
Date: Wed, 4 Mar 2020 18:51:56 -0800
Subject: [PATCH 1/3] install MonkeyType as a dev-dependency, ignore monkeytype.sqlite3

pipenv install MonkeyType --dev
pipenv shell
monkeytype --verbose run nimbus.py
---
 .gitignore   |  3 +++
 Pipfile      |  2 +-
 Pipfile.lock | 61 +++++++++++++++++++++++++++++++++++-----------------
 3 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9532684..ea1a9df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# monkeytype
+monkeytype.sqlite3
+
 nvenv/*

diff --git a/Pipfile b/Pipfile
index 6069200..dc31126 100644
--- a/Pipfile
+++ b/Pipfile
@@ -22,7 +22,7 @@ pytest = "==5.3.4"
 pyre-check = "==0.0.41"
 ## like the Unix `make` but better
 invoke = "==1.4.1"
-
+monkeytype = "*"
 
 [packages]
 # REST API

diff --git a/Pipfile.lock b/Pipfile.lock
index 6575e89..3d5dbb3 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "fb30d39142d3cc83d8909d9f4f4648a60ac33d4ec3a5a94d8dac7b90ef727a24"
+            "sha256": "348fee5e0ee39fb3206a8e4f15f486f53691109f3e3035829eb5a61d64ccdd6c"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -636,20 +636,20 @@
         },
         "srsly": {
             "hashes": [
-                "sha256:1102b4984f9f56364540e47d83fac3e7543903dfbb92f0d0e5dd3bfd40528934",
-                "sha256:1c4354095f63f59fc52a4362960faaddebcfa7a240f07209eb50e8f9ec39e700",
-                "sha256:3ceae42dbbda49b57a4937e0ca28f56c2a121c89008cc7ec09e0a9d8d705c03e",
-                "sha256:4ce9d6ab6d1c617150455ef5ba8abd5107a8e65956f06c2efc86697f4cb4b431",
-                "sha256:51c47f98dc06d5c2d1d7806cd38dcc834ab9906dc12170bc21105e5a9590a6fd",
-                "sha256:a672ffaa77680f355933cf424739ae9ecff767908a374ad194692b53040fda01",
-                "sha256:abe3d98d9ea8f7dac898119cd9861466c49cfe0f16287c9f859e0d4cab43a7a4",
-                "sha256:c6bdf53a87770139c6a9d75b3e664505bd81c022312fafca35ed38714e4ecdf1",
-                "sha256:c82e6dc3727454edc6ccdb1d07d5bc0aab3f43539fb8d9f973cf769135d2c7e4",
-                "sha256:ca1ec20ea6e14ad56ccaa84aa6c79d6e51fccf32e0040372b4d06c6e5dbb7fee",
-                "sha256:d5c0c718b2f67fc425d9bb3cc26b6141cb2f53251cdc145f58b70095241a3308",
-                "sha256:de329ba0ff451308d59e40c39372f5231e7c364f4933d7457788203630bdede2"
+                "sha256:18bad26c34cf5a8853fbf018fd168a7bf2ea7ce661e66476c25dac711cb79c9b",
+                "sha256:2179cf1e88c250e89e40227bd5848341011c170079b3d424987d067de6a73f42",
+                "sha256:21cfb0e5dea2c4515b5c2daa78402d5782c6425b4f58af40d2e2cb45e4778d8c",
+                "sha256:29434753a77481ec6129991f4116f983085cc8005c1ad963261124842e8c05fc",
+                "sha256:3f3975e8cb67194d26dd03508469b1303f8b994f30e7782f7eae25fef6dc4aad",
+                "sha256:46213d8f094b348a9433c825ac1eba36a21aa25a8bae6f29c2f9f053e15be961",
+                "sha256:59258b81d567df207f8a0a33c4b5fa232afccf1d927c8ce3ba5395bfd64c0ed8",
+                "sha256:7c553a709fd56a37a07f969e849f55a0aeabaeb7677bebc588a640ab8ec134aa",
+                "sha256:95849d84e8929be248a180e672c8ce1ed98b1341263bc983efdf8427465584f1",
+                "sha256:b94d8a13c60e3298a9ba12b1b211026e8378c7d087efd7ce46a3f2d8d4678d94",
+                "sha256:c8beff52c104a7ffe4a15513a05dc0497998cf83aa1ca39454489994d18c1c07",
+                "sha256:d409beb7257208633c974c01f9dc3265562fb6802caee7de21880761ba87c3ed"
             ],
-            "version": "==1.0.1"
+            "version": "==1.0.2"
         },
         "thinc": {
             "hashes": [
@@ -706,10 +706,10 @@
         },
         "zipp": {
             "hashes": [
-                "sha256:12248a63bbdf7548f89cb4c7cda4681e537031eda29c02ea29674bc6854460c2",
-                "sha256:7c0f8e91abc0dc07a5068f315c52cb30c66bfbc581e5b50704c8a2f6ebae794a"
+                "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b",
+                "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"
             ],
-            "version": "==3.0.0"
+            "version": "==3.1.0"
         }
     },
     "develop": {
@@ -826,6 +826,14 @@
             ],
             "version": "==0.6.1"
         },
+        "monkeytype": {
+            "hashes": [
+                "sha256:71da688939f08d19904462eef2e568a4f18f6133cc7e3c901ff5034c8ab5a538",
+                "sha256:9f052b42851bc24603836ce3105166c8cc5edabeb25e8fcf256fa25777122618"
+            ],
+            "index": "pypi",
+            "version": "==19.11.2"
+        },
         "more-itertools": {
             "hashes": [
                 "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c",
@@ -985,6 +993,13 @@
             ],
             "version": "==2.23.0"
         },
+        "retype": {
+            "hashes": [
+                "sha256:7d033b115f66e5327dea0a3fd7c9a3dbfa53841575daf27ce2ce409956d901d4",
+                "sha256:846fd135d3ee33c1bad387602a405d808cb99a9a7a47299bfd0e1d25dfb2fedd"
+            ],
+            "version": "==19.9.0"
+        },
         "six": {
             "hashes": [
                 "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
@@ -1000,6 +1015,12 @@
             ],
             "version": "==2.1.0"
         },
+        "stringcase": {
+            "hashes": [
+                "sha256:48a06980661908efe8d9d34eab2b6c13aefa2163b3ced26972902e3bdfd87008"
+            ],
+            "version": "==1.2.0"
+        },
         "toml": {
             "hashes": [
                 "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c",
@@ -1065,10 +1086,10 @@
         },
         "zipp": {
             "hashes": [
-                "sha256:12248a63bbdf7548f89cb4c7cda4681e537031eda29c02ea29674bc6854460c2",
-                "sha256:7c0f8e91abc0dc07a5068f315c52cb30c66bfbc581e5b50704c8a2f6ebae794a"
+                "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b",
+                "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96"
             ],
-            "version": "==3.0.0"
+            "version": "==3.1.0"
         }
     }
 }
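The `monkeytype --verbose run nimbus.py` step in the commit message above executes the script under MonkeyType's tracer and records the observed argument and return types in monkeytype.sqlite3, which is why the .gitignore entry is added. A minimal sketch of the same tracing via MonkeyType's library API, assuming MonkeyType is installed; `greet` is a hypothetical stand-in for the code paths nimbus.py actually exercises, not code from this repo:

```
# Sketch only: `greet` is a hypothetical stand-in, not code from this repo.
import monkeytype


def greet(name):
    return "Hello, " + name


# Calls made inside the context manager are traced; the observed types are
# written to ./monkeytype.sqlite3 by default -- the file ignored above.
with monkeytype.trace():
    greet("Nimbus")
```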
"version": "==3.1.0" } }, "develop": { @@ -826,6 +826,14 @@ ], "version": "==0.6.1" }, + "monkeytype": { + "hashes": [ + "sha256:71da688939f08d19904462eef2e568a4f18f6133cc7e3c901ff5034c8ab5a538", + "sha256:9f052b42851bc24603836ce3105166c8cc5edabeb25e8fcf256fa25777122618" + ], + "index": "pypi", + "version": "==19.11.2" + }, "more-itertools": { "hashes": [ "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", @@ -985,6 +993,13 @@ ], "version": "==2.23.0" }, + "retype": { + "hashes": [ + "sha256:7d033b115f66e5327dea0a3fd7c9a3dbfa53841575daf27ce2ce409956d901d4", + "sha256:846fd135d3ee33c1bad387602a405d808cb99a9a7a47299bfd0e1d25dfb2fedd" + ], + "version": "==19.9.0" + }, "six": { "hashes": [ "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9", @@ -1000,6 +1015,12 @@ ], "version": "==2.1.0" }, + "stringcase": { + "hashes": [ + "sha256:48a06980661908efe8d9d34eab2b6c13aefa2163b3ced26972902e3bdfd87008" + ], + "version": "==1.2.0" + }, "toml": { "hashes": [ "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c", @@ -1065,10 +1086,10 @@ }, "zipp": { "hashes": [ - "sha256:12248a63bbdf7548f89cb4c7cda4681e537031eda29c02ea29674bc6854460c2", - "sha256:7c0f8e91abc0dc07a5068f315c52cb30c66bfbc581e5b50704c8a2f6ebae794a" + "sha256:aa36550ff0c0b7ef7fa639055d797116ee891440eac1a56f378e2d3179e0320b", + "sha256:c599e4d75c98f6798c509911d08a22e6c021d074469042177c8c86fb92eefd96" ], - "version": "==3.0.0" + "version": "==3.1.0" } } } From 41945a06af197f68719799a6676578922d192ad3 Mon Sep 17 00:00:00 2001 From: Michael Fekadu Date: Wed, 4 Mar 2020 18:54:42 -0800 Subject: [PATCH 2/3] generate type annotations by apply monkeytype stubs on nimbus.py monkeytype docs: https://github.com/Instagram/MonkeyType *************************************************** pipenv install MonkeyType --dev pipenv shell monkeytype --verbose run nimbus.py ``` $ monkeytype list-modules QA database_wrapper nimbus_nlp.NIMBUS_NLP nimbus_nlp.question_classifier nimbus_nlp.save_and_load_model ``` monkeytype --verbose stub QA monkeytype --verbose stub database_wrapper monkeytype --verbose stub nimbus_nlp.NIMBUS_NLP monkeytype --verbose stub nimbus_nlp.question_classifier monkeytype --verbose stub nimbus_nlp.save_and_load_model monkeytype --verbose apply QA monkeytype --verbose apply database_wrapper monkeytype --verbose apply nimbus_nlp.NIMBUS_NLP monkeytype --verbose apply nimbus_nlp.question_classifier monkeytype --verbose apply nimbus_nlp.save_and_load_model --- QA.py | 22 ++++++++++++---------- database_wrapper.py | 6 +++--- nimbus_nlp/NIMBUS_NLP.py | 17 ++++++++++------- nimbus_nlp/question_classifier.py | 20 +++++++++++--------- nimbus_nlp/save_and_load_model.py | 3 ++- 5 files changed, 38 insertions(+), 30 deletions(-) diff --git a/QA.py b/QA.py index 250b065..d2c2699 100644 --- a/QA.py +++ b/QA.py @@ -9,6 +9,8 @@ from database_wrapper import NimbusMySQLAlchemy from pandas import read_csv +from functools import partial +from typing import Dict, List Extracted_Vars = Dict[str, Any] DB_Data = Dict[str, Any] DB_Query = Callable[[Extracted_Vars], DB_Data] @@ -33,7 +35,7 @@ class QA: A class for wrapping functions used to answer a question. 
""" - def __init__(self, q_format, db_query, format_answer): + def __init__(self, q_format: str, db_query: partial, format_answer: partial) -> None: """ Args: q_format (str): Question format string @@ -49,13 +51,13 @@ def __init__(self, q_format, db_query, format_answer): self.db_query = db_query self.format_answer = format_answer - def _get_data_from_db(self, extracted_vars): + def _get_data_from_db(self, extracted_vars: Dict[str, str]) -> str: return self.db_query(extracted_vars) - def _format_answer(self, extracted_vars, db_data): + def _format_answer(self, extracted_vars: Dict[str, str], db_data: str) -> str: return self.format_answer(extracted_vars, db_data) - def answer(self, extracted_vars): + def answer(self, extracted_vars: Dict[str, str]) -> str: db_data = self._get_data_from_db(extracted_vars) return self._format_answer(extracted_vars, db_data) @@ -66,7 +68,7 @@ def __hash__(self): return hash(self.q_format) -def create_qa_mapping(qa_list): +def create_qa_mapping(qa_list: List[QA]) -> Dict[str, QA]: """ Creates a dictionary whose values are QA objects and keys are the question formats of those QA objects. @@ -146,18 +148,18 @@ def create_qa_mapping(qa_list): # return functools.partial(_single_var_string_sub, a_format) -def _string_sub(a_format, extracted_info, db_data): +def _string_sub(a_format: str, extracted_info: Dict[str, str], db_data: str) -> str: if db_data is None: return None else: return a_format.format(ex=extracted_info['normalized entity'], db=db_data) -def string_sub(a_format): +def string_sub(a_format: str) -> partial: return functools.partial(_string_sub, a_format) -def _get_property(prop, extracted_info): +def _get_property(prop: str, extracted_info: Dict[str, str]) -> str: ent_string = extracted_info["normalized entity"] ent = tag_lookup[extracted_info['tag']] try: @@ -168,7 +170,7 @@ def _get_property(prop, extracted_info): return value -def get_property(prop): +def get_property(prop: str) -> partial: return functools.partial(_get_property, prop) @@ -186,7 +188,7 @@ def yes_no(a_format, pred=None): return functools.partial(_yes_no, a_format, pred) -def generate_fact_QA(csv): +def generate_fact_QA(csv: str) -> List[QA]: df = read_csv(csv) text_in_brackets = r'\[[^\[\]]*\]' qa_objs = [] diff --git a/database_wrapper.py b/database_wrapper.py index c5b0849..7f4b9b9 100755 --- a/database_wrapper.py +++ b/database_wrapper.py @@ -409,7 +409,7 @@ def __safe_create(SQLAlchemy_object): __safe_create(self.Locations) __safe_create(self.QuestionAnswerPair) - def _create_database_session(self): + def _create_database_session(self) -> None: Session = sessionmaker(bind=self.engine) self.session = Session() print("initialized database session") @@ -435,13 +435,13 @@ def return_qa_pair_csv(self): def partial_fuzzy_match(self, tag_value, identifier): return fuzz.partial_ratio(tag_value, identifier) - def full_fuzzy_match(self, tag_value, identifier): + def full_fuzzy_match(self, tag_value: str, identifier: str) -> int: return fuzz.ratio(tag_value, identifier) def get_property_from_entity( self, prop: str, entity: UNION_ENTITIES, identifier: str, tag_column_map: dict = default_tag_column_dict - ): + ) -> str: """ This function implements the abstractmethod to get a column of values from a NimbusDatabase entity. 
diff --git a/nimbus_nlp/NIMBUS_NLP.py b/nimbus_nlp/NIMBUS_NLP.py
index d525d8f..981e89d 100644
--- a/nimbus_nlp/NIMBUS_NLP.py
+++ b/nimbus_nlp/NIMBUS_NLP.py
@@ -19,10 +19,13 @@
 # Temporary import for the classifier
 from nimbus_nlp.question_classifier import QuestionClassifier
 
+from google.cloud.automl_v1.types import PredictResponse
+from monkeytype.encoding import DUMMY_NAME
+from typing import Dict
 
 class NIMBUS_NLP:
 
     @staticmethod
-    def predict_question(input_question):
+    def predict_question(input_question: str) -> Dict[str, str]:
         '''
         Runs through variable extraction and the question classifier
         to predict the intended question.
@@ -55,7 +58,7 @@ def predict_question(input_question):
 
 class Variable_Extraction:
 
-    def __init__(self, config_file: str = "config.json"):
+    def __init__(self, config_file: str = "config.json") -> None:
         with open(config_file) as json_data_file:
             config = json.load(json_data_file)
 
@@ -70,7 +73,7 @@ def __init__(self, config_file: str = "config.json"):
         # TODO: consider does this even do anything useful?
         os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credential_path
 
-    def inline_text_payload(self, sent):
+    def inline_text_payload(self, sent: str) -> Dict[str, DUMMY_NAME]:
         '''
         Converts the input sentence into GCP's callable format
 
@@ -82,7 +85,7 @@ def inline_text_payload(self, sent):
         return {'text_snippet': {'content': sent, 'mime_type': 'text/plain'} }
 
-    def get_prediction(self, sent):
+    def get_prediction(self, sent: str) -> PredictResponse:
         '''
         Obtains the prediction from the input sentence and returns
         the normalized sentence
@@ -109,7 +112,7 @@ def get_prediction(self, sent):
         # Return the output of the API call
         return request
 
-    def extract_variables(self, sent):
+    def extract_variables(self, sent: str) -> Dict[str, str]:
         '''
         Takes the prediction and replaces the entity with its
         corresponding tag
@@ -146,7 +149,7 @@ def extract_variables(self, sent):
         }
 
     @staticmethod
-    def excess_word_removal(entity, tag):
+    def excess_word_removal(entity: str, tag: str) -> str:
         '''
         Checks the tag and determines which excess word
         removal function to use
@@ -163,7 +166,7 @@ def excess_word_removal(entity, tag):
         return entity
 
     @staticmethod
-    def strip_titles(entity):
+    def strip_titles(entity: str) -> str:
         '''
         Strips titles from input entities

diff --git a/nimbus_nlp/question_classifier.py b/nimbus_nlp/question_classifier.py
index 1890bcf..9fa56bb 100644
--- a/nimbus_nlp/question_classifier.py
+++ b/nimbus_nlp/question_classifier.py
@@ -13,9 +13,11 @@
 # TODO: move the functionality in this module into class(es), so that it can be more easily used as a dependency
 
+from spacy.tokens.token import Token
+from typing import Dict, List, Tuple
 
 class QuestionClassifier:
-    def __init__(self):
+    def __init__(self) -> None:
         nltk.download('stopwords')
         nltk.download('punkt')
         nltk.download('averaged_perceptron_tagger')
@@ -38,12 +40,12 @@ def train_model(self):
 
         save_model(self.classifier, "nlp-model")
 
-    def load_latest_classifier(self):
+    def load_latest_classifier(self) -> None:
         self.classifier = load_latest_model()
         with open(PROJECT_DIR+ '/models/features/overall_features.json', 'r') as fp:
             self.overall_features = json.load(fp)
 
-    def get_question_features(self, question):
+    def get_question_features(self, question: str) -> Dict[str, int]:
         # print("using new algorithm")
         """
         Method to extract features from each individual question.
@@ -121,7 +123,7 @@ def get_question_features_old_algorithm(self, question):
     # Note: this method of extracting the main verb is not perfect, but
     # for single sentence questions that should have no ambiguity about the main verb,
     # it should be sufficient.
-    def extract_main_verb(self, question):
+    def extract_main_verb(self, question: str) -> Token:
         doc = self.nlp(question)
         sents = list(doc.sents)
         if len(sents) == 0:
@@ -129,10 +131,10 @@ def extract_main_verb(self, question):
 
         return sents[0].root
 
-    def get_lemmas(self, words):
+    def get_lemmas(self, words: List[str]) -> List[str]:
         return [self.nlp(word)[0].lemma_ for word in words]
 
-    def is_wh_word(self, pos):
+    def is_wh_word(self, pos: str) -> bool:
         return pos in self.WH_WORDS
 
     def build_question_classifier(self):
@@ -174,7 +176,7 @@ def build_question_classifier(self):
 
         return new_classifier
 
-    def filterWHTags(self, question):
+    def filterWHTags(self, question: str) -> List[Tuple[str, str]]:
         # ADD ALL VARIABLES TO THE FEATURE DICT WITH A WEIGHT OF 90
         matches = re.findall(r'(\[(.*?)\])', question)
         for match in matches:
@@ -193,7 +195,7 @@ def filterWHTags(self, question):
             tag for tag in question_tags if self.is_wh_word(tag[1])]
         return question_tags
 
-    def validate_WH(self, test_question, predicted_question):
+    def validate_WH(self, test_question: str, predicted_question: str) -> bool:
         """
         Assumes that only 1 WH word exists
         Returns True if the WH word in the test question equals the
@@ -221,7 +223,7 @@ def validate_WH(self, test_question, predicted_question):
             i += 1
         return wh_match
 
-    def classify_question(self, test_question):
+    def classify_question(self, test_question: str) -> str:
         """
         Match a user query with a question in the database based on the
         classifier we trained and overall features we calculated.
         Return relevant question.

diff --git a/nimbus_nlp/save_and_load_model.py b/nimbus_nlp/save_and_load_model.py
index 375ae12..2f0f9da 100644
--- a/nimbus_nlp/save_and_load_model.py
+++ b/nimbus_nlp/save_and_load_model.py
@@ -12,6 +12,7 @@
 from os.path import isfile, join
 import re
+from sklearn.neighbors.classification import KNeighborsClassifier
 
 PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
 now = datetime.now()
 date_time = now.strftime("_%m_%d_%Y_%H_%M_%S")
@@ -29,7 +30,7 @@ def load_model(model_name):
     train_path = PROJECT_DIR + '/models/classification/' + model_name + '.joblib'
     return joblib.load(train_path)
 
-def load_latest_model():
+def load_latest_model() -> KNeighborsClassifier:
     # https://stackoverflow.com/a/39327156
     train_path = PROJECT_DIR + '/models/classification/*'
     list_of_files = glob.glob(train_path)
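One caveat about the annotations this patch applies: MonkeyType only records the types its traces actually observed, so some generated signatures are narrower than the code allows. For example, `_string_sub` in QA.py is annotated `-> str` even though it returns None when `db_data` is missing, and `inline_text_payload` picked up MonkeyType's `DUMMY_NAME` placeholder. A hand-tightened sketch of that one signature, assuming Optional semantics are what is intended; this is illustrative only and not part of the applied patch:

```
# Sketch, not part of the patch: widen MonkeyType's observed `-> str`
# to Optional[str] so the early `return None` branch is covered.
import functools
from typing import Dict, Optional


def _string_sub(a_format: str, extracted_info: Dict[str, str],
                db_data: Optional[str]) -> Optional[str]:
    if db_data is None:
        return None
    return a_format.format(ex=extracted_info['normalized entity'], db=db_data)


def string_sub(a_format: str) -> functools.partial:
    return functools.partial(_string_sub, a_format)
```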
From 1b83bc614a4b278eba50cf82e7801cefa6894369 Mon Sep 17 00:00:00 2001
From: Michael Fekadu
Date: Wed, 4 Mar 2020 19:14:32 -0800
Subject: [PATCH 3/3] allow any version of pyre-check because it's a dev-dependency, not a production dependency

---
 Pipfile      | 2 +-
 Pipfile.lock | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Pipfile b/Pipfile
index dc31126..a78b4c7 100644
--- a/Pipfile
+++ b/Pipfile
@@ -19,7 +19,7 @@ flake8 = "==3.7.9"
 hypothesis = "==5.3.1"
 pytest = "==5.3.4"
 ## type-checking
-pyre-check = "==0.0.41"
+pyre-check = "*"
 ## like the Unix `make` but better
 invoke = "==1.4.1"
 monkeytype = "*"

diff --git a/Pipfile.lock b/Pipfile.lock
index 3d5dbb3..884f711 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "348fee5e0ee39fb3206a8e4f15f486f53691109f3e3035829eb5a61d64ccdd6c"
+            "sha256": "4040c61542e19c50f3a11a971ef93b181f9f548859f3acd8bc350ccbab0c425f"
         },
         "pipfile-spec": 6,
         "requires": {