From 84f6905ece42907f36e4c898aab3c2703b98920b Mon Sep 17 00:00:00 2001 From: IanGrimstead <38883454+IanGrimstead@users.noreply.github.com> Date: Fri, 5 Apr 2019 19:45:50 +0100 Subject: [PATCH] Release 2.0.1 (#234) * Switched away from pyramid ARIMA due to stability issues * Now supports uncompressed pickle files (rather than just bzip2 compressed) * Using Python 3.7.3 * Handles imported data when dates are stored as strings rather than Timestamp objects * Corrected unigram handling --- .travis.yml | 24 ++-- appveyor.yml | 2 +- config/stopwords_glob.txt | 1 + config/stopwords_n.txt | 3 +- config/stopwords_uni.txt | 4 +- pygrams.py | 3 +- scripts/algorithms/arima.py | 77 +++++++++-- scripts/data_factory.py | 2 +- scripts/pipeline.py | 35 ++++- scripts/text_processing.py | 34 ++--- scripts/tfidf_mask.py | 6 +- scripts/tfidf_wrapper.py | 4 +- scripts/utils/argschecker.py | 8 +- scripts/utils/reduce_existing_data_frame.py | 2 +- scripts/utils/utils.py | 36 ++++++ setup.py | 5 +- tests/algorithms/test_arima.py | 49 ++++++- tests/data/fuel_cell_quarterly.csv | 52 ++++++++ tests/data/image_data_quarterly.csv | 52 ++++++++ tests/test_filter_terms.py | 26 +++- tests/test_pygrams.py | 33 +++-- tests/test_terms_graph.py | 14 +- tests/test_text_processing.py | 136 +++++++++++++++++++- tests/test_tfidf_mask.py | 8 +- tests/test_tfidf_reduce.py | 26 ++-- 25 files changed, 516 insertions(+), 126 deletions(-) create mode 100644 tests/data/fuel_cell_quarterly.csv create mode 100644 tests/data/image_data_quarterly.csv diff --git a/.travis.yml b/.travis.yml index c6aa177..f113baa 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,17 +2,17 @@ language: python matrix: include: - # Use the built in venv for linux builds - - os: linux - sudo: required - python: "3.6.6" - dist: trusty + # Use the built in venv for linux builds + - os: linux + sudo: required + python: "3.7.3" + dist: xenial - # Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html + # Use generic language for osx; taken from https://pythonhosted.org/CodeChat/.travis.yml.html # - os: osx # language: generic # env: PYTHON=3.6.6 - + before_install: | if [ "$TRAVIS_OS_NAME" == "osx" ]; then brew update @@ -21,12 +21,12 @@ before_install: | # See https://docs.travis-ci.com/user/osx-ci-environment/#A-note-on-upgrading-packages. # I didn't do this above because it works and I'm lazy. brew outdated pyenv || brew upgrade pyenv - + # virtualenv doesn't work without pyenv knowledge. venv in Python 3.3 # doesn't provide Pip by default. So, use `pyenv-virtualenv `_. brew install pyenv-virtualenv pyenv install $PYTHON - + # I would expect something like ``pyenv init; pyenv local $PYTHON`` or # ``pyenv shell $PYTHON`` would work, but ``pyenv init`` doesn't seem to # modify the Bash environment. ??? So, I hand-set the variables instead. @@ -34,13 +34,13 @@ before_install: | export PATH="/Users/travis/.pyenv/shims:${PATH}" pyenv-virtualenv venv source venv/bin/activate - + # A manual check that the correct version of Python is running. python --version fi export BOTO_CONFIG=/dev/null - + install: - python --version - python -m pip install -U pip @@ -53,6 +53,8 @@ install: script: # for codecov support - pip install pytest pytest-cov + # to report installed packages + - pip freeze # command to run tests - pytest --cov-config .coveragerc --cov=./ tests/ diff --git a/appveyor.yml b/appveyor.yml index 7d909f2..e0b4f07 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -5,7 +5,7 @@ build: none environment: matrix: - PYTHON: "C:\\Python36-x64" - PYTHON_VERSION: 3.6.6 + PYTHON_VERSION: 3.7.3 PYTHON_ARCH: 64 init: - ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH% diff --git a/config/stopwords_glob.txt b/config/stopwords_glob.txt index d05b03a..cf4509a 100644 --- a/config/stopwords_glob.txt +++ b/config/stopwords_glob.txt @@ -82,6 +82,7 @@ everybody everyone everything everywhere +excess f few find diff --git a/config/stopwords_n.txt b/config/stopwords_n.txt index d5a921b..1e34355 100644 --- a/config/stopwords_n.txt +++ b/config/stopwords_n.txt @@ -1,4 +1,5 @@ situation consist first -plurality \ No newline at end of file +plurality +second \ No newline at end of file diff --git a/config/stopwords_uni.txt b/config/stopwords_uni.txt index dd7999b..a1d7522 100644 --- a/config/stopwords_uni.txt +++ b/config/stopwords_uni.txt @@ -1 +1,3 @@ -etc \ No newline at end of file +etc +cover +adjacent \ No newline at end of file diff --git a/pygrams.py b/pygrams.py index fb0b198..f9fa108 100644 --- a/pygrams.py +++ b/pygrams.py @@ -128,7 +128,6 @@ def get_args(command_line_arguments): args = parser.parse_args(command_line_arguments) - args.path = 'data' return args @@ -165,7 +164,7 @@ def main(supplied_args): pickled_tf_idf_file_name=pickled_tf_idf_path, output_name=args.outputs_name, emerging_technology=args.emerging_technology) - pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=50) + pipeline.output(outputs, wordcloud_title=args.wordcloud_title, outname=args.outputs_name, nterms=args.num_ngrams_report) # emtech integration if args.emerging_technology: diff --git a/scripts/algorithms/arima.py b/scripts/algorithms/arima.py index 1c621c6..39334dc 100644 --- a/scripts/algorithms/arima.py +++ b/scripts/algorithms/arima.py @@ -1,27 +1,76 @@ +import warnings + +import numpy as np from numpy import clip, inf -from pyramid.arima import auto_arima +from sklearn.metrics import mean_squared_error +from statsmodels.tsa.arima_model import ARIMA class ARIMAForecast(object): - def __init__(self, data_in, num_prediction_periods): - if not all(isinstance(x, float) for x in data_in): - raise ValueError('Time series must be all float values') + def __evaluate_models(self, dataset, p_values, d_values, q_values): + dataset=np.array(dataset) + dataset = dataset.astype('float32') + best_score, best_cfg = float("inf"), None + for p in p_values: + for d in d_values: + for q in q_values: + order = (p, d, q) + try: + mse = self.__evaluate_arima_model(dataset, order, ground_truth_in_history=True) + if mse < best_score: + best_score = mse + best_cfg = order + except: + continue + return best_cfg, best_score + + def __evaluate_arima_model(self, X, arima_order, ground_truth_in_history=False): + + train_ratio = 0.8 + train_size = int(len(X) * train_ratio) + train, test = X[0:train_size], X[train_size:] + history = [x for x in train] + predictions = list() - self.__history = data_in - self.__num_prediction_periods = num_prediction_periods + for t in range(len(test)): + model = ARIMA(history, order=arima_order) + model_fit = model.fit(disp=0, maxiter=200) + yhat = model_fit.forecast()[0][0] + predictions.append(yhat) + history.append(test[t] if ground_truth_in_history else yhat) + error = mean_squared_error(test, predictions) + return error - self.__stepwise_model = auto_arima( - data_in, - seasonal=False, - error_action='ignore', suppress_warnings=True, stepwise=True - ) + def __arima_model_predict(self, X, arima_order, steps_ahead): + # make predictions + predictions = list() + try: + for t in range(steps_ahead): + model = ARIMA(X, order=arima_order) + model_fit = model.fit(disp=0) + yhat = model_fit.forecast()[0][0] + predictions.append(yhat) + X = np.append(X, yhat) + except: + predictions.extend([np.nan] * (steps_ahead - len(predictions))) + + return predictions + + def __init__(self, data_in, num_prediction_periods ): + if not all(isinstance(x, float) for x in data_in): + raise ValueError('Time series must be all float values') - self.__stepwise_model.fit(data_in) + p_values = [0, 1, 2, 4, 6] + d_values = range(0, 3) + q_values = range(0, 3) + warnings.filterwarnings("ignore") + self.__order, score = self.__evaluate_models(data_in, p_values, d_values, q_values) + self.__predictions = self.__arima_model_predict(data_in, self.__order, num_prediction_periods) @property def configuration(self): - return self.__stepwise_model.order + return self.__order def predict_counts(self): - return clip(self.__stepwise_model.predict(n_periods=self.__num_prediction_periods), 0, inf) + return clip(self.__predictions, 0, inf) diff --git a/scripts/data_factory.py b/scripts/data_factory.py index 9fb00c9..e70fb39 100644 --- a/scripts/data_factory.py +++ b/scripts/data_factory.py @@ -10,7 +10,7 @@ def get(doc_source_file_name): if not os.path.isfile(doc_source_file_name): raise PygramsException('file: ' + doc_source_file_name + ' does not exist in data folder') - if doc_source_file_name.endswith('.pkl.bz2'): + if doc_source_file_name.endswith('.pkl.bz2') or doc_source_file_name.endswith('.pkl'): return read_pickle(doc_source_file_name) elif doc_source_file_name.endswith('.xls'): return read_excel(doc_source_file_name) diff --git a/scripts/pipeline.py b/scripts/pipeline.py index 11e0a2b..495ad79 100644 --- a/scripts/pipeline.py +++ b/scripts/pipeline.py @@ -2,7 +2,8 @@ import pickle from os import makedirs, path -from pandas import read_pickle +from pandas import read_pickle, to_datetime +from pandas.api.types import is_string_dtype from tqdm import tqdm import scripts.data_factory as datafactory @@ -11,7 +12,7 @@ from scripts.documents_filter import DocumentsFilter from scripts.documents_weights import DocumentsWeights from scripts.filter_terms import FilterTerms -from scripts.text_processing import LemmaTokenizer +from scripts.text_processing import LemmaTokenizer, WordAnalyzer, lowercase_strip_accents_and_ownership from scripts.tfidf_mask import TfidfMask from scripts.tfidf_reduce import TfidfReduce from scripts.tfidf_wrapper import TFIDF @@ -21,14 +22,24 @@ from scripts.vandv.predictor import evaluate_prediction -def checkdf( df, emtec, docs_mask_dict, text_header): +def checkdf(df, emtec, docs_mask_dict, text_header, term_counts): app_exit = False - if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None: + if emtec or docs_mask_dict['time'] or docs_mask_dict['date'] is not None or term_counts: if docs_mask_dict['date_header'] not in df.columns: print(f"date_header '{docs_mask_dict['date_header']}' not in dataframe") app_exit = True + if docs_mask_dict['date_header'] is not None: + if is_string_dtype(df[docs_mask_dict['date_header']]): + df[docs_mask_dict['date_header']] = to_datetime(df[docs_mask_dict['date_header']]) + + min_date = min(df[docs_mask_dict['date_header']]) + max_date = max(df[docs_mask_dict['date_header']]) + print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}') + else: + print('Document dates not specified') + if text_header not in df.columns: print(f"text_header '{text_header}' not in dataframe") app_exit = True @@ -61,7 +72,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range if pickled_tf_idf_file_name is None: self.__dataframe = datafactory.get(data_filename) - checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header) + checkdf(self.__dataframe, emerging_technology, docs_mask_dict, text_header, term_counts) remove_empty_documents(self.__dataframe, text_header) self.__tfidf_obj = TFIDF(text_series=self.__dataframe[text_header], ngram_range=ngram_range, @@ -70,7 +81,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range self.__text_lengths = self.__dataframe[text_header].map(len).tolist() self.__dataframe.drop(columns=[text_header], inplace=True) - tfidf_filename = path.join('outputs', 'tfidf', output_name + '-tfidf.pkl.bz2') + tfidf_filename = path.join('outputs', 'tfidf', output_name + f'-tfidf-mdf-{max_df}.pkl.bz2') makedirs(path.dirname(tfidf_filename), exist_ok=True) with bz2.BZ2File(tfidf_filename, 'wb') as pickle_file: pickle.dump( @@ -81,6 +92,17 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range else: print(f'Reading document and TFIDF from pickle {pickled_tf_idf_file_name}') self.__tfidf_obj, self.__dataframe, self.__text_lengths = read_pickle(pickled_tf_idf_file_name) + if docs_mask_dict['date_header'] is None: + print('Document dates not specified') + else: + min_date = min(self.__dataframe[docs_mask_dict['date_header']]) + max_date = max(self.__dataframe[docs_mask_dict['date_header']]) + print(f'Document dates range from {min_date:%Y-%m-%d} to {max_date:%Y-%m-%d}') + + WordAnalyzer.init( + tokenizer=LemmaTokenizer(), + preprocess=lowercase_strip_accents_and_ownership, + ngram_range=ngram_range) # todo: pipeline is now a one-way trip of data, slowly collapsing / shrinking it as we don't need to keep # the original. We're really just filtering down. @@ -140,6 +162,7 @@ def __init__(self, data_filename, docs_mask_dict, pick_method='sum', ngram_range docs_mask_dict['date_header']) # if other outputs self.__term_score_tuples = self.__tfidf_reduce_obj.extract_ngrams_from_docset(pick_method) + self.__term_score_tuples = utils.stop_tup(self.__term_score_tuples, WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n) # todo: no output method; just if statements to call output functions...? # Only supply what they each directly require diff --git a/scripts/text_processing.py b/scripts/text_processing.py index 9183431..a5b76e3 100644 --- a/scripts/text_processing.py +++ b/scripts/text_processing.py @@ -31,6 +31,7 @@ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """ +import scripts.utils.utils as ut import string from nltk import word_tokenize, PorterStemmer, pos_tag @@ -86,6 +87,7 @@ class WordAnalyzer(object): stemmed_stop_word_set_n = None stemmed_stop_word_set_uni = None + @staticmethod def init(tokenizer, preprocess, ngram_range): WordAnalyzer.tokenizer = tokenizer @@ -110,39 +112,23 @@ def init(tokenizer, preprocess, ngram_range): def analyzer(doc): """based on VectorizerMixin._word_ngrams in sklearn/feature_extraction/text.py, from scikit-learn; extended to prevent generation of n-grams containing stop words""" - tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc)) - - # handle token n-grams min_n, max_n = WordAnalyzer.ngram_range - if max_n != 1: - original_tokens = tokens - if min_n == 1: - # no need to do any slicing for unigrams - # just iterate through the original tokens - tokens = [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni and not w.isdigit()] - # tokens = list(original_tokens) - min_n += 1 - else: - tokens = [] + original_tokens = WordAnalyzer.tokenizer(WordAnalyzer.preprocess(doc)) + tokens = original_tokens if min_n == 1 else [] + # handle token n-grams + if max_n > 1: + min_phrase = max(min_n, 2) n_original_tokens = len(original_tokens) # bind method outside of loop to reduce overhead tokens_append = tokens.append space_join = " ".join - for n in range(min_n, min(max_n + 1, n_original_tokens + 1)): + for n in range(min_phrase, min(max_n + 1, n_original_tokens + 1)): for i in range(n_original_tokens - n + 1): candidate_ngram = original_tokens[i: i + n] - hasdigit = False - for ngram in candidate_ngram: - if ngram.isdigit(): - hasdigit = True + tokens_append(space_join(candidate_ngram)) - ngram_stop_word_set = set(candidate_ngram) & WordAnalyzer.stemmed_stop_word_set_n - if len(ngram_stop_word_set) == 0 and not hasdigit: - tokens_append(space_join(candidate_ngram)) + return ut.stop(tokens,WordAnalyzer.stemmed_stop_word_set_uni, WordAnalyzer.stemmed_stop_word_set_n) - return tokens - else: - return [w for w in tokens if w not in WordAnalyzer.stemmed_stop_word_set_uni] diff --git a/scripts/tfidf_mask.py b/scripts/tfidf_mask.py index 8de5f0c..4451d5a 100644 --- a/scripts/tfidf_mask.py +++ b/scripts/tfidf_mask.py @@ -7,7 +7,7 @@ def __init__(self, tfidf_obj, ngram_range=(2, 3), uni_factor=0.8): self.__feature_names = tfidf_obj.feature_names self.__tfidf_mask = self.__tfidf_matrix.copy() self.__tfidf_mask.data = np.ones(len(self.__tfidf_matrix.data)) - self.__vectorizer = tfidf_obj.vectorizer + self.__vocabulary = tfidf_obj.vocabulary self.__uni_factor = uni_factor self.__idf = tfidf_obj.idf @@ -88,8 +88,8 @@ def __unbias_ngrams(self, max_ngram_length): ngram_minus_front = ' '.join(big_ngram_terms[1:]) ngram_minus_back = ' '.join(big_ngram_terms[:len(big_ngram_terms) - 1]) - idx_ngram_minus_front = self.__vectorizer.vocabulary_.get(ngram_minus_front) - idx_ngram_minus_back = self.__vectorizer.vocabulary_.get(ngram_minus_back) + idx_ngram_minus_front = self.__vocabulary.get(ngram_minus_front) + idx_ngram_minus_back = self.__vocabulary.get(ngram_minus_back) indices_slice = self.__tfidf_matrix.indices[start_idx_ptr:end_idx_ptr] ngram_counts = self.__tfidf_matrix.data[j] / self.__idf[col_idx] diff --git a/scripts/tfidf_wrapper.py b/scripts/tfidf_wrapper.py index acc63c9..2c93338 100644 --- a/scripts/tfidf_wrapper.py +++ b/scripts/tfidf_wrapper.py @@ -34,8 +34,8 @@ def tfidf_matrix(self): return self.__tfidf_matrix @property - def vectorizer(self): - return self.__vectorizer + def vocabulary(self): + return self.__vectorizer.vocabulary_ @property def feature_names(self): diff --git a/scripts/utils/argschecker.py b/scripts/utils/argschecker.py index 42fbcec..daf71e7 100644 --- a/scripts/utils/argschecker.py +++ b/scripts/utils/argschecker.py @@ -14,7 +14,8 @@ def __init__(self, args, args_default): def checkargs(self): app_exit = False - if path.isfile(path.join(self.args.path, self.args.doc_source)) is False: + doc_path = path.join(self.args.path, self.args.doc_source) + if path.isfile(doc_path) is False: print(f"File {self.args.doc_source} in path {self.args.path} not found") app_exit = True @@ -71,11 +72,6 @@ def checkargs(self): '[-o] "wordcloud"') app_exit = True - if self.args.num_ngrams_report != self.args_default.num_ngrams_report: - if 'report' not in self.args.output: - print('arguments [-np] can only be used when output includes report [-o] "report"') - app_exit = True - if self.args.num_ngrams_fdg != self.args_default.num_ngrams_fdg: if 'fdg' not in self.args.output: print('argument [-nf] can only be used when output includes fdg [-o] "fdg"') diff --git a/scripts/utils/reduce_existing_data_frame.py b/scripts/utils/reduce_existing_data_frame.py index a04b96d..157259b 100644 --- a/scripts/utils/reduce_existing_data_frame.py +++ b/scripts/utils/reduce_existing_data_frame.py @@ -92,7 +92,7 @@ def main(): subset_size=args.size, fraction=args.fraction, date_range=date_range, date_column_name=args.date_column_name) - print(f'After filtering: {data_frame.shape[0]} rows in data frame') + print(f'After filtering: {data_frame.shape[0]:,} rows in data frame') print(f'Writing sub-sampled data frame in pickle {pickle_file_name}...') data_frame.to_pickle(pickle_file_name) print(f'...written sub-sampled data frame in pickle {pickle_file_name}') diff --git a/scripts/utils/utils.py b/scripts/utils/utils.py index 079847e..9537bbc 100644 --- a/scripts/utils/utils.py +++ b/scripts/utils/utils.py @@ -129,3 +129,39 @@ def normalize(ydata): return np.asarray([(_y - miny) / diff for _y in ydata]) + +def stop(tokensin, unigrams, ngrams, digits=True): + new_tokens=[] + for token in tokensin: + ngram = token.split() + if len(ngram)==1: + if ngram[0] not in unigrams and not ngram[0].isdigit(): + new_tokens.append(token) + else: + word_in_ngrams=False + for word in ngram: + if word in ngrams or (digits and word.isdigit()): + word_in_ngrams=True + break + if not word_in_ngrams: + new_tokens.append(token) + return new_tokens + + +def stop_tup(tuples, unigrams, ngrams, digits=True): + new_tuples=[] + for tuple in tuples: + token = tuple[1] + ngram = token.split() + if len(ngram)==1: + if ngram[0] not in unigrams and not ngram[0].isdigit(): + new_tuples.append(tuple) + else: + word_in_ngrams=False + for word in ngram: + if word in ngrams or (digits and word.isdigit()): + word_in_ngrams=True + break + if not word_in_ngrams: + new_tuples.append(tuple) + return new_tuples \ No newline at end of file diff --git a/setup.py b/setup.py index de94875..73da624 100644 --- a/setup.py +++ b/setup.py @@ -53,8 +53,9 @@ def setup_package(): 'License :: MIT License', 'Programming Language :: Python :: 3.6', ], - install_requires=['matplotlib', 'numpy', 'scipy', 'wordcloud', 'pandas', 'tqdm', 'nltk', 'scikit-learn', 'xlrd', - 'python-Levenshtein', 'gensim', 'pyramid-arima>=0.9.0', 'keras', 'tensorflow', 'keras_tqdm', + + install_requires=['matplotlib', 'numpy', 'scipy>=1.2.1', 'wordcloud', 'pandas', 'tqdm', 'nltk', 'scikit-learn', + 'xlrd','python-Levenshtein', 'gensim', 'statsmodels', 'keras', 'tensorflow', 'keras_tqdm', 'patsy', 'humanfriendly', 'psutil', 'jinja2'], # extras_require={'dev': ['check-manifest'],'test': ['coverage'],}, python_requires='>=3.6', diff --git a/tests/algorithms/test_arima.py b/tests/algorithms/test_arima.py index 04769bf..5a5f4e9 100644 --- a/tests/algorithms/test_arima.py +++ b/tests/algorithms/test_arima.py @@ -9,6 +9,15 @@ from scripts.algorithms.arima import ARIMAForecast +import platform; print(platform.platform()) +import sys; print("Python", sys.version) +import os +import pandas as pd +import numpy as np; print("NumPy", np.__version__) +import scipy; print("SciPy", scipy.__version__) +import sklearn; print("Scikit-Learn", sklearn.__version__) +import statsmodels; print("Statsmodels", statsmodels.__version__) + class ArimaTests(unittest.TestCase): @@ -31,12 +40,44 @@ def test_static_sequence(self): np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4) - def test_linearly_increasing_sequence(self): - time_series = [8.9, 11.0, 13.0, 15.1, 17.0, 18.9, 21.0] - num_predicted_periods = 4 - expected_prediction = [23.0, 25.0, 27.0, 29.0] + def test_linear_sequence(self): + time_series = [1.0, 2.0, 3.0, 4.0, 5.0] + num_predicted_periods = 3 + expected_prediction = [6.0, 7.0, 8.0] + arima = ARIMAForecast(time_series, num_predicted_periods) + + actual_prediction = arima.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=4) + + def test_flakey_sequence(self): + time_series = [20.0, -20.0] + num_predicted_periods = 3 + expected_prediction = [np.nan] * 3 arima = ARIMAForecast(time_series, num_predicted_periods) actual_prediction = arima.predict_counts() np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=1) + + def test_linearly_increasing_sequence_fuel_cell(self): + time_series = pd.read_csv(os.path.join('tests','data', 'fuel_cell_quarterly.csv')).values.tolist() + time_series = [item for sublist in time_series for item in sublist] + num_predicted_periods = 4 + expected_prediction = [333., 333., 334., 335.] + arima = ARIMAForecast(np.array(time_series).astype(float), num_predicted_periods) + + actual_prediction = arima.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=0) + + def test_linearly_decreasing_sequence_image_data(self): + time_series = pd.read_csv(os.path.join('tests','data', 'image_data_quarterly.csv')).values.tolist() + time_series = [item for sublist in time_series for item in sublist] + num_predicted_periods = 4 + expected_prediction = [562., 561., 558., 556.] + arima = ARIMAForecast(np.array(time_series).astype(float), num_predicted_periods) + + actual_prediction = arima.predict_counts() + + np_test.assert_almost_equal(actual_prediction, expected_prediction, decimal=0) diff --git a/tests/data/fuel_cell_quarterly.csv b/tests/data/fuel_cell_quarterly.csv new file mode 100644 index 0000000..91f5489 --- /dev/null +++ b/tests/data/fuel_cell_quarterly.csv @@ -0,0 +1,52 @@ +323 +340 +296 +217 +265 +337 +326 +284 +276 +294 +252 +253 +264 +190 +262 +256 +264 +232 +211 +222 +235 +219 +273 +292 +330 +268 +260 +263 +277 +309 +282 +316 +348 +314 +314 +317 +350 +368 +375 +321 +413 +395 +368 +330 +407 +316 +349 +377 +320 +334 +340 +317 diff --git a/tests/data/image_data_quarterly.csv b/tests/data/image_data_quarterly.csv new file mode 100644 index 0000000..5e2d656 --- /dev/null +++ b/tests/data/image_data_quarterly.csv @@ -0,0 +1,52 @@ +190 +257 +186 +253 +275 +344 +296 +322 +273 +291 +253 +293 +285 +251 +349 +288 +330 +297 +341 +302 +349 +357 +427 +434 +409 +436 +430 +408 +474 +486 +517 +551 +575 +621 +618 +627 +560 +663 +630 +565 +661 +690 +685 +577 +623 +516 +639 +544 +538 +547 +564 +569 diff --git a/tests/test_filter_terms.py b/tests/test_filter_terms.py index 0cea76b..829f581 100644 --- a/tests/test_filter_terms.py +++ b/tests/test_filter_terms.py @@ -17,8 +17,26 @@ def setUp(self): def test_embeddings_filter_binary(self): user_queries = ['pharmacy', 'health', 'chemist'] - weights_vec_expected = [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0] + weights_vec_expected = [1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 1.0, + 1.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0] weights_vec_actual = FilterTerms(self.feature_names, user_queries, threshold=0.8).ngram_weights_vec[410:430] self.assertListEqual(weights_vec_expected, weights_vec_actual) @@ -28,7 +46,6 @@ def test_embeddings_filter_cosine_dist(self): user_queries = ['pharmacy', 'health', 'chemist'] weights_vec_actual = FilterTerms(self.feature_names, user_queries).ngram_weights_vec[410:430] weights_vec_expected = [0.5728331683597565, - 0.5728331683597565, 0.5728331683597565, 0.023525821108745026, 0.551300224350135, @@ -46,7 +63,8 @@ def test_embeddings_filter_cosine_dist(self): 0.47060086220739433, -0.10829696922978878, 0.19429777744446344, - 0.19429777744446344] + 0.19429777744446344, + 0.47456806019549364] assert_list_almost_equal(self, weights_vec_expected, weights_vec_actual) diff --git a/tests/test_pygrams.py b/tests/test_pygrams.py index 91e748a..91a3401 100644 --- a/tests/test_pygrams.py +++ b/tests/test_pygrams.py @@ -8,6 +8,7 @@ import pygrams from scripts import FilePaths +from scripts.text_processing import WordAnalyzer from scripts.utils.pygrams_exception import PygramsException @@ -55,8 +56,9 @@ def preparePyGrams(self, fake_df_data, mock_read_pickle, mock_open, mock_bz2file in range(self.number_of_rows)] if self.publication_date_auto_tested: - fake_df_data['publication_date'] = [pd.Timestamp('2000-12-28 00:00:00') - pd.DateOffset(weeks=row) for row - in range(self.number_of_rows)] + fake_df_data['publication_date'] = [ + f"{pd.Timestamp('2000-12-28 00:00:00') - pd.DateOffset(weeks=row):%Y-%m-%d}" for row + in range(self.number_of_rows)] if self.invention_title_auto_tested: fake_df_data['invention_title'] = [f'invention_title-{pid}' for pid in range(self.number_of_rows)] @@ -126,14 +128,14 @@ def isfile_fake(file_name): mock_path_isfile.side_effect = isfile_fake - def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs): + def assertTfidfOutputs(self, assert_func, mock_pickle_dump, mock_makedirs, max_df): self.assertTrue(self.publication_date_auto_tested) self.assertTrue(self.patent_id_auto_tested) mock_makedirs.assert_called_with(self.tfidfOutputFolder(), exist_ok=True) results_checked = False for dump_args in mock_pickle_dump.call_args_list: - if dump_args[0][1] == self.tfidfFileName(self.out_name): + if dump_args[0][1] == self.tfidfFileName(self.out_name, max_df): tfidf_pickle = dump_args[0][0] tfidf_obj = tfidf_pickle[0] @@ -168,8 +170,8 @@ def tfidfOutputFolder(): return os.path.join('outputs', 'tfidf') @staticmethod - def tfidfFileName(data_source_name): - return os.path.join(TestPyGrams.tfidfOutputFolder(), data_source_name + '-tfidf.pkl.bz2') + def tfidfFileName(data_source_name, max_df): + return os.path.join(TestPyGrams.tfidfOutputFolder(), data_source_name + f'-tfidf-mdf-{max_df}.pkl.bz2') @staticmethod def termCountsOutputFolder(): @@ -192,9 +194,9 @@ def test_simple_output_tfidf(self, mock_path_isfile, mock_makedirs, mock_bz2file 'abstract' ] } - + max_df = 1.0 self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile) - args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', '1.0'] + args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', str(max_df)] pygrams.main(args) @@ -202,7 +204,7 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names): self.assertEqual(tfidf_matrix.todense(), np.ones(shape=(1, 1)), 'TFIDF should be 1x1 matrix of 1') self.assertListEqual(feature_names, ['abstract']) - self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs) + self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df) @mock.patch("scripts.pipeline.read_pickle", create=True) @mock.patch("scripts.data_factory.read_pickle", create=True) @@ -227,6 +229,13 @@ def test_simple_output_tfidf_pickle_and_unpickle(self, mock_path_isfile, mock_ou args = ['-ds', self.data_source_name, '--date_header', 'publication_date', '--max_document_frequency', '1.0'] pygrams.main(args) + # reset static object + WordAnalyzer.tokenizer = None + WordAnalyzer.preprocess = None + WordAnalyzer.ngram_range = None + WordAnalyzer.stemmed_stop_word_set_n = None + WordAnalyzer.stemmed_stop_word_set_uni = None + # Fail if original data frame is requested from disc def factory_read_pickle_fake(pickle_file_name): self.fail(f'Should not be reading {pickle_file_name} via a factory if TFIDF was requested from pickle') @@ -271,10 +280,10 @@ def test_simple_two_patents_unigrams_only_output_tfidf(self, mock_path_isfile, m 'abstract two' ] } - + max_df=1.0 self.preparePyGrams(fake_df_data, mock_read_pickle, mock_open, mock_bz2file, mock_path_isfile) args = ['-ds', self.data_source_name, '--date_header', - 'publication_date', '--max_document_frequency', '1.0', '--max_ngrams', '1'] + 'publication_date', '--max_document_frequency', str(max_df), '--max_ngrams', '1'] pygrams.main(args) @@ -301,7 +310,7 @@ def assert_tfidf_outputs(tfidf_matrix, feature_names): self.assertListAlmostEqual(tfidf_as_lists[0], [l2norm_tfidf_abstract, l2norm_tfidf_one, 0], places=4) self.assertListAlmostEqual(tfidf_as_lists[1], [l2norm_tfidf_abstract, 0, l2norm_tfidf_one], places=4) - self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs) + self.assertTfidfOutputs(assert_tfidf_outputs, mock_pickle_dump, mock_makedirs, max_df) @mock.patch("scripts.data_factory.read_pickle", create=True) @mock.patch("pickle.dump", create=True) diff --git a/tests/test_terms_graph.py b/tests/test_terms_graph.py index 73b4f65..175135f 100644 --- a/tests/test_terms_graph.py +++ b/tests/test_terms_graph.py @@ -55,25 +55,25 @@ def test_num_nodes(self): self.assertEquals(50, len(self.__nodes)) def test_num_links(self): - self.assertEquals(447, len(self.__links)) + self.assertEquals(454, len(self.__links)) def test_terms_in_nodes(self): texts = [x['text'] for x in self.__nodes] self.assertIn('central portion', texts) self.assertIn('fluid commun', texts) - self.assertIn('provid seed', texts) + self.assertIn('phenyl ring', texts) self.assertIn('gate line', texts) idx_1 = texts.index("central portion") idx_2 = texts.index("fluid commun") - idx_3 = texts.index("provid seed") + idx_3 = texts.index("phenyl ring") idx_4 = texts.index("gate line") - self.assertAlmostEqual(0.05478826302293826, self.__nodes[idx_1]['freq']) - self.assertAlmostEqual(0.022815124444693337, self.__nodes[idx_2]['freq']) - self.assertAlmostEqual(0.01193531394736373, self.__nodes[idx_3]['freq']) - self.assertAlmostEqual(0.07963623423011947, self.__nodes[idx_4]['freq']) + self.assertAlmostEqual(0.024110680522099224, self.__nodes[idx_1]['freq']) + self.assertAlmostEqual(0.004707609539032177, self.__nodes[idx_2]['freq']) + self.assertAlmostEqual(0.09743319564023586, self.__nodes[idx_3]['freq']) + self.assertAlmostEqual(0.07346334072037178, self.__nodes[idx_4]['freq']) def test_terms_in_links(self): diff --git a/tests/test_text_processing.py b/tests/test_text_processing.py index 140429a..21c4936 100644 --- a/tests/test_text_processing.py +++ b/tests/test_text_processing.py @@ -1,12 +1,9 @@ import unittest -import pandas as pd from nltk import word_tokenize -from scripts import FilePaths -from scripts.text_processing import StemTokenizer, WordAnalyzer, \ - lowercase_strip_accents_and_ownership - +from scripts.text_processing import StemTokenizer, WordAnalyzer, \ + lowercase_strip_accents_and_ownership, LemmaTokenizer # Sample abstracts taken from the USPTO Bulk Download Service: https://bulkdata.uspto.gov # Data used was downloaded from "Patent Grant Full Text Data" @@ -23,8 +20,7 @@ def test_stematizer(self): self.assertListEqual(expected_words, actual_words) - -class Test_lowercase_strip_accents_and_ownership(unittest.TestCase): +class TestLowercaseStripAccentsAndOwnership(unittest.TestCase): def test_lowercase(self): doc = 'Test ABCdefGH IJ. Again' @@ -92,3 +88,129 @@ def test_WordAnalyser_ngrams_dont_cross_punctuation_or_stop_words(self): 'metal fish bucket'] actual_ngrams = WordAnalyzer.analyzer(doc) self.assertListEqual(expected_ngrams, actual_ngrams) + + def test_WordAnalyser_ngrams(self): + ngram_range = (1, 3) + WordAnalyzer.init(tokenizer=LemmaTokenizer(), preprocess=self.preprocess, ngram_range=ngram_range) + + doc = "Conductive structures in features of an insulator layer on a substrate are fabricated by a particular " \ + "process. In this process, a layer of conductive material is applied over the insulator layer so that " \ + "the layer of conductive material covers field regions adjacent the features and fills in the features " \ + "themselves. A grain size differential between the conductive material which covers the field regions " \ + "and the conductive material which fills in the feature is then established by annealing the layer of " \ + "conductive material. Excess conductive material is then removed to uncover the field regions and leave " \ + "the conductive structures. The layer of conductive material is applied so as to define a first layer " \ + "thickness over the field regions and a second layer thickness in and over the features. These " \ + "thicknesses are dimensioned such that d 1 ≦0.5d 2 , with d 1 being the first layer thickness and d 2 " \ + "being the second layer thickness. Preferably, the first and second layer thicknesses are dimensioned " \ + "such that d 1 ≦0.3d 2 . " + expected_ngrams = ['conductive', + 'structure', + 'feature', + 'insulator', + 'layer', + 'substrate', + 'fabricate', + 'particular', + 'process', + 'process', + 'layer', + 'conductive', + 'material', + 'apply', + 'insulator', + 'layer', + 'layer', + 'conductive', + 'material', + 'field', + 'region', + 'feature', + 'fill', + 'feature', + 'themselves', + 'grain', + 'differential', + 'conductive', + 'material', + 'field', + 'region', + 'conductive', + 'material', + 'fill', + 'feature', + 'establish', + 'anneal', + 'layer', + 'conductive', + 'material', + 'conductive', + 'material', + 'remove', + 'uncover', + 'field', + 'region', + 'leave', + 'conductive', + 'structure', + 'layer', + 'conductive', + 'material', + 'apply', + 'define', + 'first', + 'layer', + 'thickness', + 'field', + 'region', + 'second', + 'layer', + 'thickness', + 'feature', + 'thickness', + 'dimension', + '0.5d', + 'first', + 'layer', + 'thickness', + 'second', + 'layer', + 'thickness', + 'preferably', + 'first', + 'second', + 'layer', + 'thickness', + 'dimension', + '0.3d', + 'conductive structure', + 'insulator layer', + 'particular process', + 'conductive material', + 'insulator layer', + 'conductive material', + 'material cover', + 'cover field', + 'field region', + 'region adjacent', + 'feature themselves', + 'conductive material', + 'field region', + 'conductive material', + 'conductive material', + 'conductive material', + 'field region', + 'conductive structure', + 'conductive material', + 'layer thickness', + 'field region', + 'layer thickness', + 'layer thickness', + 'layer thickness', + 'layer thickness', + 'conductive material cover', + 'material cover field', + 'cover field region', + 'field region adjacent'] + actual_ngrams = WordAnalyzer.analyzer(doc) + self.assertListEqual(expected_ngrams, actual_ngrams) diff --git a/tests/test_tfidf_mask.py b/tests/test_tfidf_mask.py index 0c6681b..6b0e97b 100644 --- a/tests/test_tfidf_mask.py +++ b/tests/test_tfidf_mask.py @@ -64,7 +64,7 @@ def init_mask(self, cpc, min_n, uni_factor=0.8): def test_num_non_zeros_no_clean_rows(self): self.init_mask('Y02', 2) - self.assertEqual(2059, len(self.__tfidf_mask.data)) + self.assertEqual(2024, len(self.__tfidf_mask.data)) def test_terms(self): self.init_mask('Y02', 2) @@ -119,14 +119,14 @@ def test_no_negative_weights(self): def test_non_zeros_clean_rows(self): self.init_mask('Y02', 2) tfidf_mask_nozero_rows = utils.remove_all_null_rows(self.__tfidf_mask) - vectorizer = self.__tfidf_obj.vectorizer + vocabulary = self.__tfidf_obj.vocabulary expected_term1_val = 0.25 expected_term2_val = 0.2962962962962961 term1 = 'exhaust ga' # 0.25 term2 = 'drive region' # 0.2962962962962961 - idx_term1 = vectorizer.vocabulary_.get(term1) - idx_term2 = vectorizer.vocabulary_.get(term2) + idx_term1 = vocabulary.get(term1) + idx_term2 = vocabulary.get(term2) indexof_idx_term1 = tfidf_mask_nozero_rows.indices.tolist().index(idx_term1) indexof_idx_term2 = tfidf_mask_nozero_rows.indices.tolist().index(idx_term2) diff --git a/tests/test_tfidf_reduce.py b/tests/test_tfidf_reduce.py index c7ee153..6d2c1da 100644 --- a/tests/test_tfidf_reduce.py +++ b/tests/test_tfidf_reduce.py @@ -48,7 +48,8 @@ def setUpClass(cls): def test_terms(self): term_score_tuples = self.__term_score_tuples actual_terms = [x for _, x in term_score_tuples] - expected_terms = ['transmit path', + expected_terms = ['mount surfac', + 'transmit path', 'electron element', 'link document', 'amid deriv', @@ -60,14 +61,13 @@ def test_terms(self): 'contact beam', 'angular veloc', 'shorter tuft', + 'conduct materi', 'endodont instrument', 'mass offset', 'section bend', - 'termin channel', - 'stationari household applianc', - 'fault point', - 'adhes strip', - 'handheld electron devic' + 'compon materi', + 'connect portion', + 'termin channel' ] self.assertListEqual(actual_terms[:20], expected_terms) @@ -75,7 +75,8 @@ def test_terms(self): def test_scores(self): term_score_tuples = self.__term_score_tuples actual_scores = [x for x, _ in term_score_tuples] - expected_scores = [0.8259734063804905, + expected_scores = [0.9449111825230679, + 0.8259734063804905, 0.7754588414852185, 0.7276068751089988, 0.7071067811865476, @@ -83,17 +84,16 @@ def test_scores(self): 0.7071067811865475, 0.6666666666666666, 0.6396021490668312, - 0.6172133998483675, + 0.6246950475544241, 0.6031800939323297, 0.6000595413031171, 0.5834599659915781, + 0.5806718350868961, 0.5773502691896257, 0.5773502691896257, 0.5773502691896257, - 0.5597177778726654, - 0.5570860145311556, - 0.5568900989230109, - 0.547722557505166, - 0.5265695940793358] + 0.5669467095138407, + 0.5611088299627696, + 0.5597177778726654] support.assert_list_almost_equal(self, actual_scores[:20], expected_scores)