diff --git a/.gitignore b/.gitignore index 06ee63a..d4b292d 100644 --- a/.gitignore +++ b/.gitignore @@ -155,9 +155,5 @@ crashlytics.properties crashlytics-build.properties fabric.properties -# .idea and models +# .idea .idea/ -classifiers/*.model -classifiers/*.model.* -classifiers/*.pkl -data/*.csv diff --git a/.travis.yml b/.travis.yml index 9d1bb80..6ab1779 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,4 +5,4 @@ - sudo rm -f /etc/boto.cfg - pip install -r requirements.txt script: - - python3 text_classifier.py Test dataset.csv d2v.model joblib_model.pkl + - python3 text_classifier.py diff --git a/models/classifier_model.py b/models/classifier_model.py index f057e4d..29c14a0 100644 --- a/models/classifier_model.py +++ b/models/classifier_model.py @@ -1,11 +1,14 @@ +from .model import Model +from .doc2vec_model import doc2VecModel + import logging -import numpy as np import os import inspect + +import numpy as np from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, f1_score -from .model import Model -from .doc2vec_model import doc2VecModel + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) base_file_path = inspect.getframeinfo(inspect.currentframe()).filename @@ -30,12 +33,6 @@ def train_model(self, d2v, training_vectors, training_labels): logging.info( 'Training F1 score: {}'.format(f1_score(training_labels, training_predictions, average='weighted'))) - def save_model(self, filename): - logging.info("Saving trained classification model") - - def load_model(self, filename): - logging.info("Loading trained classification model") - def test_model(self, d2v, testing_vectors, testing_labels): logging.info("Classifier testing") test_vectors = doc2VecModel.get_vectors(d2v, len(testing_vectors), 300, 'Test') diff --git a/models/doc2vec_model.py b/models/doc2vec_model.py index 8fb6f85..abdd2e7 100644 --- a/models/doc2vec_model.py +++ b/models/doc2vec_model.py @@ -1,12 +1,14 @@ +from .model import Model + import logging import random import os import inspect + import numpy as np from gensim.models import doc2vec -from gensim.models.doc2vec import Doc2Vec -from .model import Model + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) base_file_path = inspect.getframeinfo(inspect.currentframe()).filename @@ -47,20 +49,6 @@ def train_model(self): # fix the learning rate, no decay self.model.min_alpha = self.model.alpha - def save_model(self, filename): - logging.info("Saving trained Doc2Vec model") - filename = os.path.join(classifiers_path, filename) - self.model.save(filename) - - def load_model(self, filename): - logging.info("Loading trained Doc2Vec model") - filename = os.path.join(classifiers_path, filename) - if (os.path.isfile(filename)): - d2v = Doc2Vec.load(filename) - self.model = d2v - else: - self.model = None - def get_vectors(self, corpus_size, vectors_size, vectors_type): """ Get vectors from trained doc2vec model diff --git a/models/model.py b/models/model.py index 1a44f26..4251f5b 100644 --- a/models/model.py +++ b/models/model.py @@ -15,12 +15,5 @@ def initialize_model(self): def train_model(self): pass - @abstractmethod - def save_model(self): - pass - - @abstractmethod - def load_model(self): - pass diff --git a/text_classifier.py b/text_classifier.py index a6d9e9d..158b6ad 100644 --- a/text_classifier.py +++ b/text_classifier.py @@ -1,17 +1,19 @@ -import pandas as pd -import logging -import sys, getopt -import os, inspect -import numpy as np -from sklearn.model_selection import train_test_split from models.doc2vec_model import doc2VecModel from models.classifier_model import classifierModel +import os +import logging +import inspect + +import pandas as pd +from sklearn.model_selection import train_test_split + + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) base_file_path = inspect.getframeinfo(inspect.currentframe()).filename project_dir_path = os.path.dirname(os.path.abspath(base_file_path)) data_path = os.path.join(project_dir_path, 'data') -default_classifier = os.path.join(project_dir_path, 'classifiers','joblib_model.pkl') +default_classifier = os.path.join(project_dir_path, 'classifiers','logreg_model.pkl') default_doc2vec= os.path.join(project_dir_path, 'classifiers','d2v.model') default_dataset= os.path.join(data_path, 'dataset.csv') @@ -49,24 +51,17 @@ def train_classifier(self): return self.d2v, self.classifier def test_classifier(self): - x_train, x_test, y_train, y_test, all_data = self.prepare_all_data() + _, x_test, _, y_test, _ = self.prepare_all_data() if (self.d2v.model is None or self.classifier.model is None): logging.info("No Trained Models Found, Train First or Use Correct Model Names") else: self.classifier.test_model(self.d2v, x_test, y_test) -def main(argv): - if(len(argv)==1): - dataset_file = argv[0] - - tc = TextClassifier() - tc.read_data(dataset_file) - tc.test_classifier() - tc.train_classifier() - - else: - print('Please use the following Commands to use text_classifier for training/testing/predicting:') - print ('To Run: python text_classifier.py ') +def run(dataset_file): + tc = TextClassifier() + tc.read_data(dataset_file) + tc.test_classifier() + tc.train_classifier() if __name__ == "__main__": - main(sys.argv[1:]) + run("dataset.csv")