Feature/clean refactoring (#29)
Refactoring:
* remove the ontology subfolder, add the seaborn dependency, fix flake8 warnings
* skip LLM tests when no API key is available (see the sketch just after the change summary below)
* add Python 3.10 and 3.11 to the GitHub CI matrix
* temporarily remove support for Python 3.12
* update poetry.lock
---------

Co-authored-by: fonhorst <[email protected]@gmail.com>
fonhorst and fonhorst authored Jun 7, 2024
1 parent ef12e02 commit bd1eae6
Showing 15 changed files with 93 additions and 310 deletions.
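
As a quick illustration of the test-skipping bullet in the commit message, here is a minimal sketch of the pattern, assuming pytest as the runner and an environment variable named OPENAI_API_KEY for the key (both are assumptions; the commit does not show the affected test files):

    import os

    import pytest

    # Skip every test in this module when no LLM API key is configured,
    # so CI runs without credentials still pass (hypothetical variable name).
    pytestmark = pytest.mark.skipif(
        "OPENAI_API_KEY" not in os.environ,
        reason="LLM API key is not available",
    )


    def test_llm_smoke():
        # Placeholder body; real tests would exercise the LLM-backed code here.
        assert True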
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9"]
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
7 changes: 6 additions & 1 deletion autotm/abstract_params.py
@@ -19,7 +19,12 @@ def make_params_dict(self):
...

@abstractmethod
def run_train(self, model: "TopicModel"):
def run_train(self, model):
"""
Trains the topic model
:param model: an instance of TopicModel
:return:
"""
...

@abstractmethod
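
The hunk above drops the quoted "TopicModel" annotation from run_train in favor of a docstring, which avoids needing the class at import time. For reference, a hedged sketch of how the annotation could be kept without a runtime import (an alternative pattern, not what this commit does; the class and import path are guesses):

    from abc import ABC, abstractmethod
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Imported only for type checkers; never executed at runtime.
        from autotm.fitness.tm import TopicModel  # hypothetical location


    class AbstractParams(ABC):  # hypothetical class name
        @abstractmethod
        def run_train(self, model: "TopicModel"):
            """Trains the topic model."""
            ...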
16 changes: 7 additions & 9 deletions autotm/clustering.py
@@ -1,20 +1,18 @@
from matplotlib import offsetbox
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns
import copy
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import warnings
import copy

from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')


def cluster_phi(phi_df: pd.dataFrame, n_clusters=10, plot_img=True):
def cluster_phi(phi_df: pd.DataFrame, n_clusters=10, plot_img=True):
_phi_df = copy.deepcopy(phi_df)
y = _phi_df.index.values
x = _phi_df.values
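
Besides the import reshuffle, this hunk fixes pd.dataFrame to pd.DataFrame in cluster_phi's signature. A hedged usage sketch, assuming phi is a token-by-topic probability matrix with tokens as the index (the orientation is an assumption based on the y/x assignments above):

    import numpy as np
    import pandas as pd

    from autotm.clustering import cluster_phi  # module path from the diff

    rng = np.random.default_rng(0)
    phi = pd.DataFrame(
        rng.dirichlet(np.ones(5), size=100),       # 100 tokens x 5 topics
        index=[f"token_{i}" for i in range(100)],  # labels come from the index
        columns=[f"topic_{j}" for j in range(5)],
    )
    cluster_phi(phi, n_clusters=3, plot_img=False)  # plot_img=False skips plotting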
5 changes: 2 additions & 3 deletions autotm/content_splitter.py
@@ -1,6 +1,5 @@
import re
from abc import ABC, abstractmethod
from autotm.preprocessing.text_preprocessing import process_dataset
from abc import ABC


class BaseTextSplitter(ABC):

2 changes: 1 addition & 1 deletion autotm/main_fitness_worker.py
@@ -7,7 +7,7 @@ def main():
os.environ['AUTOTM_COMPONENT'] = 'worker'
os.environ['AUTOTM_EXEC_MODE'] = 'cluster'

from autotm.fitness.tasks import make_celery_app
from autotm.fitness.cluster_tasks import make_celery_app
from autotm.fitness.tm import TopicModelFactory

if "DATASETS_CONFIG" in os.environ:
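
The main() excerpt sets AUTOTM_COMPONENT and AUTOTM_EXEC_MODE before importing make_celery_app, which suggests the fitness modules read these variables at import time. A minimal sketch of the same ordering (the make_celery_app call signature is an assumption):

    import os

    # Configure the worker before importing, since the fitness modules
    # appear to consult these variables at import time (assumption).
    os.environ['AUTOTM_COMPONENT'] = 'worker'
    os.environ['AUTOTM_EXEC_MODE'] = 'cluster'

    from autotm.fitness.cluster_tasks import make_celery_app  # new path from the diff

    celery_app = make_celery_app()  # hypothetical zero-argument call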
Empty file removed autotm/ontology/__init__.py
189 changes: 0 additions & 189 deletions autotm/ontology/ontology_extractor.py

This file was deleted.

Empty file removed autotm/ontology/visualization.py
2 changes: 0 additions & 2 deletions autotm/params.py
@@ -1,5 +1,4 @@
import copy
import logging
import random
from typing import List

@@ -11,7 +10,6 @@
from autotm.graph_ga import create_pipeline, crossover_pipelines, mutate_pipeline
from autotm.pipeline import Pipeline, Stage, StageType, Param, create_stage, IntRangeDistribution, \
FloatRangeDistribution
from autotm.utils import do_suppress_stdout

PARAM_NAMES = [
"val_decor",
5 changes: 3 additions & 2 deletions autotm/preprocessing/text_preprocessing.py
@@ -22,13 +22,14 @@
logger = logging.getLogger(__name__)

# TODO: make transformer class and prep function to download all files
nltk_components = ['corpora/stopwords.zip', 'corpora/wordnet.zip']
nltk_components = ['corpora/stopwords', 'corpora/wordnet.zip']

for nltk_component in nltk_components:
try:
nltk.data.find(nltk_component)
except LookupError:
nltk.download(nltk_component.split('/')[-1])
nltk_component_name = os.path.splitext(os.path.basename(nltk_component))[0]
nltk.download(nltk_component_name)

stop = stopwords.words("russian") + [" "] + stopwords.words("english")

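
The fix matters because nltk.download expects package names like 'stopwords', while the old nltk_component.split('/')[-1] would hand it 'wordnet.zip' verbatim. A small sketch showing what the new extraction produces:

    import os

    for component in ['corpora/stopwords', 'corpora/wordnet.zip']:
        name = os.path.splitext(os.path.basename(component))[0]
        print(component, '->', name)
    # corpora/stopwords -> stopwords
    # corpora/wordnet.zip -> wordnet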
2 changes: 2 additions & 0 deletions autotm/utils.py
@@ -102,6 +102,8 @@ def parallelize_dataframe(df: pd.DataFrame, func, n_cores, return_type="df", **k
elif isinstance(map_res[0], tuple):
zipped_elems = list(zip(*map_res))
res = (merge_dicts(zipped_elems[0]), merge_dicts(zipped_elems[1]))
else:
raise ValueError(f"Unsupported return_type: {return_type}")
pool.close()
pool.join()
return res
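
Without the new else branch, an unmatched result type would leave res unbound and surface later as a confusing UnboundLocalError at the return; raising ValueError fails fast and names the offending argument. A generic sketch of the same guard pattern (not the library code itself):

    import pandas as pd

    def combine_results(map_res, return_type="df"):
        # Mirror of the branch structure above: every path must bind `res`
        # or raise, so the function can never fall through silently.
        if isinstance(map_res[0], pd.DataFrame):
            res = pd.concat(map_res)
        elif isinstance(map_res[0], tuple):
            res = tuple(zip(*map_res))
        else:
            raise ValueError(f"Unsupported return_type: {return_type}")
        return res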