Feature/clean refactoring (#29)
Refactoring:
* remove the ontology subfolder, add the seaborn dependency, fix flake8 warnings
* skip LLM tests when no API key is available (see the sketch just after the change summary below)
* add Python 3.10 and 3.11 to the GitHub CI matrix
* temporarily remove support for Python 3.12
* update poetry.lock
---------

Co-authored-by: fonhorst <[email protected]@gmail.com>
fonhorst and fonhorst authored Jun 7, 2024
1 parent ef12e02 commit bd1eae6
Showing 15 changed files with 93 additions and 310 deletions.
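
As a quick illustration of the test-skipping bullet in the commit message, here is a minimal sketch of the pattern, assuming pytest as the runner and an environment variable named OPENAI_API_KEY for the key (both are assumptions; the commit does not show the affected test files):

    import os

    import pytest

    # Skip every test in this module when no LLM API key is configured,
    # so CI runs without credentials still pass (hypothetical variable name).
    pytestmark = pytest.mark.skipif(
        "OPENAI_API_KEY" not in os.environ,
        reason="LLM API key is not available",
    )


    def test_llm_smoke():
        # Placeholder body; real tests would exercise the LLM-backed code here.
        assert True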
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python-version: ["3.8", "3.9"]
python-version: ["3.8", "3.9", "3.10", "3.11"]

steps:
- run: echo "🎉 The job was automatically triggered by a ${{ github.event_name }} event."
7 changes: 6 additions & 1 deletion autotm/abstract_params.py
@@ -19,7 +19,12 @@ def make_params_dict(self):
...

@abstractmethod
def run_train(self, model: "TopicModel"):
def run_train(self, model):
"""
Trains the topic model
:param model: an instance of TopicModel
:return:
"""
...

@abstractmethod
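
The hunk above drops the quoted "TopicModel" annotation from run_train in favor of a docstring, which avoids needing the class at import time. For reference, a hedged sketch of how the annotation could be kept without a runtime import (an alternative pattern, not what this commit does; the class and import path are guesses):

    from abc import ABC, abstractmethod
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Imported only for type checkers; never executed at runtime.
        from autotm.fitness.tm import TopicModel  # hypothetical location


    class AbstractParams(ABC):  # hypothetical class name
        @abstractmethod
        def run_train(self, model: "TopicModel"):
            """Trains the topic model."""
            ...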
16 changes: 7 additions & 9 deletions autotm/clustering.py
@@ -1,20 +1,18 @@
from matplotlib import offsetbox
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
import seaborn as sns
import copy
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import warnings
import copy

from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')


def cluster_phi(phi_df: pd.dataFrame, n_clusters=10, plot_img=True):
def cluster_phi(phi_df: pd.DataFrame, n_clusters=10, plot_img=True):
_phi_df = copy.deepcopy(phi_df)
y = _phi_df.index.values
x = _phi_df.values
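
Besides the import reshuffle, this hunk fixes pd.dataFrame to pd.DataFrame in cluster_phi's signature. A hedged usage sketch, assuming phi is a token-by-topic probability matrix with tokens as the index (the orientation is an assumption based on the y/x assignments above):

    import numpy as np
    import pandas as pd

    from autotm.clustering import cluster_phi  # module path from the diff

    rng = np.random.default_rng(0)
    phi = pd.DataFrame(
        rng.dirichlet(np.ones(5), size=100),       # 100 tokens x 5 topics
        index=[f"token_{i}" for i in range(100)],  # labels come from the index
        columns=[f"topic_{j}" for j in range(5)],
    )
    cluster_phi(phi, n_clusters=3, plot_img=False)  # plot_img=False skips plotting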
5 changes: 2 additions & 3 deletions autotm/content_splitter.py
@@ -1,6 +1,5 @@
import re
from abc import ABC, abstractmethod
from autotm.preprocessing.text_preprocessing import process_dataset
from abc import ABC


class BaseTextSplitter(ABC):

2 changes: 1 addition & 1 deletion autotm/main_fitness_worker.py
@@ -7,7 +7,7 @@ def main():
os.environ['AUTOTM_COMPONENT'] = 'worker'
os.environ['AUTOTM_EXEC_MODE'] = 'cluster'

from autotm.fitness.tasks import make_celery_app
from autotm.fitness.cluster_tasks import make_celery_app
from autotm.fitness.tm import TopicModelFactory

if "DATASETS_CONFIG" in os.environ:
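
The main() excerpt sets AUTOTM_COMPONENT and AUTOTM_EXEC_MODE before importing make_celery_app, which suggests the fitness modules read these variables at import time. A minimal sketch of the same ordering (the make_celery_app call signature is an assumption):

    import os

    # Configure the worker before importing, since the fitness modules
    # appear to consult these variables at import time (assumption).
    os.environ['AUTOTM_COMPONENT'] = 'worker'
    os.environ['AUTOTM_EXEC_MODE'] = 'cluster'

    from autotm.fitness.cluster_tasks import make_celery_app  # new path from the diff

    celery_app = make_celery_app()  # hypothetical zero-argument call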
Empty file removed autotm/ontology/__init__.py
189 changes: 0 additions & 189 deletions autotm/ontology/ontology_extractor.py

This file was deleted.

Empty file removed autotm/ontology/visualization.py
2 changes: 0 additions & 2 deletions autotm/params.py
@@ -1,5 +1,4 @@
import copy
import logging
import random
from typing import List

@@ -11,7 +10,6 @@
from autotm.graph_ga import create_pipeline, crossover_pipelines, mutate_pipeline
from autotm.pipeline import Pipeline, Stage, StageType, Param, create_stage, IntRangeDistribution, \
FloatRangeDistribution
from autotm.utils import do_suppress_stdout

PARAM_NAMES = [
"val_decor",
5 changes: 3 additions & 2 deletions autotm/preprocessing/text_preprocessing.py
@@ -22,13 +22,14 @@
logger = logging.getLogger(__name__)

# TODO: make transformer class and prep function to download all files
nltk_components = ['corpora/stopwords.zip', 'corpora/wordnet.zip']
nltk_components = ['corpora/stopwords', 'corpora/wordnet.zip']

for nltk_component in nltk_components:
try:
nltk.data.find(nltk_component)
except LookupError:
nltk.download(nltk_component.split('/')[-1])
nltk_component_name = os.path.splitext(os.path.basename(nltk_component))[0]
nltk.download(nltk_component_name)

stop = stopwords.words("russian") + [" "] + stopwords.words("english")

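
The fix matters because nltk.download expects package names like 'stopwords', while the old nltk_component.split('/')[-1] would hand it 'wordnet.zip' verbatim. A small sketch showing what the new extraction produces:

    import os

    for component in ['corpora/stopwords', 'corpora/wordnet.zip']:
        name = os.path.splitext(os.path.basename(component))[0]
        print(component, '->', name)
    # corpora/stopwords -> stopwords
    # corpora/wordnet.zip -> wordnet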
2 changes: 2 additions & 0 deletions autotm/utils.py
@@ -102,6 +102,8 @@ def parallelize_dataframe(df: pd.DataFrame, func, n_cores, return_type="df", **k
elif isinstance(map_res[0], tuple):
zipped_elems = list(zip(*map_res))
res = (merge_dicts(zipped_elems[0]), merge_dicts(zipped_elems[1]))
else:
raise ValueError(f"Unsupported return_type: {return_type}")
pool.close()
pool.join()
return res
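
Without the new else branch, an unmatched result type would leave res unbound and surface later as a confusing UnboundLocalError at the return; raising ValueError fails fast and names the offending argument. A generic sketch of the same guard pattern (not the library code itself):

    import pandas as pd

    def combine_results(map_res, return_type="df"):
        # Mirror of the branch structure above: every path must bind `res`
        # or raise, so the function can never fall through silently.
        if isinstance(map_res[0], pd.DataFrame):
            res = pd.concat(map_res)
        elif isinstance(map_res[0], tuple):
            res = tuple(zip(*map_res))
        else:
            raise ValueError(f"Unsupported return_type: {return_type}")
        return res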