From 31c0779e37f0af7d74e236e87a375dfda18a32e8 Mon Sep 17 00:00:00 2001 From: ngc436 Date: Sun, 26 Nov 2023 22:18:13 +0300 Subject: [PATCH] Fix fname --- README.md | 2 -- .../genetic_algorithm/mutation.py | 35 +++++++++++++++++-- autotm/fitness/tm.py | 2 +- .../preprocessing/dictionaries_preparation.py | 4 +-- autotm/preprocessing/text_preprocessing.py | 2 +- 5 files changed, 37 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 1af8363..75b7f34 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@

- Library scheme -

diff --git a/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py b/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py index 5d8f0e2..39c2c4d 100644 --- a/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py +++ b/autotm/algorithms_for_tuning/genetic_algorithm/mutation.py @@ -11,12 +11,43 @@ def mutation_one_param( high_spm: float, low_n: int, high_n: int, - low_back: float, - high_back: float, + low_back: int, + high_back: int, low_decor: float, high_decor: float, elem_mutation_prob: float = 0.1, ): + """ + One-parameter mutation + + Checks the probability of mutation for each of the elements + + Parameters + ---------- + individ: List[float] + Individual to be processed + low_spb: float + The lower possible bound for sparsity regularizer of back topics + high_spb: float + The higher possible bound for sparsity regularizer of back topics + low_spm: float + The lower possible bound for sparsity regularizer of specific topics + high_spm: float + The higher possible bound for sparsity regularizer of specific topics + low_n: int + The lower possible bound for amount of iterations between stages + high_n: int + The higher possible bound for amount of iterations between stages + low_back: int + The lower possible bound for amount of back topics + high_back: int + The higher possible bound for amount of back topics + + + Returns + ---------- + The mutated individual + """ for i in range(len(individ)): if random.random() <= elem_mutation_prob: if i in [2, 3]: diff --git a/autotm/fitness/tm.py b/autotm/fitness/tm.py index 7e09efd..c0dfe85 100644 --- a/autotm/fitness/tm.py +++ b/autotm/fitness/tm.py @@ -59,7 +59,7 @@ class Dataset: _ppmi_dict_df_path: str = "ppmi_df.txt" _ppmi_dict_tf_path: str = "ppmi_tf.txt" _mutual_info_dict_path: str = "mutual_info_dict.pkl" - _texts_path: str = "ppp.csv" + _texts_path: str = "prep_df.csv" _labels_path = "labels.pkl" def __init__(self, base_path: str, topic_count: int): diff --git 
a/autotm/preprocessing/dictionaries_preparation.py b/autotm/preprocessing/dictionaries_preparation.py index 1a6d7c7..8ca3c39 100644 --- a/autotm/preprocessing/dictionaries_preparation.py +++ b/autotm/preprocessing/dictionaries_preparation.py @@ -322,7 +322,7 @@ def mutual_info_dict_preparation(fname): def prepare_all_artifacts(save_path: str): - DATASET_PATH = os.path.join(save_path, "ppp.csv") + DATASET_PATH = os.path.join(save_path, "prep_df.csv") BATCHES_DIR = os.path.join(save_path, "batches") WV_PATH = os.path.join(save_path, "test_set_data_voc.txt") COOC_DICTIONARY_PATH = os.path.join(save_path, "cooc_dictionary.txt") @@ -333,7 +333,7 @@ def prepare_all_artifacts(save_path: str): ppmi_dict_df = os.path.join(save_path, "ppmi_df.txt") ppmi_dict_tf = os.path.join(save_path, "ppmi_tf.txt") MUTUAL_INFO_DICT_PATH = os.path.join(save_path, "mutual_info_dict.pkl") - DOCUMENTS_TO_BATCH_PATH = os.path.join(save_path, "ppp.csv") + DOCUMENTS_TO_BATCH_PATH = os.path.join(save_path, "prep_df.csv") # TODO: check why batch vectorizer is returned (unused further) prepare_batch_vectorizer( diff --git a/autotm/preprocessing/text_preprocessing.py b/autotm/preprocessing/text_preprocessing.py index dc05cd3..8883133 100644 --- a/autotm/preprocessing/text_preprocessing.py +++ b/autotm/preprocessing/text_preprocessing.py @@ -164,7 +164,7 @@ def process_dataset( :return: """ os.makedirs(save_path, exist_ok=True) - save_path = os.path.join(save_path, "ppp.csv") + save_path = os.path.join(save_path, "prep_df.csv") data = pd.read_csv(fname) if isinstance(fname, str) else cast(pd.DataFrame, fname) data = parallelize_dataframe( data, lemmatize_text, n_cores, lang=lang, col_to_process=col_to_process