From 8c2e60eaccab8700c06c5bc3cad3257c41d91ac8 Mon Sep 17 00:00:00 2001
From: fonhorst
Date: Thu, 6 Jun 2024 01:07:33 +0300
Subject: [PATCH] remove print

---
 autotm/ontology/ontology_extractor.py            |  2 --
 autotm/preprocessing/dictionaries_preparation.py |  1 -
 autotm/preprocessing/text_preprocessing.py       |  1 -
 autotm/visualization/dynamic_tracker.py          |  9 ++++++---
 distributed/autotm_distributed/preprocessing.py  |  6 +++---
 distributed/autotm_distributed/tm.py             | 13 ++++++-------
 docs/conf.py                                     |  1 -
 7 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/autotm/ontology/ontology_extractor.py b/autotm/ontology/ontology_extractor.py
index 580605f..0ff8c73 100644
--- a/autotm/ontology/ontology_extractor.py
+++ b/autotm/ontology/ontology_extractor.py
@@ -25,7 +25,6 @@ def format_attention(attention, layers=None, heads=None):
         layer_attention = layer_attention.squeeze(0)
         if heads:
             layer_attention = layer_attention[heads]
-        # print(layer_attention[0])
         squeezed.append(layer_attention)
     # num_layers x num_heads x seq_len x seq_len
     return torch.stack(squeezed)
@@ -138,7 +137,6 @@ def build_graph(autotm_model, topic_labels,
         tokens = tokenizer.convert_ids_to_tokens(inputs[0])
 
         res, tokens_new = get_attention_vals(attention, tokens, head_num=2, layer_num=0)
-        print(res, tokens_new)
 
         try:
             v, i = torch.topk(res.flatten(), 5)
diff --git a/autotm/preprocessing/dictionaries_preparation.py b/autotm/preprocessing/dictionaries_preparation.py
index 42fc34c..c264141 100644
--- a/autotm/preprocessing/dictionaries_preparation.py
+++ b/autotm/preprocessing/dictionaries_preparation.py
@@ -156,7 +156,6 @@ def write_vw_dict(res_dict, vocab_words, fpath):
             try:
                 fopen.write(f"{word}" + " " + " ".join(res_dict[word]) + "\n")
             except:
-                # print(f'The word {word} is not found')
                 pass
 
     logger.info(f"{fpath} is ready!")
diff --git a/autotm/preprocessing/text_preprocessing.py b/autotm/preprocessing/text_preprocessing.py
index 07dea81..6252098 100644
--- a/autotm/preprocessing/text_preprocessing.py
+++ b/autotm/preprocessing/text_preprocessing.py
@@ -141,7 +141,6 @@ def lemmatize_text_en(text):
 
 
 def lemmatize_text(df, **kwargs):
-    # print(kwargs)
     lang = kwargs["lang"]
    col_to_process = kwargs["col_to_process"]
     if lang == "ru":
diff --git a/autotm/visualization/dynamic_tracker.py b/autotm/visualization/dynamic_tracker.py
index f7cf548..8ae0cce 100644
--- a/autotm/visualization/dynamic_tracker.py
+++ b/autotm/visualization/dynamic_tracker.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import time
 import warnings
@@ -12,6 +13,8 @@
 FITNESS_DIFF_COL = "fitness_diff"
 PARAMS_DIST_COL = "params_dist"
 
+logger = logging.getLogger(__name__)
+
 
 class MetricsCollector:
     def __init__(
@@ -163,7 +166,7 @@ def save_fitness(self, generation: int, params: list, fitness: float):
 
     def get_metric_df(self):
         if self.metric_df is not None:
-            print("Metric df already exists")
+            logger.info("Metric df already exists")
         else:
             population_max = []
             for i in range(self.num_generations + 1):
@@ -177,7 +180,7 @@
                 columns=[GENERATION_COL, FITNESS_COL],
             )
         if self.mutation_df is not None:
-            print("Mutation df already exists")
+            logger.info("Mutation df already exists")
         else:
             dfs = []
             for gen in self.mutation_changes:
@@ -196,7 +199,7 @@
                 # warnings.warn("No mutations changes have been found to save", RuntimeWarning)
                 self.mutation_df = pd.DataFrame([])
         if self.crossover_df is not None:
-            print("Crossover df already exists")
+            logger.info("Crossover df already exists")
         else:
             dfs = []
             for gen in self.crossover_changes:
diff --git a/distributed/autotm_distributed/preprocessing.py b/distributed/autotm_distributed/preprocessing.py
index 72bb9b8..71a1c01 100644
--- a/distributed/autotm_distributed/preprocessing.py
+++ b/distributed/autotm_distributed/preprocessing.py
@@ -153,7 +153,7 @@ def prepare_voc(batches_dir, vw_path, data_path, column_name='processed_text'):
         try:
             for file in os.listdir(data_path):
                 if file.startswith('part'):
-                    print('part_{}'.format(num_parts), end='\r')
+                    logger.info('Preparing vocabulary: part_{}'.format(num_parts))
                     if file.split('.')[-1] == 'csv':
                         part = pd.read_csv(os.path.join(data_path, file))
                     else:
@@ -165,14 +165,14 @@ def prepare_voc(batches_dir, vw_path, data_path, column_name='processed_text'):
                     num_parts += 1
 
         except NotADirectoryError:
-            print('part 1/1')
+            logger.info('Preparing vocabulary: part 1/1')
             part = pd.read_csv(data_path)
             part_processed = part[column_name].tolist()
             for text in part_processed:
                 result = return_string_part('@default_class', text)
                 ofile.write(result + '\n')
 
-    print(' batches {} \n vocabulary {} \n are ready'.format(batches_dir, vw_path))
+    logger.info('Preparing vocabulary: batches {} and vocabulary {} are ready'.format(batches_dir, vw_path))
 
 
 def prepare_batch_vectorizer(batches_dir, vw_path, data_path, column_name='processed_text'):
diff --git a/distributed/autotm_distributed/tm.py b/distributed/autotm_distributed/tm.py
index 829ed4b..50d08d2 100644
--- a/distributed/autotm_distributed/tm.py
+++ b/distributed/autotm_distributed/tm.py
@@ -431,7 +431,7 @@ def train(self, option='offline'):
 
         if self.n1 > 0:
             if self._early_stopping():
-                print('Early stopping is triggered')
+                logger.info('Early stopping is triggered')
                 return
 
         # if ((self.n2 != 0) and (self.B != 0)):
@@ -470,7 +470,7 @@ def train(self, option='offline'):
 
         if self.n1 + self.n2 + self.n3 > 0:
             if self._early_stopping():
-                print('Early stopping is triggered')
+                logger.info('Early stopping is triggered')
                 return
 
         if self.n4 != 0:
@@ -524,7 +524,6 @@ def get_topics(self):
     def _get_avg_coherence_score(self, for_individ_fitness=False):
         coherences_main, coherences_back = self.__return_all_tokens_coherence(self.model, s=self.S, b=self.B)
         if for_individ_fitness:
-            # print('COMPONENTS: ', np.mean(list(coherences_main.values())), np.min(list(coherences_main.values())))
             return np.mean(list(coherences_main.values())) + np.min(list(coherences_main.values()))
         return np.mean(list(coherences_main.values()))
 
@@ -680,7 +679,7 @@ def metrics_get_avg_coherence_score(self, for_individ_fitness=False) -> MetricsS
         # coeff = self._calculate_labels_coeff()
         coeff = 1.0
         if for_individ_fitness:
-            print('COMPONENTS: ', np.mean(list(coherences_main.values())), np.min(list(coherences_main.values())))
+            logger.info('COMPONENTS: %s %s', np.mean(list(coherences_main.values())), np.min(list(coherences_main.values())))
             avg_coherence_score = \
                 np.mean(list(coherences_main.values())) + np.min(list(coherences_main.values())) * coeff
         else:
@@ -709,9 +708,9 @@ def metrics_get_last_avg_vals(self, texts, total_tokens,
             topic_significance_uni = np.mean(ts_uniform(topic_word_dist))
             topic_significance_vacuous = np.mean(ts_vacuous(doc_topic_dist, topic_word_dist, total_tokens))
             topic_significance_back = np.mean(ts_bground(doc_topic_dist))
-            print(f'Topic Significance - Uniform Distribution Over Words: {topic_significance_uni}')
-            print(f'Topic Significance - Vacuous Semantic Distribution: {topic_significance_vacuous}')
-            print(f'Topic Significance - Background Distribution: {topic_significance_back}')
+            logger.info(f'Topic Significance - Uniform Distribution Over Words: {topic_significance_uni}')
+            logger.info(f'Topic Significance - Vacuous Semantic Distribution: {topic_significance_vacuous}')
+            logger.info(f'Topic Significance - Background Distribution: {topic_significance_back}')
         else:
             topic_significance_uni = None
             topic_significance_vacuous = None
diff --git a/docs/conf.py b/docs/conf.py
index 07538cc..9591e41 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -11,7 +11,6 @@
 
 CURR_PATH = os.path.abspath(os.path.dirname(__file__))
 LIB_PATH = os.path.join(CURR_PATH, os.path.pardir)
-print(LIB_PATH)
 sys.path.insert(0, LIB_PATH)
 
 project = "AutoTM"