diff --git a/sapientml_core/explain/main.py b/sapientml_core/explain/main.py
index cc6030a..b3f8ec0 100644
--- a/sapientml_core/explain/main.py
+++ b/sapientml_core/explain/main.py
@@ -17,6 +17,7 @@
 import pandas as pd
 from sapientml.params import CancellationToken
 from sapientml.util.logging import setup_logger
+from sapientml_core.preprocess.default.generator import check_cols_has_symbols, remove_symbols, rename_cols
 
 from .AutoEDA import EDA
 from .AutoVisualization import AutoVisualization_Class
@@ -81,12 +82,25 @@ def process(
     if visualization:
         # Call AutoVisualization to generate visualization codes
         AV = AutoVisualization_Class()
-        visualization_code = AV.AutoVisualization(
-            df=dataframe,
-            target_columns=target_columns,
-            problem_type=problem_type,
-            ignore_columns=ignore_columns,
-        )
+        # Fall back to the caller-supplied targets so every path below has a defined value.
+        visualization_targets = target_columns
+        org_df_column = dataframe.columns.to_list()
+        cols_has_symbols = check_cols_has_symbols(org_df_column)
+        if cols_has_symbols:
+            no_symbol_columns = [col for col in org_df_column if col not in cols_has_symbols]
+            df_columns = [remove_symbols(col) if col in cols_has_symbols else col for col in org_df_column]
+            rename_dict = rename_cols(org_df_column, no_symbol_columns, df_columns)
+            if rename_dict:
+                # Translate the de-duplicated target names back to the names used by ``dataframe``.
+                visualization_targets = [
+                    org_column for org_column, renamed in rename_dict.items() if renamed in target_columns
+                ]
+        visualization_code = AV.AutoVisualization(
+            df=dataframe,
+            target_columns=visualization_targets,
+            problem_type=problem_type,
+            ignore_columns=ignore_columns,
+        )
     else:
         visualization_code = None
 
diff --git a/sapientml_core/generator.py b/sapientml_core/generator.py
index 00e3b0f..74d79f3 100644
--- a/sapientml_core/generator.py
+++ b/sapientml_core/generator.py
@@ -222,8 +222,43 @@ def generate_pipeline(self, dataset: Dataset, task: Task):
         for pipeline in sapientml_results:
             pipeline.validation = code_block.validation + pipeline.validation
             pipeline.test = code_block.test + pipeline.test
-            pipeline.train = code_block.train + pipeline.train
             pipeline.predict = code_block.predict + pipeline.predict
+            if "cols_has_symbols" in pipeline.test:
+                pipeline.test = pipeline.test.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.test = pipeline.test.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                pipeline.predict = pipeline.predict.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.predict = pipeline.predict.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                pipeline.validation = pipeline.validation.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.validation = pipeline.validation.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                def replace_targets(match_obj):
+                    return match_obj[0].replace(
+                        "TARGET_COLUMNS", "[rename_symbol_cols.get(v, v) for v in TARGET_COLUMNS]"
+                    )
+
+                pat = r"prediction = pd.DataFrame\(y_prob, columns=.?TARGET_COLUMNS.*, index=feature_test.index\)"
+                pipeline.test = re.sub(pat, replace_targets, pipeline.test)
+                pipeline.predict = re.sub(pat, replace_targets, pipeline.predict)
+                pipeline.validation = re.sub(pat, replace_targets, pipeline.validation)
+
+            pipeline.train = code_block.train + pipeline.train
             result_pipelines.append(pipeline)
 
         logger.info("Executing generated pipelines...")
diff --git a/sapientml_core/preprocess/default/generator.py b/sapientml_core/preprocess/default/generator.py
index c5187cc..b1d5646 100644
--- a/sapientml_core/preprocess/default/generator.py
+++ b/sapientml_core/preprocess/default/generator.py
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import collections
 import os
+import random
 import re
 from pathlib import Path
 from typing import Tuple
@@ -33,7 +35,7 @@
 logger = setup_logger()
 
 INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\]+")
-
+seedvalue = 4736224
 
 template_env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True)
 
@@ -195,6 +197,53 @@ def remove_symbols(column_name: str) -> str:
     return INHIBITED_SYMBOL_PATTERN.sub("", column_name)
 
 
+def rename_cols(org_column_name: list, no_symbol_columns: list, df_columns: list):
+    """Build a mapping that makes column names unique after symbol removal.
+
+    Parameters
+    ----------
+    org_column_name : list
+        Original column names, in dataframe order.
+    no_symbol_columns : list
+        Column names that originally have no special characters.
+        No longer consulted; kept so existing callers keep working.
+    df_columns : list
+        Column names after symbol removal, aligned with ``org_column_name``.
+
+    Returns
+    -------
+    rename_dict : dict
+        Mapping from each original column name to its de-duplicated name.
+        Empty when no two different original columns share a cleaned name.
+
+    """
+
+    def find_collisions(names):
+        # Only names shared by *different* original columns can be separated by a
+        # dict keyed on the original name, so only those count as collisions.
+        owners = collections.defaultdict(set)
+        for org_column, name in zip(org_column_name, names):
+            owners[name].add(org_column)
+        return {name for name, orgs in owners.items() if len(orgs) > 1}
+
+    # A private generator keeps the suffixes reproducible across calls without
+    # reseeding the process-wide ``random`` state.
+    rng = random.Random(seedvalue)
+    rename_dict = {}
+    same_column = find_collisions(df_columns)
+    while same_column:
+        for target, org_column in zip(df_columns, org_column_name):
+            if target in same_column:
+                rename_dict[org_column] = target + str(rng.randint(1000, 9999))
+            else:
+                rename_dict[org_column] = target
+
+        df_columns = [rename_dict[col] for col in org_column_name]
+        same_column = find_collisions(df_columns)
+
+    return rename_dict
+
+
 class DefaultPreprocess(CodeBlockGenerator):
     def __init__(self, **kwargs):
         self.config = DefaultPreprocessConfig(**kwargs)
@@ -230,15 +279,34 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             logger.warning(
                 f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}."
             )
+            org_df_column = df.columns.values
+            org_target_columns = list(task.target_columns)
+            no_symbol_columns = [col for col in df.columns.values if col not in cols_has_symbols]
             df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col)
+            df_columns = df.columns.values
             task.target_columns = [
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
+            # Stays empty when cleaning creates no name collisions; the template then
+            # derives the mapping from cols_has_symbols on its own.
+            rename_dict = {}
+            if df.columns.duplicated().any():
+                rename_dict = rename_cols(org_df_column, no_symbol_columns, df_columns)
+                df = df.set_axis(list(rename_dict.values()), axis=1)
+                task.target_columns = [rename_dict[col] for col in org_target_columns]
             tpl = template_env.get_template("rename_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols)
-            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols)
+            code.validation += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.test += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.train += _render(
+                tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.predict += _render(
+                tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
 
         # If None is intentionally inserted in the data, an error occurs, so we have added an action to change None to "np.nan."
         if df.isin([None]).any(axis=None):
diff --git a/sapientml_core/preprocess/default/templates/rename_columns.py.jinja b/sapientml_core/preprocess/default/templates/rename_columns.py.jinja
index 7e21706..846e637 100644
--- a/sapientml_core/preprocess/default/templates/rename_columns.py.jinja
+++ b/sapientml_core/preprocess/default/templates/rename_columns.py.jinja
@@ -2,9 +2,15 @@ import re
 
 cols_has_symbols = {{ cols_has_symbols }}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
+{% if rename_dict %}
+rename_symbol_cols = {{ rename_dict }}
+{% else %}
+rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) for col in cols_has_symbols}
+{% endif %}
 {% if training %}
-train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
+train_dataset = train_dataset.rename(columns=rename_symbol_cols)
 {% endif %}
 {% if test %}
-test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
-{% endif %}
\ No newline at end of file
+test_dataset = test_dataset.rename(columns=rename_symbol_cols)
+{% endif %}
+rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()}
\ No newline at end of file
diff --git a/sapientml_core/templates/model_templates/model.py.jinja b/sapientml_core/templates/model_templates/model.py.jinja
index 747b6ee..f00ef49 100644
--- a/sapientml_core/templates/model_templates/model.py.jinja
+++ b/sapientml_core/templates/model_templates/model.py.jinja
@@ -51,7 +51,7 @@ y_pred = model.predict(feature_test)
 y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1)
 {% endif %}
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df
diff --git a/sapientml_core/templates/model_templates/model_predict.py.jinja b/sapientml_core/templates/model_templates/model_predict.py.jinja
index 003f1a5..47f4f0d 100644
--- a/sapientml_core/templates/model_templates/model_predict.py.jinja
+++ b/sapientml_core/templates/model_templates/model_predict.py.jinja
@@ -16,7 +16,7 @@ with open('target_LabelEncoder.pkl', 'rb') as f:
     label_encoder = pickle.load(f)
 {% endif %}
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoder[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df
diff --git a/sapientml_core/templates/model_templates/model_test.py.jinja b/sapientml_core/templates/model_templates/model_test.py.jinja
index 7a47d18..259d7fe 100644
--- a/sapientml_core/templates/model_templates/model_test.py.jinja
+++ b/sapientml_core/templates/model_templates/model_test.py.jinja
@@ -49,7 +49,7 @@ model.fit(feature_train, target_train)
 y_pred = model.predict(feature_test)
 
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df