sapientml · tashiro-akira · Oct 24, 2023 · Oct 24, 2023 · Oct 27, 2023 · Oct 31, 2023
@@ -17,6 +17,7 @@
 import pandas as pd
 from sapientml.params import CancellationToken
 from sapientml.util.logging import setup_logger
+from sapientml_core.preprocess.default.generator import check_cols_has_symbols, remove_symbols, rename_cols
 
 from .AutoEDA import EDA
 from .AutoVisualization import AutoVisualization_Class
@@ -81,12 +82,40 @@ def process(
     if visualization:
         # Call AutoVisualization to generate visualization codes
         AV = AutoVisualization_Class()
-        visualization_code = AV.AutoVisualization(
-            df=dataframe,
-            target_columns=target_columns,
-            problem_type=problem_type,
-            ignore_columns=ignore_columns,
-        )
+        cols_has_symbols = check_cols_has_symbols(dataframe.columns.to_list())
+        no_symbol_columns = [col for col in dataframe.columns.values if col not in cols_has_symbols]
+        if cols_has_symbols:
+            rename_dict = {}
+            org_df_column = dataframe.columns.to_list()
+            df_columns = list(
+                dataframe.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col).columns
+            )
+            rename_dict = rename_cols(org_df_column, no_symbol_columns, df_columns)
+            if len(rename_dict) != 0:
+                col_has_target = []
+                for org_column, target in zip(list(rename_dict.keys()), list(rename_dict.values())):
+                    if target in target_columns:
+                        col_has_target.append(org_column)
+                visualization_code = AV.AutoVisualization(
+                    df=dataframe,
+                    target_columns=col_has_target,
+                    problem_type=problem_type,
+                    ignore_columns=ignore_columns,
+                )
+            else:
+                visualization_code = AV.AutoVisualization(
+                    df=dataframe,
+                    target_columns=col_has_target,
+                    problem_type=problem_type,
+                    ignore_columns=ignore_columns,
+                )
+        else:
+            visualization_code = AV.AutoVisualization(
+                df=dataframe,
+                target_columns=target_columns,
+                problem_type=problem_type,
+                ignore_columns=ignore_columns,
+            )
     else:
         visualization_code = None
 

@@ -222,8 +222,43 @@ def generate_pipeline(self, dataset: Dataset, task: Task):
         for pipeline in sapientml_results:
             pipeline.validation = code_block.validation + pipeline.validation
             pipeline.test = code_block.test + pipeline.test
-            pipeline.train = code_block.train + pipeline.train
             pipeline.predict = code_block.predict + pipeline.predict
+            if "cols_has_symbols" in pipeline.test:
+                pipeline.test = pipeline.test.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.test = pipeline.test.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                pipeline.predict = pipeline.predict.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.predict = pipeline.predict.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                pipeline.validation = pipeline.validation.replace(
+                    '"feature": feature_train.columns',
+                    '"feature": feature_train.rename(columns=rename_symbol_cols).columns',
+                )
+                pipeline.validation = pipeline.validation.replace(
+                    "prediction.to_csv", "prediction.rename(columns=rename_symbol_cols).to_csv"
+                )
+
+                def replace_targets(match_obj):
+                    """Return the matched text with TARGET_COLUMNS swapped for a lookup through rename_symbol_cols."""
+                    matched_text = match_obj.group(0)
+                    wrapped_targets = "[rename_symbol_cols.get(v, v) for v in TARGET_COLUMNS]"
+                    return matched_text.replace("TARGET_COLUMNS", wrapped_targets)
+
+                pat = r"prediction = pd.DataFrame\(y_prob, columns=.?TARGET_COLUMNS.*, index=feature_test.index\)"
+                pipeline.test = re.sub(pat, replace_targets, pipeline.test)
+                pipeline.predict = re.sub(pat, replace_targets, pipeline.predict)
+                pipeline.validation = re.sub(pat, replace_targets, pipeline.validation)
+
+            pipeline.train = code_block.train + pipeline.train
             result_pipelines.append(pipeline)
 
         logger.info("Executing generated pipelines...")

@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import collections
 import os
+import random
 import re
 from pathlib import Path
 from typing import Tuple
@@ -33,7 +35,7 @@
 logger = setup_logger()
 
 INHIBITED_SYMBOL_PATTERN = re.compile(r"[\{\}\[\]\",:<'\\]+")
-
+seedvalue = 4736224
 
 template_env = Environment(loader=FileSystemLoader(f"{os.path.dirname(__file__)}/templates"), trim_blocks=True)
 
@@ -195,6 +197,40 @@ def remove_symbols(column_name: str) -> str:
     return INHIBITED_SYMBOL_PATTERN.sub("", column_name)
 
 
+def rename_cols(org_column_name: list, no_symbol_columns: list, df_columns: list):
+    """Map original column names to unique names after symbol removal.
+
+    Parameters
+    ----------
+    org_column_name : list
+        Original column names (symbols still present), in DataFrame order.
+    no_symbol_columns : list
+        Original names without inhibited symbols. Unused by the collision
+        check, which considers every column; kept so existing callers work.
+    df_columns : list
+        Names after symbol removal, aligned index-by-index with org_column_name.
+
+    Returns
+    -------
+    rename_dict : dict
+        Original name -> unique new name for every column, or an empty dict
+        when symbol removal produced no duplicate names.
+    """
+    # A private generator keeps suffixes reproducible without reseeding the global RNG.
+    rng = random.Random(seedvalue)
+    mapping = dict(zip(org_column_name, df_columns))
+    clashing = {name for name, count in collections.Counter(mapping.values()).items() if count > 1}
+    if not clashing:
+        return {}
+    # Re-draw until unique: two random suffixes may coincide or hit an existing name.
+    while clashing:
+        for org_column, name in list(mapping.items()):
+            if name in clashing:
+                mapping[org_column] = name + str(rng.randint(1000, 9999))
+        clashing = {name for name, count in collections.Counter(mapping.values()).items() if count > 1}
+    return mapping
+
+
 class DefaultPreprocess(CodeBlockGenerator):
     def __init__(self, **kwargs):
         self.config = DefaultPreprocessConfig(**kwargs)
@@ -230,15 +266,31 @@ def generate_code(self, dataset: Dataset, task: Task) -> Tuple[Dataset, Code]:
             logger.warning(
                 f"Symbols that inhibit training and visualization will be removed from column name {str(cols_has_symbols)}."
             )
+            org_df_column = df.columns.values
+            org_target_columns = list(task.target_columns)
+            no_symbol_columns = [col for col in df.columns.values if col not in cols_has_symbols]
             df = df.rename(columns=lambda col: remove_symbols(col) if col in cols_has_symbols else col)
+            df_columns = df.columns.values
             task.target_columns = [
                 remove_symbols(col) if col in cols_has_symbols else col for col in task.target_columns
             ]
+            if df.columns.duplicated().any():
+                rename_dict = rename_cols(org_df_column, no_symbol_columns, df_columns)
+                df = df.set_axis(list(rename_dict.values()), axis=1)
+                task.target_columns = [rename_dict[col] for col in org_target_columns]
             tpl = template_env.get_template("rename_columns.py.jinja")
-            code.validation += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.test += _render(tpl, training=True, test=True, cols_has_symbols=cols_has_symbols)
-            code.train += _render(tpl, training=True, test=False, cols_has_symbols=cols_has_symbols)
-            code.predict += _render(tpl, training=False, test=True, cols_has_symbols=cols_has_symbols)
+            code.validation += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.test += _render(
+                tpl, training=True, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.train += _render(
+                tpl, training=True, test=False, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
+            code.predict += _render(
+                tpl, training=False, test=True, cols_has_symbols=cols_has_symbols, rename_dict=rename_dict
+            )
 
         # If None is intentionally inserted in the data, an error occurs, so we have added an action to change None to "np.nan."
         if df.isin([None]).any(axis=None):

@@ -2,9 +2,15 @@
 import re
 cols_has_symbols = {{ cols_has_symbols }}
 inhibited_symbol_pattern = re.compile(r"[\{\}\[\]\",:<'\\]+")
+# rename_symbol_cols maps original column names to the names used by the pipeline.
+{% if rename_dict %}
+rename_symbol_cols = {{ rename_dict }}
+{% else %}
+rename_symbol_cols = {col: inhibited_symbol_pattern.sub("", col) for col in cols_has_symbols}
+{% endif %}
 {% if training %}
-train_dataset = train_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
+train_dataset = train_dataset.rename(columns=rename_symbol_cols)
 {% endif %}
 {% if test %}
-test_dataset = test_dataset.rename(columns=lambda col: inhibited_symbol_pattern.sub("", col) if col in cols_has_symbols else col)
-{% endif %}
+test_dataset = test_dataset.rename(columns=rename_symbol_cols)
+{% endif %}
+rename_symbol_cols = {v: k for k, v in rename_symbol_cols.items()}
@@ -51,7 +51,7 @@ y_pred = model.predict(feature_test)
 y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1)
 {% endif %}
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df

@@ -16,7 +16,7 @@ with open('target_LabelEncoder.pkl', 'rb') as f:
     label_encoder = pickle.load(f)
 {% endif %}
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoder[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df

@@ -49,7 +49,7 @@ model.fit(feature_train, target_train)
 y_pred = model.predict(feature_test)
 
 {% if is_multioutput_classification %}
-y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS)
+y_pred_df = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
 for column in TARGET_COLUMNS:
     y_pred_df[column] = label_encoders[column].inverse_transform(y_pred_df[column].astype(int))
 y_pred = y_pred_df