Skip to content

Commit

Permalink
Fix data transform issue, spark log_loss metric compute error and json dumps TypeError (Sync Fabric till 3c545e67) (#1371)
Browse files Browse the repository at this point in the history

* Merged PR 1444697: Fix json dumps TypeError

Fix json dumps TypeError

----
Bug fix to address a `TypeError` in `json.dumps`.

This pull request fixes a `TypeError` encountered when using `json.dumps` on `automl._automl_user_configurations` by introducing a safe JSON serialization function.
- Added `safe_json_dumps` function in `flaml/fabric/mlflow.py` to handle non-serializable objects.
- Updated `MLflowIntegration` class in `flaml/fabric/mlflow.py` to use `safe_json_dumps` for JSON serialization.
- Modified `test/automl/test_multiclass.py` to test the new `safe_json_dumps` function.

Related work items: #3439408

* Fix data transform issue and spark log_loss metric compute error
  • Loading branch information
thinkall authored Oct 29, 2024
1 parent c01c391 commit 69da685
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 6 deletions.
2 changes: 1 addition & 1 deletion flaml/automl/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def fit_transform(self, X: Union[DataFrame, np.ndarray], y, task: Union[str, "Ta
y = y.rename(TS_VALUE_COL)
for column in X.columns:
# sklearn\utils\validation.py needs int/float values
if X[column].dtype.name in ("object", "category"):
if X[column].dtype.name in ("object", "category", "string"):
if X[column].nunique() == 1 or X[column].nunique(dropna=True) == n - X[column].isnull().sum():
X.drop(columns=column, inplace=True)
drop = True
Expand Down
18 changes: 17 additions & 1 deletion flaml/automl/spark/metrics.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
from typing import Union

import numpy as np
Expand All @@ -9,7 +10,7 @@
RegressionEvaluator,
)

from flaml.automl.spark import F, psSeries
from flaml.automl.spark import F, T, psDataFrame, psSeries, sparkDataFrame


def ps_group_counts(groups: Union[psSeries, np.ndarray]) -> np.ndarray:
Expand All @@ -36,6 +37,16 @@ def _compute_label_from_probability(df, probability_col, prediction_col):
return df


def string_to_array(s):
    """Parse a JSON-encoded array string into a Python list.

    Used to convert a Spark probability column that has been cast to its
    string form back into a list of floats before computing log_loss.

    Args:
        s: A JSON-encoded array, e.g. ``"[0.1, 0.9]"``.

    Returns:
        The decoded list, or ``[]`` when ``s`` is not valid JSON or is not
        a string at all (e.g. a NULL value passed through the Spark UDF,
        which would otherwise raise an uncaught ``TypeError``).
    """
    try:
        return json.loads(s)
    except (json.JSONDecodeError, TypeError):
        return []


# Spark UDF wrapping ``string_to_array`` so it can be applied column-wise;
# the return type is declared as array<double> so Spark's array functions
# (e.g. array_max) can operate on the result.
string_to_array_udf = F.udf(string_to_array, T.ArrayType(T.DoubleType()))


def spark_metric_loss_score(
metric_name: str,
y_predict: psSeries,
Expand Down Expand Up @@ -135,6 +146,11 @@ def spark_metric_loss_score(
)
elif metric_name == "log_loss":
# For log_loss, prediction_col should be probability, and we need to convert it to label
# handle data like "{'type': '1', 'values': '[1, 2, 3]'}"
# Fix cannot resolve "array_max(prediction)" due to data type mismatch: Parameter 1 requires the "ARRAY" type,
# however "prediction" has the type "STRUCT<type: TINYINT, size: INT, indices: ARRAY<INT>, values: ARRAY<DOUBLE>>"
df = df.withColumn(prediction_col, df[prediction_col].cast(T.StringType()))
df = df.withColumn(prediction_col, string_to_array_udf(df[prediction_col]))
df = _compute_label_from_probability(df, prediction_col, prediction_col + "_label")
evaluator = MulticlassClassificationEvaluator(
metricName="logLoss",
Expand Down
2 changes: 1 addition & 1 deletion flaml/automl/time_series/ts_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ def fit(self, X: Union[DataFrame, np.array], y):

for column in X.columns:
# sklearn/utils/validation.py needs int/float values
if X[column].dtype.name in ("object", "category"):
if X[column].dtype.name in ("object", "category", "string"):
if (
# drop columns where all values are the same
X[column].nunique() == 1
Expand Down
9 changes: 8 additions & 1 deletion flaml/fabric/mlflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ def _get_notebook_name():
return None


def safe_json_dumps(obj):
    """Serialize *obj* to a JSON string without raising ``TypeError``.

    Any value the standard encoder cannot handle is rendered via ``str``
    instead, so arbitrary user-configuration objects always serialize.

    Args:
        obj: The object to serialize.

    Returns:
        The JSON string representation of ``obj``.
    """
    return json.dumps(obj, default=str)


class MLflowIntegration:
def __init__(self, experiment_type="automl", mlflow_exp_name=None, extra_tag=None):
try:
Expand Down Expand Up @@ -438,7 +445,7 @@ def record_state(self, automl, search_state, estimator):
"flaml.meric": automl_metric_name,
"flaml.run_source": "flaml-automl",
"flaml.log_type": self.log_type,
"flaml.automl_user_configurations": json.dumps(automl._automl_user_configurations),
"flaml.automl_user_configurations": safe_json_dumps(automl._automl_user_configurations),
},
"params": {
"sample_size": search_state.sample_size,
Expand Down
4 changes: 2 additions & 2 deletions test/automl/test_multiclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,6 @@ def test_dataframe(self):
def test_custom_metric(self):
df, y = load_iris(return_X_y=True, as_frame=True)
df["label"] = y
automl = AutoML()
settings = {
"dataframe": df,
"label": "label",
Expand All @@ -204,7 +203,8 @@ def test_custom_metric(self):
"pred_time_limit": 1e-5,
"ensemble": True,
}
automl.fit(**settings)
automl = AutoML(**settings) # test safe_json_dumps
automl.fit(dataframe=df, label="label")
print(automl.classes_)
print(automl.model)
print(automl.config_history)
Expand Down

0 comments on commit 69da685

Please sign in to comment.