From 80c89c80471a3b500512ad27e7897109ca0296ac Mon Sep 17 00:00:00 2001 From: Ally Franken Date: Wed, 15 Jan 2025 13:55:33 -0500 Subject: [PATCH] remove unnecessary helper functions --- datacompy/spark/sql.py | 57 +++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) diff --git a/datacompy/spark/sql.py b/datacompy/spark/sql.py index 4f3a507..6db0674 100644 --- a/datacompy/spark/sql.py +++ b/datacompy/spark/sql.py @@ -1270,30 +1270,6 @@ def handle_numeric_strings(df, field_list): return df -def format_numeric_fields(df): - fixed_cols = [] - numeric_types = [ - "tinyint", - "smallint", - "int", - "bigint", - "float", - "double", - "decimal"] - - for c in df.dtypes: - # do not change non-numeric fields - if c[1] not in numeric_types: - fixed_cols.append(col(c[0])) - # round & truncate numeric fields - else: - new_val = format_number(col(c[0]), 5).alias(c[0]) - fixed_cols.append(new_val) - - formatted_df = df.select(*fixed_cols) - return formatted_df - - def sort_rows(prod_df, release_df): prod_cols = prod_df.columns release_cols = release_df.columns @@ -1324,18 +1300,25 @@ def sort_columns(prod_df, release_df): return prod_df, release_df -def convert_exponential_strings(base_df, compare_df): - # convert scientific number (1.23E4) to a decimal value - def sci_no_to_decimal(value): - return when(col(value).rlike(r"^[-+]?[0-9]*\.?[0-9]+[eE][0-9]+"), - col(value).cast(T.DecimalType(30, 10))).otherwise(col(value)) - - df_return_list = [] +def format_numeric_fields(df): + fixed_cols = [] + numeric_types = [ + "tinyint", + "smallint", + "int", + "bigint", + "float", + "double", + "decimal"] - for df in [base_df, compare_df]: - for column in df.columns: - if column in df.columns and df.schema[column].dataType == T.StringType(): - df = df.withColumn(column, sci_no_to_decimal(column)) - df_return_list.append(df) + for c in df.dtypes: + # do not change non-numeric fields + if c[1] not in numeric_types: + fixed_cols.append(col(c[0])) + # round & truncate numeric fields + else: + new_val = format_number(col(c[0]), 5).alias(c[0]) + fixed_cols.append(new_val) - return df_return_list[0], df_return_list[1] \ No newline at end of file + formatted_df = df.select(*fixed_cols) + return formatted_df \ No newline at end of file