From 647908b3d575cc996ff7781170f5cf022dfdd3b2 Mon Sep 17 00:00:00 2001 From: NigelHambly Date: Fri, 2 Sep 2022 16:56:37 +0100 Subject: [PATCH] Fixed bug for parsing double-quoted boolean labels in CSVs downloaded from CDN --- gaiadmpsetup/gaiadmpstore.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gaiadmpsetup/gaiadmpstore.py b/gaiadmpsetup/gaiadmpstore.py index 563b8dd..a9d48a8 100644 --- a/gaiadmpsetup/gaiadmpstore.py +++ b/gaiadmpsetup/gaiadmpstore.py @@ -152,7 +152,11 @@ def cast_to_array(data_frame : DataFrame, column_name : str, data_type : DataTyp temporary_column_name = column_name + '_array_data' # reformat the string csv data as an array of the specified type - data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(2), f.length(f.col(column_name)) - 2), ',').cast(data_type)) + if isinstance(data_type.elementType, BooleanType): + # ... need to allow for the double quoted boolean labels + data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(3), f.length(f.col(column_name)) - 3), '","').cast(data_type)) + else: + data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(2), f.length(f.col(column_name)) - 2), ',').cast(data_type)) # drop the original string column to save space data_frame = data_frame.drop(column_name)