Skip to content

Commit

Permalink
Fixed bug for parsing double-quoted boolean labels in CSVs downloaded from CDN
Browse files Browse the repository at this point in the history
  • Loading branch information
NigelHambly committed Sep 2, 2022
1 parent c6ec82a commit 647908b
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion gaiadmpsetup/gaiadmpstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,11 @@ def cast_to_array(data_frame : DataFrame, column_name : str, data_type : DataTyp
temporary_column_name = column_name + '_array_data'

# reformat the string csv data as an array of the specified type
data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(2), f.length(f.col(column_name)) - 2), ',').cast(data_type))
if isinstance(data_type.elementType, BooleanType):
# boolean labels are double-quoted in the CSV data, so use adjusted substring bounds and split on '","' rather than ','
data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(3), f.length(f.col(column_name)) - 3), '","').cast(data_type))
else:
data_frame = data_frame.withColumn(temporary_column_name, f.split(f.col(column_name).substr(f.lit(2), f.length(f.col(column_name)) - 2), ',').cast(data_type))

# drop the original string column to save space
data_frame = data_frame.drop(column_name)
Expand Down

0 comments on commit 647908b

Please sign in to comment.