diff --git a/docs/pages/index.md b/docs/pages/index.md index 39b1115..ccc584d 100644 --- a/docs/pages/index.md +++ b/docs/pages/index.md @@ -105,7 +105,7 @@ This token will periodically expire - you can re-run the above command again to ## Limitations / Known Issues -- Google Sheets has a limit of 1,000,000 cells per spreadsheet. +- Google Sheets has a limit of 10,000,000 cells per spreadsheet. - Reading sheets where data does not start in A1 is not yet supported. - Writing data to a sheet starting from a cell other than A1 is not yet supported. - Sheets must already exist to COPY TO them. diff --git a/src/gsheets_read.cpp b/src/gsheets_read.cpp index 2fdd029..0fb4d95 100644 --- a/src/gsheets_read.cpp +++ b/src/gsheets_read.cpp @@ -38,7 +38,7 @@ void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataC if (bind_data.finished) { return; } - + json cleanJson = parseJson(bind_data.response); SheetData sheet_data = getSheetData(cleanJson); @@ -48,26 +48,12 @@ void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataC // Adjust starting index based on whether we're using the header idx_t start_index = bind_data.header ? 
bind_data.row_index + 1 : bind_data.row_index; - // Determine column types - vector column_types(column_count, LogicalType::VARCHAR); - if (start_index < sheet_data.values.size()) { - const auto& first_data_row = sheet_data.values[start_index]; - for (idx_t col = 0; col < column_count && col < first_data_row.size(); col++) { - const string& value = first_data_row[col]; - if (value == "true" || value == "false") { - column_types[col] = LogicalType::BOOLEAN; - } else if (IsValidNumber(value)) { - column_types[col] = LogicalType::DOUBLE; - } - } - } - for (idx_t i = start_index; i < sheet_data.values.size() && row_count < STANDARD_VECTOR_SIZE; i++) { const auto& row = sheet_data.values[i]; for (idx_t col = 0; col < column_count; col++) { if (col < row.size()) { const string& value = row[col]; - switch (column_types[col].id()) { + switch (bind_data.return_types[col].id()) { case LogicalTypeId::BOOLEAN: if (value.empty()) { output.SetValue(col, row_count, Value(LogicalType::BOOLEAN)); @@ -83,7 +69,12 @@ void ReadSheetFunction(ClientContext &context, TableFunctionInput &data_p, DataC } break; default: - output.SetValue(col, row_count, Value(value)); + // Empty strings should be converted to NULL + if (value.empty()) { + output.SetValue(col, row_count, Value(LogicalType::VARCHAR)); + } else { + output.SetValue(col, row_count, Value(value)); + } break; } } else { @@ -160,25 +151,52 @@ unique_ptr ReadSheetBind(ClientContext &context, TableFunctionBind json cleanJson = parseJson(bind_data->response); SheetData sheet_data = getSheetData(cleanJson); - if (!sheet_data.values.empty()) { - idx_t start_index = header ? 1 : 0; - if (start_index < sheet_data.values.size()) { - const auto& first_data_row = sheet_data.values[start_index]; - for (size_t i = 0; i < first_data_row.size(); i++) { - string column_name = header ? 
sheet_data.values[0][i] : "column" + std::to_string(i + 1); - names.push_back(column_name); - - const string& value = first_data_row[i]; - if (value == "true" || value == "false") { - return_types.push_back(LogicalType::BOOLEAN); - } else if (IsValidNumber(value)) { - return_types.push_back(LogicalType::DOUBLE); - } else { - return_types.push_back(LogicalType::VARCHAR); - } - } + // Preferring early return style to reduce nesting + if (sheet_data.values.empty()) { + return bind_data; + } + idx_t start_index = header ? 1 : 0; + if (start_index >= sheet_data.values.size()) { + return bind_data; + } + + const auto& first_data_row = sheet_data.values[start_index]; + // If we have a header, we want the width of the result to be the max of: + // the width of the header row + // or the width of the first row of data + int result_width = first_data_row.size(); + if (header) { + int header_width = sheet_data.values[0].size(); + if (header_width > result_width) { + result_width = header_width; } } + + for (size_t i = 0; i < result_width; i++) { + // Assign default column_name, but rename to header value if using a header and header cell exists + string column_name = "column" + std::to_string(i + 1); + if (header && (i < sheet_data.values[0].size())) { + column_name = sheet_data.values[0][i]; + } + names.push_back(column_name); + + // If the first row has blanks, assume varchar for now + if (i >= first_data_row.size()) { + return_types.push_back(LogicalType::VARCHAR); + continue; + } + const string& value = first_data_row[i]; + if (value == "TRUE" || value == "FALSE") { + return_types.push_back(LogicalType::BOOLEAN); + } else if (IsValidNumber(value)) { + return_types.push_back(LogicalType::DOUBLE); + } else { + return_types.push_back(LogicalType::VARCHAR); + } + } + + bind_data->names = names; + bind_data->return_types = return_types; return bind_data; } diff --git a/src/include/gsheets_read.hpp b/src/include/gsheets_read.hpp index 39c47f6..2ba225f 100644 --- 
a/src/include/gsheets_read.hpp +++ b/src/include/gsheets_read.hpp @@ -15,6 +15,8 @@ struct ReadSheetBindData : public TableFunctionData { string response; bool header; string sheet_name; + vector return_types; + vector names; ReadSheetBindData(string spreadsheet_id, string token, bool header, string sheet_name); }; diff --git a/test/sql/read_gsheet.test b/test/sql/read_gsheet.test index 1284960..fbebfe0 100644 --- a/test/sql/read_gsheet.test +++ b/test/sql/read_gsheet.test @@ -78,6 +78,28 @@ FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8 3.0 value3 blabla3 NULL value4 blabla4 +# Issue 47: Blanks in the first row should not prevent all columns from returning +query IIII +FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1746330494#gid=1746330494'); +---- +woot blah NULL NULL +more wooting more blah NULL should get this! + +# Issue 47: Read despite missing cells (and test booleans and doubles) +query IIIIIII +FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1961167280#gid=1961167280'); +---- +woot blah NULL NULL true 123.0 should get this! +more wooting more blah should handle blank to the right NULL NULL NULL NULL +more wooting more blah NULL NULL false 456.789 should get this! + +# Issue 47: Read despite missing column headers +query IIII +FROM read_gsheet('https://docs.google.com/spreadsheets/d/11QdEasMWbETbFVxry-SsD8jVcdYIT1zBQszcF84MdE8/edit?gid=1108445818#gid=1108445818'); +---- +woot blah NULL should get this! +more wooting more blah NULL should get this! + # Drop the secret statement ok drop secret test_secret; \ No newline at end of file