From 3f2c5afed4be6bd8a30306645c69fdd6429b2622 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Mon, 9 Dec 2024 13:45:42 +0100 Subject: [PATCH 01/40] Initial trial for new read_csv syntax --- include/pgduckdb/pgduckdb_metadata_cache.hpp | 3 + include/pgduckdb/pgduckdb_ruleutils.h | 2 + sql/pg_duckdb--0.2.0--0.3.0.sql | 766 +++++++++++++++++++ src/pgduckdb_hooks.cpp | 1 + src/pgduckdb_metadata_cache.cpp | 29 + src/pgduckdb_options.cpp | 109 +++ src/pgduckdb_ruleutils.cpp | 23 + src/vendor/pg_ruleutils_17.c | 62 +- 8 files changed, 982 insertions(+), 13 deletions(-) diff --git a/include/pgduckdb/pgduckdb_metadata_cache.hpp b/include/pgduckdb/pgduckdb_metadata_cache.hpp index c8f9c564..4ff778bd 100644 --- a/include/pgduckdb/pgduckdb_metadata_cache.hpp +++ b/include/pgduckdb/pgduckdb_metadata_cache.hpp @@ -7,6 +7,9 @@ bool IsExtensionRegistered(); bool IsDuckdbOnlyFunction(Oid function_oid); uint64 CacheVersion(); Oid ExtensionOid(); +Oid SchemaOid(); +Oid DuckdbRowOid(); +Oid DuckdbUnresolvedTypeOid(); Oid DuckdbTableAmOid(); bool IsMotherDuckEnabled(); bool IsMotherDuckEnabledAnywhere(); diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index 5032670b..57f55e62 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -7,3 +7,5 @@ char *pgduckdb_get_tabledef(Oid relation_id); bool pgduckdb_is_not_default_expr(Node *node, void *context); List *pgduckdb_db_and_schema(const char *postgres_schema_name, bool is_duckdb_table); const char *pgduckdb_db_and_schema_string(const char *postgres_schema_name, bool is_duckdb_table); +bool pgduckdb_var_is_duckdb_row(Var *var); +bool pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc); diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index a23bb628..a40ef052 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql @@ -17,3 +17,769 @@ CREATE AGGREGATE 
@extschema@.approx_count_distinct(anyelement) CREATE DOMAIN pg_catalog.blob AS bytea; COMMENT ON DOMAIN pg_catalog.blob IS 'The DuckDB BLOB alias for BYTEA'; + +CREATE TYPE duckdb.row; +CREATE TYPE duckdb.unresolved_type; + +-- TODO: Should we remove IMMUTABLE STRICT? +CREATE FUNCTION duckdb.row_in(cstring) RETURNS duckdb.row AS 'MODULE_PATHNAME', 'duckdb_row_in' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb.row_out(duckdb.row) RETURNS cstring AS 'MODULE_PATHNAME', 'duckdb_row_out' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb.row_subscript(internal) RETURNS internal AS 'MODULE_PATHNAME', 'duckdb_row_subscript' LANGUAGE C IMMUTABLE STRICT; +CREATE TYPE duckdb.row ( + INTERNALLENGTH = VARIABLE, + INPUT = duckdb.row_in, + OUTPUT = duckdb.row_out, + SUBSCRIPT = duckdb.row_subscript +); + +CREATE FUNCTION duckdb.unresolved_type_in(cstring) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_in' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb.unresolved_type_out(duckdb.unresolved_type) RETURNS cstring AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_out' LANGUAGE C IMMUTABLE STRICT; +CREATE TYPE duckdb.unresolved_type ( + INTERNALLENGTH = VARIABLE, + INPUT = duckdb.unresolved_type_in, + OUTPUT = duckdb.unresolved_type_out +); + +-- Dummy functions for binary operators with unresolved type on the lefthand +CREATE FUNCTION duckdb_unresolved_type_operator(duckdb.unresolved_type, "any") RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb_unresolved_type_operator_bool(duckdb.unresolved_type, "any") RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +-- Dummy functions for binary operators with unresolved type on the righthand +CREATE FUNCTION duckdb_unresolved_type_operator("any", duckdb.unresolved_type) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' 
LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb_unresolved_type_operator_bool("any", duckdb.unresolved_type) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +-- Dummy function for prefix/unary operators +CREATE FUNCTION duckdb_unresolved_type_operator(duckdb.unresolved_type) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +-- prefix operators + and - +CREATE OPERATOR + ( + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR - ( + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + +-- Basic comparison operators +CREATE OPERATOR <= ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR <= ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR < ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR < ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR <> ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR <> ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR = ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR = ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR > ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR > ( + LEFTARG = "any", + RIGHTARG = 
duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR >= ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR >= ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +-- binary math operators +CREATE OPERATOR + ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR + ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR - ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR - ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR * ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR * ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR / ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator +); + +CREATE OPERATOR / ( + LEFTARG = "any", + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + +-- Self comparison operators and necessary dummy function, needed for GROUP BY and ORDER BY +CREATE FUNCTION duckdb_unresolved_type_operator_bool(duckdb.unresolved_type, duckdb.unresolved_type) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +CREATE OPERATOR < ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR <= ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + 
+CREATE OPERATOR = ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR > ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +CREATE OPERATOR >= ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + +-- TODO: use other dummy function with better error +CREATE FUNCTION duckdb_unresolved_type_btree_cmp(duckdb.unresolved_type, duckdb.unresolved_type) RETURNS int AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +-- Create a B-tree operator class for duckdb.unresolved_type, so it can be used in ORDER BY +CREATE OPERATOR CLASS duckdb_unresolved_type_ops +DEFAULT FOR TYPE duckdb.unresolved_type USING btree AS + OPERATOR 1 < (duckdb.unresolved_type, duckdb.unresolved_type), + OPERATOR 2 <= (duckdb.unresolved_type, duckdb.unresolved_type), + OPERATOR 3 = (duckdb.unresolved_type, duckdb.unresolved_type), + OPERATOR 4 >= (duckdb.unresolved_type, duckdb.unresolved_type), + OPERATOR 5 > (duckdb.unresolved_type, duckdb.unresolved_type), + FUNCTION 1 duckdb_unresolved_type_btree_cmp(duckdb.unresolved_type, duckdb.unresolved_type); + +CREATE FUNCTION duckdb_unresolved_type_hash(duckdb.unresolved_type) RETURNS int AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +-- Create a hash operator class for duckdb.unresolved_type, so it can be used in GROUP BY +CREATE OPERATOR CLASS duckdb_unresolved_type_hash_ops +DEFAULT FOR TYPE duckdb.unresolved_type USING hash AS + OPERATOR 1 = (duckdb.unresolved_type, duckdb.unresolved_type), + FUNCTION 1 duckdb_unresolved_type_hash(duckdb.unresolved_type); + +-- TODO: create dedicated dummy C functions for these +-- +-- State transition function +CREATE FUNCTION duckdb_unresolved_type_state_trans(state duckdb.unresolved_type, 
value duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb_unresolved_type_state_trans(state duckdb.unresolved_type, value duckdb.unresolved_type, other "any") +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb_unresolved_type_state_trans(state duckdb.unresolved_type, value duckdb.unresolved_type, other "any", another "any") +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +-- Final function +CREATE FUNCTION duckdb_unresolved_type_final(state duckdb.unresolved_type) +RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + +-- Aggregate functions + +-- NOTE: any_value is already defined in core in PG16+, so we don't create it. +-- People using older Postgres versions can manually implement the aggregate if +-- they really require it.
+ +CREATE AGGREGATE arbitrary(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE arg_max(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE arg_max(duckdb.unresolved_type, "any", "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE arg_max_null(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE arg_min(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE arg_min(duckdb.unresolved_type, "any", "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE arg_min_null(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE array_agg(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE avg(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE bit_and(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE bit_or(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = 
duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE bit_xor(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE bitstring_agg(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE bool_and(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE bool_or(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +-- NOTE: count(*) and count(duckdb.unresolved_type) are already defined in the core + +CREATE AGGREGATE favg(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE first(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE fsum(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE geomean(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE histogram(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE histogram(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE 
histogram_exact(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE last(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE list(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE max(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE max_by(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE max_by(duckdb.unresolved_type, "any", "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE min(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE min_by(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE min_by(duckdb.unresolved_type, "any", "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE product(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + +CREATE AGGREGATE string_agg(duckdb.unresolved_type, "any") ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC 
= duckdb_unresolved_type_final +); + +CREATE AGGREGATE sum(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + + +-- "AS ASSIGNMENT" cast to boolean for unresolved types, so that they can be +-- used as the final expression in a WHERE clause +CREATE CAST (duckdb.unresolved_type AS boolean) + WITH INOUT + AS ASSIGNMENT; + +-- Regular casts for all our supported types +-- BOOLEAN (skipping plain boolean because it's right above) +CREATE CAST (duckdb.unresolved_type AS boolean[]) + WITH INOUT; + +-- TINYINT (CHAR) +CREATE CAST (duckdb.unresolved_type AS char) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS char[]) + WITH INOUT; + +-- SMALLINT (INT2) +CREATE CAST (duckdb.unresolved_type AS smallint) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS smallint[]) + WITH INOUT; + +-- INTEGER (INT4) +CREATE CAST (duckdb.unresolved_type AS integer) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS integer[]) + WITH INOUT; + +-- BIGINT (INT8) +CREATE CAST (duckdb.unresolved_type AS bigint) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS bigint[]) + WITH INOUT; + +-- VARCHAR (BPCHAR, TEXT, VARCHAR) +CREATE CAST (duckdb.unresolved_type AS varchar) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS varchar[]) + WITH INOUT; + +-- DATE +CREATE CAST (duckdb.unresolved_type AS date) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS date[]) + WITH INOUT; + +-- TIMESTAMP +CREATE CAST (duckdb.unresolved_type AS timestamp) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS timestamp[]) + WITH INOUT; + +-- TIMESTAMP WITH TIME ZONE +CREATE CAST (duckdb.unresolved_type AS timestamptz) + WITH INOUT; + +-- FLOAT +CREATE CAST (duckdb.unresolved_type AS real) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS real[]) + WITH INOUT; + +-- DOUBLE +CREATE CAST (duckdb.unresolved_type AS double precision) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS double
precision[]) + WITH INOUT; + +-- NUMERIC (DECIMAL) +CREATE CAST (duckdb.unresolved_type AS numeric) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS numeric[]) + WITH INOUT; + +-- UUID +CREATE CAST (duckdb.unresolved_type AS uuid) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS uuid[]) + WITH INOUT; + +-- JSON +CREATE CAST (duckdb.unresolved_type AS json) + WITH INOUT; +CREATE CAST (duckdb.unresolved_type AS json[]) + WITH INOUT; + +-- read_parquet function for single path +DROP FUNCTION @extschema@.read_parquet(path text, binary_as_string BOOLEAN, + filename BOOLEAN, + file_row_number BOOLEAN, + hive_partitioning BOOLEAN, + union_by_name BOOLEAN); +CREATE FUNCTION @extschema@.read_parquet(path text, binary_as_string BOOLEAN DEFAULT FALSE, + filename BOOLEAN DEFAULT FALSE, + file_row_number BOOLEAN DEFAULT FALSE, + hive_partitioning BOOLEAN DEFAULT FALSE, + union_by_name BOOLEAN DEFAULT FALSE) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + +-- read_parquet function for array of paths +DROP FUNCTION @extschema@.read_parquet(path text[], binary_as_string BOOLEAN, + filename BOOLEAN, + file_row_number BOOLEAN, + hive_partitioning BOOLEAN, + union_by_name BOOLEAN); +CREATE FUNCTION @extschema@.read_parquet(path text[], binary_as_string BOOLEAN DEFAULT FALSE, + filename BOOLEAN DEFAULT FALSE, + file_row_number BOOLEAN DEFAULT FALSE, + hive_partitioning BOOLEAN DEFAULT FALSE, + union_by_name BOOLEAN DEFAULT FALSE) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + +-- read_csv function for single path +DROP FUNCTION @extschema@.read_csv(path text, all_varchar BOOLEAN, + allow_quoted_nulls BOOLEAN, + auto_detect BOOLEAN, + auto_type_candidates TEXT[], + compression VARCHAR, + dateformat VARCHAR, + decimal_separator VARCHAR, + delim VARCHAR, + escape VARCHAR, + filename BOOLEAN, + force_not_null TEXT[], + 
header BOOLEAN, + hive_partitioning BOOLEAN, + ignore_errors BOOLEAN, + max_line_size BIGINT, + names TEXT[], + new_line VARCHAR, + normalize_names BOOLEAN, + null_padding BOOLEAN, + nullstr TEXT[], + parallel BOOLEAN, + quote VARCHAR, + sample_size BIGINT, + sep VARCHAR, + skip BIGINT, + timestampformat VARCHAR, + types TEXT[], + union_by_name BOOLEAN); +CREATE FUNCTION @extschema@.read_csv(path text, all_varchar BOOLEAN DEFAULT FALSE, + allow_quoted_nulls BOOLEAN DEFAULT TRUE, + auto_detect BOOLEAN DEFAULT TRUE, + auto_type_candidates TEXT[] DEFAULT ARRAY[]::TEXT[], + compression VARCHAR DEFAULT 'auto', + dateformat VARCHAR DEFAULT '', + decimal_separator VARCHAR DEFAULT '.', + delim VARCHAR DEFAULT ',', + escape VARCHAR DEFAULT '"', + filename BOOLEAN DEFAULT FALSE, + force_not_null TEXT[] DEFAULT ARRAY[]::TEXT[], + header BOOLEAN DEFAULT FALSE, + hive_partitioning BOOLEAN DEFAULT FALSE, + ignore_errors BOOLEAN DEFAULT FALSE, + max_line_size BIGINT DEFAULT 2097152, + names TEXT[] DEFAULT ARRAY[]::TEXT[], + new_line VARCHAR DEFAULT '', + normalize_names BOOLEAN DEFAULT FALSE, + null_padding BOOLEAN DEFAULT FALSE, + nullstr TEXT[] DEFAULT ARRAY[]::TEXT[], + parallel BOOLEAN DEFAULT FALSE, + quote VARCHAR DEFAULT '"', + sample_size BIGINT DEFAULT 20480, + sep VARCHAR DEFAULT ',', + skip BIGINT DEFAULT 0, + timestampformat VARCHAR DEFAULT '', + types TEXT[] DEFAULT ARRAY[]::TEXT[], + union_by_name BOOLEAN DEFAULT FALSE) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + +-- read_csv function for array of paths +DROP FUNCTION @extschema@.read_csv(path text[], all_varchar BOOLEAN, + allow_quoted_nulls BOOLEAN, + auto_detect BOOLEAN, + auto_type_candidates TEXT[], + compression VARCHAR, + dateformat VARCHAR, + decimal_separator VARCHAR, + delim VARCHAR, + escape VARCHAR, + filename BOOLEAN, + force_not_null TEXT[], + header BOOLEAN, + hive_partitioning BOOLEAN, + ignore_errors BOOLEAN, + 
max_line_size BIGINT, + names TEXT[], + new_line VARCHAR, + normalize_names BOOLEAN, + null_padding BOOLEAN, + nullstr TEXT[], + parallel BOOLEAN, + quote VARCHAR, + sample_size BIGINT, + sep VARCHAR, + skip BIGINT, + timestampformat VARCHAR, + types TEXT[], + union_by_name BOOLEAN); +CREATE FUNCTION @extschema@.read_csv(path text[], all_varchar BOOLEAN DEFAULT FALSE, + allow_quoted_nulls BOOLEAN DEFAULT TRUE, + auto_detect BOOLEAN DEFAULT TRUE, + auto_type_candidates TEXT[] DEFAULT ARRAY[]::TEXT[], + compression VARCHAR DEFAULT 'auto', + dateformat VARCHAR DEFAULT '', + decimal_separator VARCHAR DEFAULT '.', + delim VARCHAR DEFAULT ',', + escape VARCHAR DEFAULT '"', + filename BOOLEAN DEFAULT FALSE, + force_not_null TEXT[] DEFAULT ARRAY[]::TEXT[], + header BOOLEAN DEFAULT FALSE, + hive_partitioning BOOLEAN DEFAULT FALSE, + ignore_errors BOOLEAN DEFAULT FALSE, + max_line_size BIGINT DEFAULT 2097152, + names TEXT[] DEFAULT ARRAY[]::TEXT[], + new_line VARCHAR DEFAULT '', + normalize_names BOOLEAN DEFAULT FALSE, + null_padding BOOLEAN DEFAULT FALSE, + nullstr TEXT[] DEFAULT ARRAY[]::TEXT[], + parallel BOOLEAN DEFAULT FALSE, + quote VARCHAR DEFAULT '"', + sample_size BIGINT DEFAULT 20480, + sep VARCHAR DEFAULT ',', + skip BIGINT DEFAULT 0, + timestampformat VARCHAR DEFAULT '', + types TEXT[] DEFAULT ARRAY[]::TEXT[], + union_by_name BOOLEAN DEFAULT FALSE) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + +-- iceberg_scan function +DROP FUNCTION @extschema@.iceberg_scan(path text, allow_moved_paths BOOLEAN, + mode TEXT, + metadata_compression_codec TEXT, + skip_schema_inference BOOLEAN, + version TEXT, + version_name_format TEXT); +CREATE FUNCTION @extschema@.iceberg_scan(path text, allow_moved_paths BOOLEAN DEFAULT FALSE, + mode TEXT DEFAULT '', + metadata_compression_codec TEXT DEFAULT 'none', + skip_schema_inference BOOLEAN DEFAULT FALSE, + version TEXT DEFAULT 'version-hint.text', + 
version_name_format TEXT DEFAULT 'v%s%s.metadata.json,%s%s.metadata.json') +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + +-- delta_scan function +DROP FUNCTION @extschema@.delta_scan(path text); +CREATE FUNCTION @extschema@.delta_scan(path text) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + +-- read_json function for single path +DROP FUNCTION @extschema@.read_json(path text, auto_detect BOOLEAN, + compression VARCHAR, + dateformat VARCHAR, + format VARCHAR, + ignore_errors BOOLEAN, + maximum_depth BIGINT, + maximum_object_size INT, + records VARCHAR, + sample_size BIGINT, + timestampformat VARCHAR, + union_by_name BOOLEAN); +CREATE FUNCTION @extschema@.read_json(path text, auto_detect BOOLEAN DEFAULT FALSE, + compression VARCHAR DEFAULT 'auto', + dateformat VARCHAR DEFAULT 'iso', + format VARCHAR DEFAULT 'array', + ignore_errors BOOLEAN DEFAULT FALSE, + maximum_depth BIGINT DEFAULT -1, + maximum_object_size INT DEFAULT 16777216, + records VARCHAR DEFAULT 'records', + sample_size BIGINT DEFAULT 20480, + timestampformat VARCHAR DEFAULT 'iso', + union_by_name BOOLEAN DEFAULT FALSE) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + +-- read_json function for array of paths +DROP FUNCTION @extschema@.read_json(path text[], auto_detect BOOLEAN, + compression VARCHAR, + dateformat VARCHAR, + format VARCHAR, + ignore_errors BOOLEAN, + maximum_depth BIGINT, + maximum_object_size INT, + records VARCHAR, + sample_size BIGINT, + timestampformat VARCHAR, + union_by_name BOOLEAN); +CREATE FUNCTION @extschema@.read_json(path text[], auto_detect BOOLEAN DEFAULT FALSE, + compression VARCHAR DEFAULT 'auto', + dateformat VARCHAR DEFAULT 'iso', + format VARCHAR DEFAULT 'array', + ignore_errors BOOLEAN DEFAULT FALSE, + maximum_depth BIGINT DEFAULT -1, + 
maximum_object_size INT DEFAULT 16777216, + records VARCHAR DEFAULT 'records', + sample_size BIGINT DEFAULT 20480, + timestampformat VARCHAR DEFAULT 'iso', + union_by_name BOOLEAN DEFAULT FALSE) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; diff --git a/src/pgduckdb_hooks.cpp b/src/pgduckdb_hooks.cpp index bc3d921f..dc0dbf47 100644 --- a/src/pgduckdb_hooks.cpp +++ b/src/pgduckdb_hooks.cpp @@ -189,6 +189,7 @@ DuckdbPlannerHook_Cpp(Query *parse, const char *query_string, int cursor_options if (pgduckdb::IsExtensionRegistered()) { if (NeedsDuckdbExecution(parse)) { IsAllowedStatement(parse, true); + pprint(parse); return DuckdbPlanNode(parse, query_string, cursor_options, bound_params, true); } else if (duckdb_force_execution && IsAllowedStatement(parse) && ContainsFromClause(parse)) { diff --git a/src/pgduckdb_metadata_cache.cpp b/src/pgduckdb_metadata_cache.cpp index 4d812930..3fac74af 100644 --- a/src/pgduckdb_metadata_cache.cpp +++ b/src/pgduckdb_metadata_cache.cpp @@ -49,6 +49,12 @@ struct { bool installed; /* The Postgres OID of the pg_duckdb extension. 
*/ Oid extension_oid; + /* The OID of the duckdb schema */ + Oid schema_oid; + /* The OID of the duckdb.row type */ + Oid row_oid; + /* The OID of the duckdb.unresolved_type */ + Oid unresolved_type_oid; /* The OID of the duckdb Table Access Method */ Oid table_am_oid; /* The OID of the duckdb.motherduck_postgres_database */ @@ -174,6 +180,11 @@ IsExtensionRegistered() { BuildDuckdbOnlyFunctions(); cache.table_am_oid = GetSysCacheOid1(AMNAME, Anum_pg_am_oid, CStringGetDatum("duckdb")); + cache.schema_oid = get_namespace_oid("duckdb", false); + cache.row_oid = GetSysCacheOid2(TYPENAMENSP, Anum_pg_type_oid, CStringGetDatum("row"), cache.schema_oid); + cache.unresolved_type_oid = + GetSysCacheOid2(TYPENAMENSP, Anum_pg_type_oid, CStringGetDatum("unresolved_type"), cache.schema_oid); + cache.motherduck_postgres_database_oid = get_database_oid(duckdb_motherduck_postgres_database, false); if (duckdb_postgres_role[0] != '\0') { @@ -220,6 +231,24 @@ ExtensionOid() { return cache.extension_oid; } +Oid +SchemaOid() { + Assert(cache.valid); + return cache.schema_oid; +} + +Oid +DuckdbRowOid() { + Assert(cache.valid); + return cache.row_oid; +} + +Oid +DuckdbUnresolvedTypeOid() { + Assert(cache.valid); + return cache.unresolved_type_oid; +} + Oid DuckdbTableAmOid() { Assert(cache.valid); diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index bc4b81a3..0cb90d18 100644 --- a/src/pgduckdb_options.cpp +++ b/src/pgduckdb_options.cpp @@ -17,16 +17,24 @@ extern "C" { #include "access/xact.h" #include "executor/spi.h" #include "catalog/indexing.h" +#include "parser/parse_coerce.h" +#include "parser/parse_node.h" +#include "parser/parse_expr.h" +#include "nodes/subscripting.h" +#include "nodes/nodeFuncs.h" #include "catalog/namespace.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/snapmgr.h" + +#include "nodes/print.h" } #include "pgduckdb/pgduckdb_options.hpp" #include "pgduckdb/pgduckdb_duckdb.hpp" #include 
"pgduckdb/pgduckdb_xact.hpp" +#include "pgduckdb/pgduckdb_metadata_cache.hpp" #include "pgduckdb/utility/cpp_wrapper.hpp" namespace pgduckdb { @@ -365,4 +373,105 @@ DECLARE_PG_FUNCTION(pgduckdb_recycle_ddb) { PG_RETURN_BOOL(true); } +void +DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct ParseState *pstate, bool isSlice, + bool isAssignment) { + if (isAssignment) { + elog(ERROR, "Assignment to duckdb.row is not supported"); + } + + if (isSlice) { + elog(ERROR, "Creating a slice out of duckdb.row is not supported"); + } + + if (indirection == NIL) { + elog(ERROR, "Subscripting duckdb.row with an empty subscript is not supported"); + } + + if (list_length(indirection) != 1) { + /* TODO: Fix this, if the column contains an array we should be able to subscript that type */ + elog(ERROR, "Subscripting duckdb.row is only supported with a single subscript"); + } + pprint(indirection); + + // Transform each subscript expression + foreach_node(A_Indices, subscript, indirection) { + Assert(!subscript->is_slice); + Assert(subscript->uidx); + + Node *subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind); + pprint(subscript_expr); + int expr_location = exprLocation(subscript->uidx); + Oid subscript_expr_type = exprType(subscript_expr); + + Node *coerced_expr = coerce_to_target_type(pstate, subscript_expr, subscript_expr_type, TEXTOID, -1, + COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, expr_location); + if (!coerced_expr) { + ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must have text type"), + parser_errposition(pstate, expr_location))); + } + + if (!IsA(subscript_expr, Const)) { + pprint(subscript_expr); + ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must be a constant"), + parser_errposition(pstate, expr_location))); + } + sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, coerced_expr); + } + + /* TODO: Trigger cache population, probably we should 
do this somewhere else */ + pgduckdb::IsExtensionRegistered(); + + // Set the result type of the subscripting operation + sbsref->refrestype = pgduckdb::DuckdbUnresolvedTypeOid(); + sbsref->reftypmod = -1; +} + +void +DuckdbRowSubscriptExecSetup(const SubscriptingRef * /*sbsref*/, SubscriptingRefState * /*sbsrefstate*/, + SubscriptExecSteps * /*exprstate*/) { + elog(ERROR, "Subscripting duckdb.row is not supported in the Postgres Executor"); +} + +static SubscriptRoutines duckdb_subscript_routines = { + .transform = DuckdbRowSubscriptTransform, + .exec_setup = DuckdbRowSubscriptExecSetup, + .fetch_strict = false, + .fetch_leakproof = true, + .store_leakproof = true, +}; + +DECLARE_PG_FUNCTION(duckdb_row_in) { + elog(ERROR, "Creating the duckdb.row type is not supported"); +} + +DECLARE_PG_FUNCTION(duckdb_row_out) { + elog(ERROR, "Converting a duckdb.row to a string is not supported"); +} + +DECLARE_PG_FUNCTION(duckdb_unresolved_type_in) { + elog(ERROR, "Creating the duckdb.unresolved_type type is not supported"); +} + +DECLARE_PG_FUNCTION(duckdb_unresolved_type_out) { + elog(ERROR, "Converting a duckdb.unresolved_type to a string is not supported"); +} + +DECLARE_PG_FUNCTION(duckdb_get_row_field_operator) { + elog(ERROR, "The 'get field' operator for duckdb.row cannot be executed by Postgres"); +} + +DECLARE_PG_FUNCTION(duckdb_row_subscript) { + PG_RETURN_POINTER(&duckdb_subscript_routines); +} + +DECLARE_PG_FUNCTION(duckdb_unresolved_type_operator) { + elog(ERROR, "Unresolved duckdb types cannot be used by the Postgres executor"); +} + +DECLARE_PG_FUNCTION(duckdb_only_function) { + char *function_name = DatumGetCString(DirectFunctionCall1(regprocout, fcinfo->flinfo->fn_oid)); + elog(ERROR, "Function '%s' only works with DuckDB execution", function_name); +} + } // extern "C" diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index 5624b586..4d9e1e5a 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -38,6 +38,29 @@ 
pgduckdb_function_name(Oid function_oid) { return psprintf("system.main.%s", quote_identifier(func_name)); } +bool +pgduckdb_var_is_duckdb_row(Var *var) { + if (!var) { + return false; + } + return var->vartype == pgduckdb::DuckdbRowOid(); +} + +bool +pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc) { + if (!rtfunc) { + return false; + } + + if (!IsA(rtfunc->funcexpr, FuncExpr)) { + return false; + } + + FuncExpr *func_expr = castNode(FuncExpr, rtfunc->funcexpr); + + return func_expr->funcresulttype == pgduckdb::DuckdbRowOid(); +} + /* * Given a postgres schema name, this returns a list of two elements: the first * is the DuckDB database name and the second is the duckdb schema name. These diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index b9b78a83..afcac88a 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -6071,8 +6071,10 @@ get_target_list(List *targetList, deparse_context *context, * directly so that we can tell it to do the right thing, and so that * we can get the attribute name which is the default AS label. 
*/ + Var *var = NULL; if (tle->expr && (IsA(tle->expr, Var))) { + var = (Var *) tle->expr; attname = get_variable((Var *) tle->expr, 0, true, context); } else @@ -6099,7 +6101,7 @@ get_target_list(List *targetList, deparse_context *context, colname = tle->resname; /* Show AS unless the column's name is correct as-is */ - if (colname) /* resname could be NULL */ + if (colname && !pgduckdb_var_is_duckdb_row(var)) { if (attname == NULL || strcmp(attname, colname) != 0) appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); @@ -7504,17 +7506,35 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) if (attnum > colinfo->num_cols) elog(ERROR, "invalid attnum %d for relation \"%s\"", attnum, rte->eref->aliasname); - attname = colinfo->colnames[attnum - 1]; + if (pgduckdb_var_is_duckdb_row(var)) { + if (istoplevel) { + /* If the duckdb row is at the top level target list of a + * select, then we want to generate r.*, to unpack all the + * columns instead of returning a STRUCT from the query. */ + attname = NULL; + } else { + /* In any other case, we want to simply use the alias of the + * TargetEntry, without the column name that postgres gave to + * it. Because even though according to the Postgres parser we + * are referencing a column of type "duckdb.row", that column + * won't actually exist in the DuckDB query. + */ + attname = refname; + refname = NULL; + } + } else { + attname = colinfo->colnames[attnum - 1]; - /* - * If we find a Var referencing a dropped column, it seems better to - * print something (anything) than to fail. In general this should - * not happen, but it used to be possible for some cases involving - * functions returning named composite types, and perhaps there are - * still bugs out there. - */ - if (attname == NULL) - attname = "?dropped?column?"; + /* + * If we find a Var referencing a dropped column, it seems better to + * print something (anything) than to fail. 
In general this should + * not happen, but it used to be possible for some cases involving + * functions returning named composite types, and perhaps there are + * still bugs out there. + */ + if (attname == NULL) + attname = "?dropped?column?"; + } } else { @@ -7532,7 +7552,7 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) else { appendStringInfoChar(buf, '*'); - if (istoplevel) + if (istoplevel && !pgduckdb_var_is_duckdb_row(var)) appendStringInfo(buf, "::%s", format_type_with_typemod(var->vartype, var->vartypmod)); @@ -9015,6 +9035,19 @@ get_rule_expr(Node *node, deparse_context *context, appendStringInfoString(buf, " := "); get_rule_expr(refassgnexpr, context, showimplicit); } + else if (IsA(sbsref->refexpr, Var) && pgduckdb_var_is_duckdb_row((Var*) sbsref->refexpr)) { + Assert(sbsref->refupperindexpr); + Assert(!sbsref->reflowerindexpr); + Assert(list_length(sbsref->refupperindexpr) == 1); + Oid typoutput; + bool typIsVarlena; + Const *constval = castNode(Const, linitial(sbsref->refupperindexpr)); + getTypeOutputInfo(constval->consttype, + &typoutput, &typIsVarlena); + + char *extval = OidOutputFunctionCall(typoutput, constval->constvalue); + appendStringInfo(context->buf, ".%s", quote_identifier(extval)); + } else { /* Just an ordinary container fetch, so print subscripts */ @@ -12121,8 +12154,11 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) /* Print the relation alias, if needed */ get_rte_alias(rte, varno, false, context); + if (pgduckdb_func_returns_duckdb_row(rtfunc1)) { + /* Don't print column aliases for functions that return a duckdb.row */ + } /* Print the column definitions or aliases, if needed */ - if (rtfunc1 && rtfunc1->funccolnames != NIL) + else if (rtfunc1 && rtfunc1->funccolnames != NIL) { /* Reconstruct the columndef list, which is also the aliases */ get_from_clause_coldeflist(rtfunc1, colinfo, context); From bf6596fb5f3020c36a4f87a051802cd9ecdf6477 Mon Sep 17 00:00:00 2001 
From: Jelte Fennema-Nio Date: Wed, 11 Dec 2024 13:57:43 +0100 Subject: [PATCH 02/40] Generate query that works for iceberg scans --- src/vendor/pg_ruleutils_17.c | 39 +++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index afcac88a..b8d1eda2 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -7507,6 +7507,14 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) elog(ERROR, "invalid attnum %d for relation \"%s\"", attnum, rte->eref->aliasname); if (pgduckdb_var_is_duckdb_row(var)) { + if (rte->rtekind == RTE_FUNCTION) { + if (istoplevel) { + appendStringInfoChar(buf, '*'); + return NULL; + } + return NULL; + } + if (istoplevel) { /* If the duckdb row is at the top level target list of a * select, then we want to generate r.*, to unpack all the @@ -9039,6 +9047,32 @@ get_rule_expr(Node *node, deparse_context *context, Assert(sbsref->refupperindexpr); Assert(!sbsref->reflowerindexpr); Assert(list_length(sbsref->refupperindexpr) == 1); + Var *var = (Var *) sbsref->refexpr; + + deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, + var->varlevelsup); + int varno; + AttrNumber varattno; + + /* + * If we have a syntactic referent for the Var, and we're working from a + * parse tree, prefer to use the syntactic referent. Otherwise, fall back + * on the semantic referent. (Forcing use of the semantic referent when + * printing plan trees is a design choice that's perhaps more motivated by + * backwards compatibility than anything else. But it does have the + * advantage of making plans more explicit.) 
+ */ + if (var->varnosyn > 0 && dpns->plan == NULL) + { + varno = var->varnosyn; + varattno = var->varattnosyn; + } + else + { + varno = var->varno; + varattno = var->varattno; + } + RangeTblEntry* rte = rt_fetch(varno, dpns->rtable); Oid typoutput; bool typIsVarlena; Const *constval = castNode(Const, linitial(sbsref->refupperindexpr)); @@ -9046,7 +9080,10 @@ get_rule_expr(Node *node, deparse_context *context, &typoutput, &typIsVarlena); char *extval = OidOutputFunctionCall(typoutput, constval->constvalue); - appendStringInfo(context->buf, ".%s", quote_identifier(extval)); + if (rte->rtekind != RTE_FUNCTION) { + appendStringInfoChar(buf, '.'); + } + appendStringInfoString(context->buf, quote_identifier(extval)); } else { From 65681557b4f870a34d8408d7e8d63e72ee8adec3 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Thu, 12 Dec 2024 13:58:08 +0100 Subject: [PATCH 03/40] Correct column aliases and use star --- include/pgduckdb/pgduckdb_ruleutils.h | 4 + sql/pg_duckdb--0.2.0--0.3.0.sql | 93 ++++++++++++------- src/pgduckdb_options.cpp | 8 +- src/pgduckdb_ruleutils.cpp | 83 ++++++++++++++++- src/vendor/pg_ruleutils_17.c | 80 +++++++++++++++- .../expected/duckdb_only_functions.out | 8 +- test/regression/expected/non_superuser.out | 2 +- test/regression/expected/read_functions.out | 50 +++++----- test/regression/expected/search_path.out | 2 +- test/regression/sql/create_table_as.sql | 2 +- test/regression/sql/duckdb_only_functions.sql | 8 +- test/regression/sql/non_superuser.sql | 2 +- test/regression/sql/read_functions.sql | 50 +++++----- test/regression/sql/search_path.sql | 2 +- 14 files changed, 284 insertions(+), 110 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index 57f55e62..5c057b17 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -7,5 +7,9 @@ char *pgduckdb_get_tabledef(Oid relation_id); bool pgduckdb_is_not_default_expr(Node *node, void *context); 
List *pgduckdb_db_and_schema(const char *postgres_schema_name, bool is_duckdb_table); const char *pgduckdb_db_and_schema_string(const char *postgres_schema_name, bool is_duckdb_table); +bool pgduckdb_is_duckdb_row(Oid type_oid); +bool pgduckdb_is_unresolved_type(Oid type_oid); bool pgduckdb_var_is_duckdb_row(Var *var); bool pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc); +Var *pgduckdb_duckdb_row_subscript_var(Expr *expr); +List *pgduckdb_star_start_vars(List *target_list); diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index a40ef052..ba554590 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql @@ -48,6 +48,10 @@ CREATE FUNCTION duckdb_unresolved_type_operator_bool(duckdb.unresolved_type, "an CREATE FUNCTION duckdb_unresolved_type_operator("any", duckdb.unresolved_type) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; CREATE FUNCTION duckdb_unresolved_type_operator_bool("any", duckdb.unresolved_type) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; +-- Dummy functions for binary operators with unresolved type on both sides +CREATE FUNCTION duckdb_unresolved_type_operator(duckdb.unresolved_type, duckdb.unresolved_type) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb_unresolved_type_operator_bool(duckdb.unresolved_type, duckdb.unresolved_type) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; + -- Dummy function for prefix/unary operators CREATE FUNCTION duckdb_unresolved_type_operator(duckdb.unresolved_type) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; @@ -63,6 +67,12 @@ CREATE OPERATOR - ( ); -- Basic comparison operators +CREATE OPERATOR <= ( + LEFTARG = 
duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + CREATE OPERATOR <= ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -75,6 +85,12 @@ CREATE OPERATOR <= ( FUNCTION = duckdb_unresolved_type_operator_bool ); +CREATE OPERATOR < ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + CREATE OPERATOR < ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -87,6 +103,12 @@ CREATE OPERATOR < ( FUNCTION = duckdb_unresolved_type_operator_bool ); +CREATE OPERATOR <> ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + CREATE OPERATOR <> ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -99,6 +121,12 @@ CREATE OPERATOR <> ( FUNCTION = duckdb_unresolved_type_operator_bool ); +CREATE OPERATOR = ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + CREATE OPERATOR = ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -111,6 +139,12 @@ CREATE OPERATOR = ( FUNCTION = duckdb_unresolved_type_operator_bool ); +CREATE OPERATOR > ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + CREATE OPERATOR > ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -123,6 +157,12 @@ CREATE OPERATOR > ( FUNCTION = duckdb_unresolved_type_operator_bool ); +CREATE OPERATOR >= ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator_bool +); + CREATE OPERATOR >= ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -136,6 +176,12 @@ CREATE OPERATOR >= ( ); -- binary math operators +CREATE OPERATOR + ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + CREATE OPERATOR + ( 
LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -148,6 +194,12 @@ CREATE OPERATOR + ( FUNCTION = duckdb_unresolved_type_operator ); +CREATE OPERATOR - ( + LEFTARG = duckdb.unresolved_type, + RIGHTARG = duckdb.unresolved_type, + FUNCTION = duckdb_unresolved_type_operator +); + CREATE OPERATOR - ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", @@ -162,59 +214,38 @@ CREATE OPERATOR - ( CREATE OPERATOR * ( LEFTARG = duckdb.unresolved_type, - RIGHTARG = "any", - FUNCTION = duckdb_unresolved_type_operator -); - -CREATE OPERATOR * ( - LEFTARG = "any", RIGHTARG = duckdb.unresolved_type, FUNCTION = duckdb_unresolved_type_operator ); -CREATE OPERATOR / ( +CREATE OPERATOR * ( LEFTARG = duckdb.unresolved_type, RIGHTARG = "any", FUNCTION = duckdb_unresolved_type_operator ); -CREATE OPERATOR / ( +CREATE OPERATOR * ( LEFTARG = "any", RIGHTARG = duckdb.unresolved_type, FUNCTION = duckdb_unresolved_type_operator ); --- Self comparison operators and necessary dummy function, needed for GROUP BY and ORDER BY -CREATE FUNCTION duckdb_unresolved_type_operator_bool(duckdb.unresolved_type, duckdb.unresolved_type) RETURNS boolean AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_operator' LANGUAGE C IMMUTABLE STRICT; - -CREATE OPERATOR < ( - LEFTARG = duckdb.unresolved_type, - RIGHTARG = duckdb.unresolved_type, - FUNCTION = duckdb_unresolved_type_operator_bool -); - -CREATE OPERATOR <= ( - LEFTARG = duckdb.unresolved_type, - RIGHTARG = duckdb.unresolved_type, - FUNCTION = duckdb_unresolved_type_operator_bool -); - -CREATE OPERATOR = ( +CREATE OPERATOR / ( LEFTARG = duckdb.unresolved_type, RIGHTARG = duckdb.unresolved_type, - FUNCTION = duckdb_unresolved_type_operator_bool + FUNCTION = duckdb_unresolved_type_operator ); -CREATE OPERATOR > ( +CREATE OPERATOR / ( LEFTARG = duckdb.unresolved_type, - RIGHTARG = duckdb.unresolved_type, - FUNCTION = duckdb_unresolved_type_operator_bool + RIGHTARG = "any", + FUNCTION = duckdb_unresolved_type_operator ); -CREATE OPERATOR >= ( - LEFTARG = 
duckdb.unresolved_type, +CREATE OPERATOR / ( + LEFTARG = "any", RIGHTARG = duckdb.unresolved_type, - FUNCTION = duckdb_unresolved_type_operator_bool + FUNCTION = duckdb_unresolved_type_operator ); -- TODO: use other dummy function with better error diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index 0cb90d18..98576c52 100644 --- a/src/pgduckdb_options.cpp +++ b/src/pgduckdb_options.cpp @@ -450,15 +450,11 @@ DECLARE_PG_FUNCTION(duckdb_row_out) { } DECLARE_PG_FUNCTION(duckdb_unresolved_type_in) { - elog(ERROR, "Creating the duckdb.unresolved_type type is not supported"); + return textin(fcinfo); } DECLARE_PG_FUNCTION(duckdb_unresolved_type_out) { - elog(ERROR, "Converting a duckdb.unresolved_type to a string is not supported"); -} - -DECLARE_PG_FUNCTION(duckdb_get_row_field_operator) { - elog(ERROR, "The 'get field' operator for duckdb.row cannot be executed by Postgres"); + return textout(fcinfo); } DECLARE_PG_FUNCTION(duckdb_row_subscript) { diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index 4d9e1e5a..0ca738ef 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -38,12 +38,22 @@ pgduckdb_function_name(Oid function_oid) { return psprintf("system.main.%s", quote_identifier(func_name)); } +bool +pgduckdb_is_unresolved_type(Oid type_oid) { + return type_oid == pgduckdb::DuckdbUnresolvedTypeOid(); +} + +bool +pgduckdb_is_duckdb_row(Oid type_oid) { + return type_oid == pgduckdb::DuckdbRowOid(); +} + bool pgduckdb_var_is_duckdb_row(Var *var) { if (!var) { return false; } - return var->vartype == pgduckdb::DuckdbRowOid(); + return pgduckdb_is_duckdb_row(var->vartype); } bool @@ -58,7 +68,76 @@ pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc) { FuncExpr *func_expr = castNode(FuncExpr, rtfunc->funcexpr); - return func_expr->funcresulttype == pgduckdb::DuckdbRowOid(); + return pgduckdb_is_duckdb_row(func_expr->funcresulttype); +} + +/* + * Returns NULL if the expression is not a subscript on a duckdb 
row. Returns + * the Var of the duckdb row if it is. + */ +Var * +pgduckdb_duckdb_row_subscript_var(Expr *expr) { + if (!expr) { + return NULL; + } + + if (!IsA(expr, SubscriptingRef)) { + return NULL; + } + + SubscriptingRef *subscript = (SubscriptingRef *)expr; + + if (!IsA(subscript->refexpr, Var)) { + return NULL; + } + + Var *refexpr = (Var *)subscript->refexpr; + + if (!pgduckdb_var_is_duckdb_row(refexpr)) { + return NULL; + } + return refexpr; +} + +/* + * Returns a list of duckdb + */ +List * +pgduckdb_star_start_vars(List *target_list) { + List *star_start_indexes = NIL; + Var *possible_star_start_var; + int possible_star_start_var_index = 0; + + int i = 0; + + foreach_node(TargetEntry, tle, target_list) { + i++; + + if (!IsA(tle->expr, Var)) { + possible_star_start_var = NULL; + possible_star_start_var_index = 0; + continue; + } + + Var *var = (Var *)tle->expr; + + if (var->varattno == 1) { + possible_star_start_var = var; + possible_star_start_var_index = i; + } else if (possible_star_start_var) { + if (var->varno != possible_star_start_var->varno || var->varno == i - possible_star_start_var_index + 1) { + possible_star_start_var = NULL; + possible_star_start_var_index = 0; + } + } + + if (pgduckdb_var_is_duckdb_row(var)) { + star_start_indexes = lappend_int(star_start_indexes, possible_star_start_var_index); + possible_star_start_var = NULL; + possible_star_start_var_index = 0; + } + } + return star_start_indexes; } /* diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index b8d1eda2..efcdf9e7 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -6041,11 +6041,51 @@ get_target_list(List *targetList, deparse_context *context, sep = " "; colno = 0; + List *star_starts = pgduckdb_star_start_vars(targetList); + ListCell *current_star_start = list_head(star_starts); + + int varno_star = 0; + int varattno_star = 0; + bool added_star = false; + + int i = 0; foreach(l, targetList) { TargetEntry *tle = 
(TargetEntry *) lfirst(l); char *colname; char *attname; + i++; + + if (current_star_start && lfirst_int(current_star_start) == i) { + Var *var = castNode(Var, tle->expr); + varno_star = var->varno; + varattno_star = var->varattno; + } + + if (varno_star) { + bool reset = true; + if (tle->expr && IsA(tle->expr, Var)) { + Var *var = castNode(Var, tle->expr); + + if (var->varno == varno_star && var->varattno == varattno_star) { + varattno_star++; + reset = false; + if (added_star || !pgduckdb_var_is_duckdb_row(var)) { + continue; + } + added_star = true; + } + } + + if (reset) { + varno_star = 0; + varattno_star = 0; + current_star_start = lnext(star_starts, current_star_start); + added_star = false; + } + } + + if (tle->resjunk) continue; /* ignore junk entries */ @@ -6100,8 +6140,39 @@ get_target_list(List *targetList, deparse_context *context, else colname = tle->resname; + bool duckdb_row_needs_as = !pgduckdb_var_is_duckdb_row(var); + Var *subscript_var = pgduckdb_duckdb_row_subscript_var(tle->expr); + if (subscript_var) { + deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, + subscript_var->varlevelsup); + int varno; + int varattno; + + /* + * If we have a syntactic referent for the Var, and we're working from a + * parse tree, prefer to use the syntactic referent. Otherwise, fall back + * on the semantic referent. (Forcing use of the semantic referent when + * printing plan trees is a design choice that's perhaps more motivated by + * backwards compatibility than anything else. But it does have the + * advantage of making plans more explicit.) 
+ */ + if (subscript_var->varnosyn > 0 && dpns->plan == NULL) + { + varno = subscript_var->varnosyn; + varattno = subscript_var->varattnosyn; + } + else + { + varno = subscript_var->varno; + varattno = subscript_var->varattno; + } + RangeTblEntry* rte = rt_fetch(varno, dpns->rtable); + char *original_column = strVal(list_nth(rte->eref->colnames, varattno - 1)); + duckdb_row_needs_as = strcmp(original_column, colname) != 0; + } + /* Show AS unless the column's name is correct as-is */ - if (colname && !pgduckdb_var_is_duckdb_row(var)) + if (colname && duckdb_row_needs_as) { if (attname == NULL || strcmp(attname, colname) != 0) appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); @@ -9052,7 +9123,6 @@ get_rule_expr(Node *node, deparse_context *context, deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, var->varlevelsup); int varno; - AttrNumber varattno; /* * If we have a syntactic referent for the Var, and we're working from a @@ -9065,12 +9135,10 @@ get_rule_expr(Node *node, deparse_context *context, if (var->varnosyn > 0 && dpns->plan == NULL) { varno = var->varnosyn; - varattno = var->varattnosyn; } else { varno = var->varno; - varattno = var->varattno; } RangeTblEntry* rte = rt_fetch(varno, dpns->rtable); Oid typoutput; @@ -11152,6 +11220,10 @@ get_const_expr(Const *constval, deparse_context *context, int showtype) char *extval; bool needlabel = false; + if (pgduckdb_is_unresolved_type(constval->consttype)) { + showtype = -1; + } + if (constval->constisnull) { /* diff --git a/test/regression/expected/duckdb_only_functions.out b/test/regression/expected/duckdb_only_functions.out index cb70e1e3..a423f3a8 100644 --- a/test/regression/expected/duckdb_only_functions.out +++ b/test/regression/expected/duckdb_only_functions.out @@ -2,26 +2,26 @@ -- duckdb.force_execution is turned off SET duckdb.force_execution = false; \set pwd `pwd` -select * from read_parquet(:'pwd' || '/data/unsigned_types.parquet') as (usmallint int); 
+select r['usmallint'] from read_parquet(:'pwd' || '/data/unsigned_types.parquet') r; usmallint ----------- 65535 (1 row) -select * from read_parquet(ARRAY[:'pwd' || '/data/unsigned_types.parquet']) as (usmallint int); +select r['usmallint'] from read_parquet(ARRAY[:'pwd' || '/data/unsigned_types.parquet']) r; usmallint ----------- 65535 (1 row) -select * from read_csv(:'pwd' || '/data/web_page.csv') as (column00 int) LIMIT 2; +select r['column00'] from read_csv(:'pwd' || '/data/web_page.csv') r LIMIT 2; column00 ---------- 1 2 (2 rows) -select * from read_csv(ARRAY[:'pwd' || '/data/web_page.csv']) as (column00 int) LIMIT 2; +select r['column00'] from read_csv(ARRAY[:'pwd' || '/data/web_page.csv']) r LIMIT 2; column00 ---------- 1 diff --git a/test/regression/expected/non_superuser.out b/test/regression/expected/non_superuser.out index 37574149..b4aa0809 100644 --- a/test/regression/expected/non_superuser.out +++ b/test/regression/expected/non_superuser.out @@ -33,7 +33,7 @@ SELECT * FROM duckdb.cache('some hacky sql', 'some more hacky sql'); ERROR: permission denied for function cache SET duckdb.force_execution = true; -- read_csv from the local filesystem should be disallowed -SELECT count("sepal.length") FROM read_csv('../../data/iris.csv') AS ("sepal.length" FLOAT); +SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; ERROR: (PGDuckDB/CreatePlan) Prepared query returned an error: 'Permission Error: File system LocalFileSystem has been disabled by configuration -- Should fail because DuckDB execution is not allowed for this user SET ROLE user3; diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 85ebae38..9d9226ce 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -1,13 +1,13 @@ -- PostgreSQL instance has data directory set to tmp_check/data so for all read functions argument -- is relative to that data directory patch -- 
read_parquet -SELECT count("sepal.length") FROM read_parquet('../../data/iris.parquet') AS ("sepal.length" FLOAT); +SELECT count(r['sepal.length']) FROM read_parquet('../../data/iris.parquet') r; count ------- 150 (1 row) -SELECT "sepal.length" FROM read_parquet('../../data/iris.parquet') AS ("sepal.length" FLOAT) ORDER BY "sepal.length" LIMIT 5; +SELECT r['sepal.length'] FROM read_parquet('../../data/iris.parquet') r ORDER BY r['sepal.length'] LIMIT 5; sepal.length -------------- 4.3 @@ -17,9 +17,9 @@ SELECT "sepal.length" FROM read_parquet('../../data/iris.parquet') AS ("sepal.le 4.5 (5 rows) -SELECT "sepal.length", file_row_number, filename - FROM read_parquet('../../data/iris.parquet', file_row_number => true, filename => true) - AS ("sepal.length" FLOAT, file_row_number INT, filename VARCHAR) ORDER BY "sepal.length" LIMIT 5; +SELECT r['sepal.length'], r['file_row_number'], r['filename'] + FROM read_parquet('../../data/iris.parquet', file_row_number => true, filename => true) r + ORDER BY r['sepal.length'] LIMIT 5; sepal.length | file_row_number | filename --------------+-----------------+------------------------- 4.3 | 13 | ../../data/iris.parquet @@ -30,13 +30,13 @@ SELECT "sepal.length", file_row_number, filename (5 rows) -- read_csv -SELECT count("sepal.length") FROM read_csv('../../data/iris.csv') AS ("sepal.length" FLOAT); +SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; count ------- 150 (1 row) -SELECT "sepal.length" FROM read_csv('../../data/iris.csv') AS ("sepal.length" FLOAT) ORDER BY "sepal.length" LIMIT 5; +SELECT r['sepal.length'] FROM read_csv('../../data/iris.csv') r ORDER BY r['sepal.length'] LIMIT 5; sepal.length -------------- 4.3 @@ -46,9 +46,9 @@ SELECT "sepal.length" FROM read_csv('../../data/iris.csv') AS ("sepal.length" FL 4.5 (5 rows) -SELECT "sepal.length", filename - FROM read_csv('../../data/iris.csv', filename => true, header => true) - AS ("sepal.length" FLOAT, filename VARCHAR) ORDER BY "sepal.length" LIMIT 
5; +SELECT r['sepal.length'], r['filename'] + FROM read_csv('../../data/iris.csv', filename => true, header => true) r + ORDER BY r['sepal.length'] LIMIT 5; sepal.length | filename --------------+--------------------- 4.3 | ../../data/iris.csv @@ -58,7 +58,7 @@ SELECT "sepal.length", filename 4.5 | ../../data/iris.csv (5 rows) -SELECT * FROM read_csv('../../non-existing-file.csv') AS ("sepal.length" FLOAT); +SELECT * FROM read_csv('../../non-existing-file.csv'); ERROR: (PGDuckDB/CreatePlan) Prepared query returned an error: 'IO Error: No files found that match the pattern "../../non-existing-file.csv" -- delta_scan SELECT duckdb.install_extension('delta'); @@ -67,13 +67,13 @@ SELECT duckdb.install_extension('delta'); t (1 row) -SELECT count(a) FROM delta_scan('../../data/delta_table') AS (a INT); +SELECT count(r['a']) FROM delta_scan('../../data/delta_table') r; count ------- 100 (1 row) -SELECT * FROM delta_scan('../../data/delta_table') AS (a INT, b VARCHAR) WHERE (a = 1 OR b = 'delta_table_3'); +SELECT * FROM delta_scan('../../data/delta_table') r WHERE (r['a'] = 1 OR r['b'] = 'delta_table_3'); a | b ---+--------------- 1 | delta_table_1 @@ -87,7 +87,7 @@ SELECT duckdb.install_extension('iceberg'); t (1 row) -SELECT COUNT(l_orderkey) FROM iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) AS (l_orderkey BIGINT); +SELECT COUNT(r['l_orderkey']) FROM iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) r; count ------- 51793 @@ -95,18 +95,14 @@ SELECT COUNT(l_orderkey) FROM iceberg_scan('../../data/lineitem_iceberg', allow_ -- TPCH query #6 SELECT - sum(l_extendedprice * l_discount) as revenue + sum(r['l_extendedprice'] * r['l_discount']) as revenue FROM - iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) AS - (l_extendedprice DOUBLE PRECISION, - l_discount DOUBLE PRECISION, - l_shipdate DATE, - l_quantity DOUBLE PRECISION) + iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) r WHERE - 
l_shipdate >= date '1997-01-01' - AND l_shipdate < date '1997-01-01' + interval '1' year - AND l_discount between 0.08 - 0.01 and 0.08 + 0.01 - AND l_quantity < 25 + r['l_shipdate'] >= date '1997-01-01' + AND r['l_shipdate'] < date '1997-01-01' + interval '1' year + AND r['l_discount'] between 0.08 - 0.01 and 0.08 + 0.01 + AND r['l_quantity'] < 25 LIMIT 1; revenue -------------- @@ -128,19 +124,19 @@ SELECT * FROM iceberg_metadata('../../data/lineitem_iceberg', allow_moved_paths (2 rows) -- read_json -SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT); +SELECT COUNT(r['a']) FROM read_json('../../data/table.json') r; count ------- 100 (1 row) -SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT, c FLOAT) WHERE c > 50.4; +SELECT COUNT(r['a']) FROM read_json('../../data/table.json') r WHERE r['c'] > 50.4; count ------- 51 (1 row) -SELECT a, b, c FROM read_json('../../data/table.json') AS (a INT, b VARCHAR, c FLOAT) WHERE c > 50.4 AND c < 51.2; +SELECT r['a'], r['b'], r['c'] FROM read_json('../../data/table.json') r WHERE r['c'] > 50.4 AND r['c'] < 51.2; a | b | c ----+---------+------ 50 | json_50 | 50.5 diff --git a/test/regression/expected/search_path.out b/test/regression/expected/search_path.out index f9754c6c..381f5ef6 100644 --- a/test/regression/expected/search_path.out +++ b/test/regression/expected/search_path.out @@ -64,7 +64,7 @@ SELECT count(*) FROM public.t, other.t; 1000 (1 row) -SELECT count(*) FROM public.read_csv(:'pwd' || '/data/web_page.csv') as (column00 int); +SELECT count(*) FROM public.read_csv(:'pwd' || '/data/web_page.csv'); count ------- 60 diff --git a/test/regression/sql/create_table_as.sql b/test/regression/sql/create_table_as.sql index 9f10e927..e24e7184 100644 --- a/test/regression/sql/create_table_as.sql +++ b/test/regression/sql/create_table_as.sql @@ -1,5 +1,5 @@ \set pwd `pwd` -CREATE TABLE webpages AS SELECT * FROM read_csv(:'pwd' || '/data/web_page.csv') as (column00 int, column01 text, column02 date); 
+CREATE TABLE webpages AS SELECT r['column00'], r['column01'], r['column02'] FROM read_csv(:'pwd' || '/data/web_page.csv') r; select * from webpages order by column00 limit 2; select count(*) from webpages; diff --git a/test/regression/sql/duckdb_only_functions.sql b/test/regression/sql/duckdb_only_functions.sql index e38b6568..50213b9c 100644 --- a/test/regression/sql/duckdb_only_functions.sql +++ b/test/regression/sql/duckdb_only_functions.sql @@ -4,10 +4,10 @@ SET duckdb.force_execution = false; \set pwd `pwd` -select * from read_parquet(:'pwd' || '/data/unsigned_types.parquet') as (usmallint int); -select * from read_parquet(ARRAY[:'pwd' || '/data/unsigned_types.parquet']) as (usmallint int); +select r['usmallint'] from read_parquet(:'pwd' || '/data/unsigned_types.parquet') r; +select r['usmallint'] from read_parquet(ARRAY[:'pwd' || '/data/unsigned_types.parquet']) r; -select * from read_csv(:'pwd' || '/data/web_page.csv') as (column00 int) LIMIT 2; -select * from read_csv(ARRAY[:'pwd' || '/data/web_page.csv']) as (column00 int) LIMIT 2; +select r['column00'] from read_csv(:'pwd' || '/data/web_page.csv') r LIMIT 2; +select r['column00'] from read_csv(ARRAY[:'pwd' || '/data/web_page.csv']) r LIMIT 2; -- TODO: Add a test for scan_iceberg once we have a test table diff --git a/test/regression/sql/non_superuser.sql b/test/regression/sql/non_superuser.sql index 73bad923..63361757 100644 --- a/test/regression/sql/non_superuser.sql +++ b/test/regression/sql/non_superuser.sql @@ -25,7 +25,7 @@ SELECT * FROM duckdb.cache('some hacky sql', 'some more hacky sql'); SET duckdb.force_execution = true; -- read_csv from the local filesystem should be disallowed -SELECT count("sepal.length") FROM read_csv('../../data/iris.csv') AS ("sepal.length" FLOAT); +SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; -- Should fail because DuckDB execution is not allowed for this user SET ROLE user3; SELECT * FROM t; diff --git a/test/regression/sql/read_functions.sql 
b/test/regression/sql/read_functions.sql index b3af1915..a9bd5851 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -3,55 +3,51 @@ -- read_parquet -SELECT count("sepal.length") FROM read_parquet('../../data/iris.parquet') AS ("sepal.length" FLOAT); +SELECT count(r['sepal.length']) FROM read_parquet('../../data/iris.parquet') r; -SELECT "sepal.length" FROM read_parquet('../../data/iris.parquet') AS ("sepal.length" FLOAT) ORDER BY "sepal.length" LIMIT 5; +SELECT r['sepal.length'] FROM read_parquet('../../data/iris.parquet') r ORDER BY r['sepal.length'] LIMIT 5; -SELECT "sepal.length", file_row_number, filename - FROM read_parquet('../../data/iris.parquet', file_row_number => true, filename => true) - AS ("sepal.length" FLOAT, file_row_number INT, filename VARCHAR) ORDER BY "sepal.length" LIMIT 5; +SELECT r['sepal.length'], r['file_row_number'], r['filename'] + FROM read_parquet('../../data/iris.parquet', file_row_number => true, filename => true) r + ORDER BY r['sepal.length'] LIMIT 5; -- read_csv -SELECT count("sepal.length") FROM read_csv('../../data/iris.csv') AS ("sepal.length" FLOAT); +SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; -SELECT "sepal.length" FROM read_csv('../../data/iris.csv') AS ("sepal.length" FLOAT) ORDER BY "sepal.length" LIMIT 5; +SELECT r['sepal.length'] FROM read_csv('../../data/iris.csv') r ORDER BY r['sepal.length'] LIMIT 5; -SELECT "sepal.length", filename - FROM read_csv('../../data/iris.csv', filename => true, header => true) - AS ("sepal.length" FLOAT, filename VARCHAR) ORDER BY "sepal.length" LIMIT 5; +SELECT r['sepal.length'], r['filename'] + FROM read_csv('../../data/iris.csv', filename => true, header => true) r + ORDER BY r['sepal.length'] LIMIT 5; -SELECT * FROM read_csv('../../non-existing-file.csv') AS ("sepal.length" FLOAT); +SELECT * FROM read_csv('../../non-existing-file.csv'); -- delta_scan SELECT duckdb.install_extension('delta'); -SELECT count(a) 
FROM delta_scan('../../data/delta_table') AS (a INT); -SELECT * FROM delta_scan('../../data/delta_table') AS (a INT, b VARCHAR) WHERE (a = 1 OR b = 'delta_table_3'); +SELECT count(r['a']) FROM delta_scan('../../data/delta_table') r; +SELECT * FROM delta_scan('../../data/delta_table') r WHERE (r['a'] = 1 OR r['b'] = 'delta_table_3'); -- iceberg_* SELECT duckdb.install_extension('iceberg'); -SELECT COUNT(l_orderkey) FROM iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) AS (l_orderkey BIGINT); +SELECT COUNT(r['l_orderkey']) FROM iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) r; -- TPCH query #6 SELECT - sum(l_extendedprice * l_discount) as revenue + sum(r['l_extendedprice'] * r['l_discount']) as revenue FROM - iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) AS - (l_extendedprice DOUBLE PRECISION, - l_discount DOUBLE PRECISION, - l_shipdate DATE, - l_quantity DOUBLE PRECISION) + iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) r WHERE - l_shipdate >= date '1997-01-01' - AND l_shipdate < date '1997-01-01' + interval '1' year - AND l_discount between 0.08 - 0.01 and 0.08 + 0.01 - AND l_quantity < 25 + r['l_shipdate'] >= date '1997-01-01' + AND r['l_shipdate'] < date '1997-01-01' + interval '1' year + AND r['l_discount'] between 0.08 - 0.01 and 0.08 + 0.01 + AND r['l_quantity'] < 25 LIMIT 1; SELECT * FROM iceberg_snapshots('../../data/lineitem_iceberg'); @@ -59,6 +55,6 @@ SELECT * FROM iceberg_metadata('../../data/lineitem_iceberg', allow_moved_paths -- read_json -SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT); -SELECT COUNT(a) FROM read_json('../../data/table.json') AS (a INT, c FLOAT) WHERE c > 50.4; -SELECT a, b, c FROM read_json('../../data/table.json') AS (a INT, b VARCHAR, c FLOAT) WHERE c > 50.4 AND c < 51.2; +SELECT COUNT(r['a']) FROM read_json('../../data/table.json') r; +SELECT COUNT(r['a']) FROM read_json('../../data/table.json') r WHERE r['c'] > 50.4; 
+SELECT r['a'], r['b'], r['c'] FROM read_json('../../data/table.json') r WHERE r['c'] > 50.4 AND r['c'] < 51.2; diff --git a/test/regression/sql/search_path.sql b/test/regression/sql/search_path.sql index f8059d54..c458b6b8 100644 --- a/test/regression/sql/search_path.sql +++ b/test/regression/sql/search_path.sql @@ -26,7 +26,7 @@ SET search_path TO ''; SELECT count(*) FROM t, other.t; SELECT count(*) FROM public.t, other.t; -SELECT count(*) FROM public.read_csv(:'pwd' || '/data/web_page.csv') as (column00 int); +SELECT count(*) FROM public.read_csv(:'pwd' || '/data/web_page.csv'); -- Cleanup DROP TABLE other.t; From 01d5ad5ddc39ce9a43589ea5ccc450a166ea030c Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Fri, 13 Dec 2024 16:38:59 +0100 Subject: [PATCH 04/40] Attempt to fix create table as, but very hacky --- src/pgduckdb_ddl.cpp | 59 +++++++++++++++---- test/regression/expected/create_table_as.out | 2 +- .../regression/expected/materialized_view.out | 43 ++++++-------- test/regression/expected/temporary_tables.out | 4 +- test/regression/sql/materialized_view.sql | 2 +- test/regression/sql/temporary_tables.sql | 2 +- 6 files changed, 71 insertions(+), 41 deletions(-) diff --git a/src/pgduckdb_ddl.cpp b/src/pgduckdb_ddl.cpp index ba5e962f..bd491aff 100644 --- a/src/pgduckdb_ddl.cpp +++ b/src/pgduckdb_ddl.cpp @@ -20,6 +20,7 @@ extern "C" { #include "commands/event_trigger.h" #include "executor/spi.h" #include "miscadmin.h" +#include "rewrite/rewriteHandler.h" #include "pgduckdb/vendor/pg_ruleutils.h" #include "pgduckdb/pgduckdb_ruleutils.h" @@ -39,33 +40,70 @@ extern "C" { * tables we force this value to false. 
*/ static bool ctas_skip_data = false; +static List *ctas_original_target_list = NIL; static bool top_level_ddl = true; static ProcessUtility_hook_type prev_process_utility_hook = NULL; static void -DuckdbHandleDDL(Node *parsetree) { +DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo params) { if (!pgduckdb::IsExtensionRegistered()) { /* We're not installed, so don't mess with the query */ return; } + Node *parsetree = pstmt->utilityStmt; + if (IsA(parsetree, CreateTableAsStmt)) { auto stmt = castNode(CreateTableAsStmt, parsetree); char *access_method = stmt->into->accessMethod ? stmt->into->accessMethod : default_table_access_method; - if (strcmp(access_method, "duckdb") != 0) { - /* not a duckdb table, so don't mess with the query */ + bool is_duckdb_table = strcmp(access_method, "duckdb") == 0; + if (is_duckdb_table) { + /* + * Force skipData to false for duckdb tables, so that Postgres does + * not execute the query, and save the original value in ctas_skip_data + * so we can use it later in duckdb_create_table_trigger to choose + * whether to execute the query in DuckDB or not. + */ + ctas_skip_data = stmt->into->skipData; + stmt->into->skipData = true; + } + bool is_create_materialized_view = stmt->into->viewQuery != NULL; + bool skips_planning = stmt->into->skipData || is_create_materialized_view; + if (!skips_planning) { + // No need to plan here as well, because standard_ProcessUtility + // will already plan the query and thus get the correct columns. return; } + // TODO: Probably we should only do this if the targetlist actually + // contains some duckdb.unresolved_type or duckdb.row columns. + // XXX: This is a huge hack. Probably we should do something different + // here. The current hack doesn't work with materialized views yet. 
+ + List *rewritten; + PlannedStmt *plan; + Query *original_query = castNode(Query, stmt->query); + Query *query = (Query *)copyObjectImpl(original_query); /* - * Force skipData to false for duckdb tables, so that Postgres does - * not execute the query, and save the original value in ctas_skip_data - * so we can use it later in duckdb_create_table_trigger to choose - * whether to execute the query in DuckDB or not. + * Parse analysis was done already, but we still have to run the rule + * rewriter. We do not do AcquireRewriteLocks: we assume the query + * either came straight from the parser, or suitable locks were + * acquired by plancache.c. */ - ctas_skip_data = stmt->into->skipData; - stmt->into->skipData = true; + rewritten = QueryRewrite(query); + + /* SELECT should never rewrite to more or less than one SELECT query */ + if (list_length(rewritten) != 1) + elog(ERROR, "unexpected rewrite result for CREATE TABLE AS SELECT"); + query = linitial_node(Query, rewritten); + Assert(query->commandType == CMD_SELECT); + + /* plan the query */ + plan = pg_plan_query(query, query_string, CURSOR_OPT_PARALLEL_OK, params); + ctas_original_target_list = original_query->targetList; + original_query->targetList = plan->planTree->targetlist; + } else if (IsA(parsetree, CreateSchemaStmt) && !pgduckdb::doing_motherduck_sync) { auto stmt = castNode(CreateSchemaStmt, parsetree); if (stmt->schemaname) { @@ -125,7 +163,7 @@ DuckdbUtilityHook_Cpp(PlannedStmt *pstmt, const char *query_string, bool read_on bool prev_top_level_ddl = top_level_ddl; top_level_ddl = context == PROCESS_UTILITY_TOPLEVEL; - DuckdbHandleDDL(parsetree); + DuckdbHandleDDL(pstmt, query_string, params); prev_process_utility_hook(pstmt, query_string, read_only_tree, context, params, query_env, dest, qc); top_level_ddl = prev_top_level_ddl; @@ -345,6 +383,7 @@ DECLARE_PG_FUNCTION(duckdb_create_table_trigger) { if (IsA(parsetree, CreateTableAsStmt) && !ctas_skip_data) { auto stmt = castNode(CreateTableAsStmt, 
parsetree); ctas_query = (Query *)stmt->query; + ctas_query->targetList = ctas_original_target_list; } pgduckdb::DuckDBQueryOrThrow(*connection, create_table_string); diff --git a/test/regression/expected/create_table_as.out b/test/regression/expected/create_table_as.out index aee30e06..b15334e7 100644 --- a/test/regression/expected/create_table_as.out +++ b/test/regression/expected/create_table_as.out @@ -1,5 +1,5 @@ \set pwd `pwd` -CREATE TABLE webpages AS SELECT * FROM read_csv(:'pwd' || '/data/web_page.csv') as (column00 int, column01 text, column02 date); +CREATE TABLE webpages AS SELECT r['column00'], r['column01'], r['column02'] FROM read_csv(:'pwd' || '/data/web_page.csv') r; select * from webpages order by column00 limit 2; column00 | column01 | column02 ----------+------------------+------------ diff --git a/test/regression/expected/materialized_view.out b/test/regression/expected/materialized_view.out index ec0940e8..dd2a6053 100644 --- a/test/regression/expected/materialized_view.out +++ b/test/regression/expected/materialized_view.out @@ -47,38 +47,29 @@ INSERT INTO t_csv VALUES (1,1),(2,2),(3,3); \set pwd `pwd` \set csv_file_path '\'' :pwd '/tmp_check/t_csv.csv' '\'' COPY t_csv TO :csv_file_path (FORMAT CSV, HEADER TRUE, DELIMITER ','); -CREATE MATERIALIZED VIEW mv_csv AS SELECT * FROM read_csv(:csv_file_path) AS (a BIGINT); +CREATE MATERIALIZED VIEW mv_csv AS SELECT * FROM read_csv(:csv_file_path); +ERROR: SELECT rule's target entry 1 has different type from column "a" +DETAIL: SELECT target entry has type duckdb."row", but column has type bigint. 
SELECT COUNT(*) FROM mv_csv; - count -------- - 3 -(1 row) - +ERROR: relation "mv_csv" does not exist +LINE 1: SELECT COUNT(*) FROM mv_csv; + ^ SELECT * FROM mv_csv; - a ---- - 1 - 2 - 3 -(3 rows) - +ERROR: relation "mv_csv" does not exist +LINE 1: SELECT * FROM mv_csv; + ^ INSERT INTO t_csv VALUES (4,4); COPY t_csv TO :csv_file_path (FORMAT CSV, HEADER TRUE, DELIMITER ','); REFRESH MATERIALIZED VIEW mv_csv; +ERROR: relation "mv_csv" does not exist SELECT COUNT(*) FROM mv_csv; - count -------- - 4 -(1 row) - +ERROR: relation "mv_csv" does not exist +LINE 1: SELECT COUNT(*) FROM mv_csv; + ^ SELECT * FROM mv_csv; - a ---- - 1 - 2 - 3 - 4 -(4 rows) - +ERROR: relation "mv_csv" does not exist +LINE 1: SELECT * FROM mv_csv; + ^ DROP MATERIALIZED VIEW mv_csv; +ERROR: materialized view "mv_csv" does not exist DROP TABLE t_csv; diff --git a/test/regression/expected/temporary_tables.out b/test/regression/expected/temporary_tables.out index ac87ab7d..ba28d24b 100644 --- a/test/regression/expected/temporary_tables.out +++ b/test/regression/expected/temporary_tables.out @@ -207,7 +207,7 @@ DROP TABLE t; CREATE TEMP TABLE t(a int) ON COMMIT DROP; ERROR: DuckDB does not support ON COMMIT DROP -- CTAS fully in Duckdb -CREATE TEMP TABLE webpages USING duckdb AS SELECT * FROM read_csv('../../data/web_page.csv') as (column00 int, column01 text, column02 date); +CREATE TEMP TABLE webpages USING duckdb AS SELECT r['column00'], r['column01'], r['column02'] FROM read_csv('../../data/web_page.csv') r; SELECT * FROM webpages ORDER BY column00 LIMIT 2; column00 | column01 | column02 ----------+------------------+------------ @@ -247,7 +247,7 @@ NOTICE: result: database_name schema_name sql VARCHAR VARCHAR VARCHAR [ Rows: 2] pg_temp main CREATE TABLE t(b INTEGER); -pg_temp main CREATE TABLE webpages(column00 INTEGER, column01 VARCHAR, column02 DATE); +pg_temp main CREATE TABLE webpages(column00 BIGINT, column01 VARCHAR, column02 DATE); raw_query diff --git 
a/test/regression/sql/materialized_view.sql b/test/regression/sql/materialized_view.sql index 729ced66..a189afa5 100644 --- a/test/regression/sql/materialized_view.sql +++ b/test/regression/sql/materialized_view.sql @@ -26,7 +26,7 @@ INSERT INTO t_csv VALUES (1,1),(2,2),(3,3); COPY t_csv TO :csv_file_path (FORMAT CSV, HEADER TRUE, DELIMITER ','); -CREATE MATERIALIZED VIEW mv_csv AS SELECT * FROM read_csv(:csv_file_path) AS (a BIGINT); +CREATE MATERIALIZED VIEW mv_csv AS SELECT * FROM read_csv(:csv_file_path); SELECT COUNT(*) FROM mv_csv; SELECT * FROM mv_csv; diff --git a/test/regression/sql/temporary_tables.sql b/test/regression/sql/temporary_tables.sql index cb0209ca..45f94f14 100644 --- a/test/regression/sql/temporary_tables.sql +++ b/test/regression/sql/temporary_tables.sql @@ -148,7 +148,7 @@ DROP TABLE t; CREATE TEMP TABLE t(a int) ON COMMIT DROP; -- CTAS fully in Duckdb -CREATE TEMP TABLE webpages USING duckdb AS SELECT * FROM read_csv('../../data/web_page.csv') as (column00 int, column01 text, column02 date); +CREATE TEMP TABLE webpages USING duckdb AS SELECT r['column00'], r['column01'], r['column02'] FROM read_csv('../../data/web_page.csv') r; SELECT * FROM webpages ORDER BY column00 LIMIT 2; CREATE TEMP TABLE t_heap(a int) USING heap; From 69b044e758e077f97d1d81f85d6c6bdc66a661b7 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Fri, 13 Dec 2024 17:23:40 +0100 Subject: [PATCH 05/40] Add comment about bug so I don't forget --- src/vendor/pg_ruleutils_17.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index efcdf9e7..606a2fc7 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -6168,6 +6168,9 @@ get_target_list(List *targetList, deparse_context *context, } RangeTblEntry* rte = rt_fetch(varno, dpns->rtable); char *original_column = strVal(list_nth(rte->eref->colnames, varattno - 1)); + /* BUG: Thil logic breaks the following query: + * SELECT * FROM (SELECT 
r['column00'] FROM read_csv('/home/jelte/work/pg_duckdb/test/regression/data/web_page.csv') r limit 1); + */ duckdb_row_needs_as = strcmp(original_column, colname) != 0; } From 556d4761ec5eaf161208cc45aaa9b1b246168619 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 18 Dec 2024 10:28:45 +0100 Subject: [PATCH 06/40] Add support for duckdb.query --- sql/pg_duckdb--0.2.0--0.3.0.sql | 6 ++++++ src/pgduckdb_metadata_cache.cpp | 11 +++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index ba554590..f371e921 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql @@ -814,3 +814,9 @@ RETURNS SETOF duckdb.row SET search_path = pg_catalog, pg_temp AS 'MODULE_PATHNAME', 'duckdb_only_function' LANGUAGE C; + +CREATE FUNCTION duckdb.query(query text) +RETURNS SETOF duckdb.row +SET search_path = pg_catalog, pg_temp +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; diff --git a/src/pgduckdb_metadata_cache.cpp b/src/pgduckdb_metadata_cache.cpp index 3fac74af..6d892de7 100644 --- a/src/pgduckdb_metadata_cache.cpp +++ b/src/pgduckdb_metadata_cache.cpp @@ -115,8 +115,15 @@ BuildDuckdbOnlyFunctions() { * each of the found functions is actually part of our extension before * caching its OID as a DuckDB-only function. 
*/ - const char *function_names[] = {"read_parquet", "read_csv", "iceberg_scan", "iceberg_metadata", - "iceberg_snapshots", "delta_scan", "read_json", "approx_count_distinct"}; + const char *function_names[] = {"read_parquet", + "read_csv", + "iceberg_scan", + "iceberg_metadata", + "iceberg_snapshots", + "delta_scan", + "read_json", + "approx_count_distinct", + "query"}; for (uint32_t i = 0; i < lengthof(function_names); i++) { CatCList *catlist = SearchSysCacheList1(PROCNAMEARGSNSP, CStringGetDatum(function_names[i])); From 537753bdf46f40adf759e465c26e158df43e3cd6 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Fri, 20 Dec 2024 12:58:45 +0100 Subject: [PATCH 07/40] Add missing max aggregate --- sql/pg_duckdb--0.2.0--0.3.0.sql | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index f371e921..9c231484 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql @@ -435,6 +435,12 @@ CREATE AGGREGATE list(duckdb.unresolved_type) ( FINALFUNC = duckdb_unresolved_type_final ); +CREATE AGGREGATE max(duckdb.unresolved_type) ( + SFUNC = duckdb_unresolved_type_state_trans, + STYPE = duckdb.unresolved_type, + FINALFUNC = duckdb_unresolved_type_final +); + CREATE AGGREGATE max(duckdb.unresolved_type, "any") ( SFUNC = duckdb_unresolved_type_state_trans, STYPE = duckdb.unresolved_type, From 43e084a3ab94fab4caa4e14f2ee8a84661e10d45 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 11:08:08 +0100 Subject: [PATCH 08/40] Add error hint --- src/pgduckdb_hooks.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/pgduckdb_hooks.cpp b/src/pgduckdb_hooks.cpp index dc0dbf47..b901802f 100644 --- a/src/pgduckdb_hooks.cpp +++ b/src/pgduckdb_hooks.cpp @@ -37,6 +37,7 @@ static planner_hook_type prev_planner_hook = NULL; static ExecutorStart_hook_type prev_executor_start_hook = NULL; static ExecutorFinish_hook_type prev_executor_finish_hook = 
NULL; static ExplainOneQuery_hook_type prev_explain_one_query_hook = NULL; +static emit_log_hook_type prev_emit_log_hook = NULL; static bool ContainsCatalogTable(List *rtes) { @@ -353,6 +354,25 @@ DuckdbExplainOneQueryHook(Query *query, int cursorOptions, IntoClause *into, Exp prev_explain_one_query_hook(query, cursorOptions, into, es, queryString, params, queryEnv); } +static void +DuckdbEmitLogHook(ErrorData *edata) { + if (prev_emit_log_hook) { + prev_emit_log_hook(edata); + } + + if (edata->elevel == ERROR && edata->sqlerrcode == ERRCODE_UNDEFINED_COLUMN && pgduckdb::IsExtensionRegistered()) { + /* + * XXX: It would be nice if we could check if the query contained any + * of the functions, but we don't have access to the query string here. + * So instead we simply always add this HINT for this specific error if + * the pg_duckdb extension is installed. + */ + edata->hint = pstrdup( + "If you use DuckDB functions like read_parquet, you need to use the r['colname'] syntax to use columns. If " + "you're already doing that, maybe you forgot to give the function the r alias."); + } +} + void DuckdbInitHooks(void) { prev_planner_hook = planner_hook; @@ -367,5 +387,8 @@ DuckdbInitHooks(void) { prev_explain_one_query_hook = ExplainOneQuery_hook ? ExplainOneQuery_hook : standard_ExplainOneQuery; ExplainOneQuery_hook = DuckdbExplainOneQueryHook; + prev_emit_log_hook = 
emit_log_hook : NULL; + emit_log_hook = DuckdbEmitLogHook; + DuckdbInitUtilityHook(); } From 926d1b5965c97d88a9b9d320e3dffaad376016d6 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 11:43:52 +0100 Subject: [PATCH 09/40] Ignore warning about declarations after statements --- Makefile | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile b/Makefile index c3a52bf1..45e98e86 100644 --- a/Makefile +++ b/Makefile @@ -46,6 +46,13 @@ COMPILER_FLAGS=-Wno-sign-compare -Wshadow -Wswitch -Wunused-parameter -Wunreacha override PG_CPPFLAGS += -Iinclude -isystem third_party/duckdb/src/include -isystem third_party/duckdb/third_party/re2 ${COMPILER_FLAGS} override PG_CXXFLAGS += -std=c++17 ${DUCKDB_BUILD_CXX_FLAGS} ${COMPILER_FLAGS} -Wno-register +# Ignore declaration-after-statement warnings in our code. Postgres enforces +# this because their ancient style guide requires it, but we don't care. It +# would only apply to C files anyway, and we don't have many of those. The only +# ones that we do have are vendored in from Postgres (ruleutils), and allowing +# declarations to be anywhere is even a good thing for those as we can keep our +# changes to the vendored code in one place. 
+override PG_CFLAGS += -Wno-declaration-after-statement SHLIB_LINK += -Wl,-rpath,$(PG_LIB)/ -lpq -Lthird_party/duckdb/build/$(DUCKDB_BUILD_TYPE)/src -L$(PG_LIB) -lduckdb -lstdc++ -llz4 From e312a3b5d61be5bb4de5332466599420e671ae79 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 13:28:56 +0100 Subject: [PATCH 10/40] Fix warning --- src/pgduckdb_ruleutils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index 0ca738ef..b0e3c757 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -105,7 +105,7 @@ pgduckdb_duckdb_row_subscript_var(Expr *expr) { List * pgduckdb_star_start_vars(List *target_list) { List *star_start_indexes = NIL; - Var *possible_star_start_var; + Var *possible_star_start_var = NULL; int possible_star_start_var_index = 0; int i = 0; From e46b49f3b7ad5ccbea8eabef4353cc7b92f083e2 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 13:51:11 +0100 Subject: [PATCH 11/40] Don't rename subscript expressions in subqueries --- include/pgduckdb/pgduckdb_ruleutils.h | 2 + src/pgduckdb_ruleutils.cpp | 7 +++ src/vendor/pg_ruleutils_17.c | 19 ++++++-- test/regression/expected/read_functions.out | 48 +++++++++++++++++++++ test/regression/sql/read_functions.sql | 27 ++++++++++++ 5 files changed, 99 insertions(+), 4 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index 5c057b17..21e9a3b2 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -13,3 +13,5 @@ bool pgduckdb_var_is_duckdb_row(Var *var); bool pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc); Var *pgduckdb_duckdb_row_subscript_var(Expr *expr); List *pgduckdb_star_start_vars(List *target_list); + +extern bool processed_targetlist; diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index b0e3c757..071f2ded 100644 --- a/src/pgduckdb_ruleutils.cpp 
+++ b/src/pgduckdb_ruleutils.cpp @@ -28,6 +28,8 @@ extern "C" { #include "pgduckdb/pgduckdb_metadata_cache.hpp" extern "C" { +bool processed_targetlist = false; + char * pgduckdb_function_name(Oid function_oid) { if (!pgduckdb::IsDuckdbOnlyFunction(function_oid)) { @@ -273,9 +275,14 @@ pgduckdb_relation_name(Oid relation_oid) { * DuckDB understands). The reason this is not part of * pgduckdb_pg_get_querydef_internal is because we want to avoid changing that * vendored in function as much as possible to keep updates easy. + * + * Apart from that it also sets the processed_targetlist variable to false, + * which we use in get_target_list to determine if we're processing the + * outermost targetlist or not. */ char * pgduckdb_get_querydef(Query *query) { + processed_targetlist = false; auto save_nestlevel = NewGUCNestLevel(); SetConfigOption("DateStyle", "ISO, YMD", PGC_USERSET, PGC_S_SESSION); char *result = pgduckdb_pg_get_querydef_internal(query, false); diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index 606a2fc7..ab25e072 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -6048,6 +6048,9 @@ get_target_list(List *targetList, deparse_context *context, int varattno_star = 0; bool added_star = false; + bool outermost_targetlist = !processed_targetlist; + processed_targetlist = true; + int i = 0; foreach(l, targetList) { @@ -6140,9 +6143,20 @@ get_target_list(List *targetList, deparse_context *context, else colname = tle->resname; + /* + * This makes sure we don't add Postgres its bad default alias to the + * duckdb.row type. + */ bool duckdb_row_needs_as = !pgduckdb_var_is_duckdb_row(var); + + /* + * For r['abc'] we don't want the column name to be r, but instead we + * want it to be "abc". We can only do this for the target list of + * the outermost query though to make sure references to the column + * name are still valid.
+ */ Var *subscript_var = pgduckdb_duckdb_row_subscript_var(tle->expr); - if (subscript_var) { + if (outermost_targetlist && subscript_var) { deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, subscript_var->varlevelsup); int varno; @@ -6168,9 +6182,6 @@ get_target_list(List *targetList, deparse_context *context, } RangeTblEntry* rte = rt_fetch(varno, dpns->rtable); char *original_column = strVal(list_nth(rte->eref->colnames, varattno - 1)); - /* BUG: Thil logic breaks the following query: - * SELECT * FROM (SELECT r['column00'] FROM read_csv('/home/jelte/work/pg_duckdb/test/regression/data/web_page.csv') r limit 1); - */ duckdb_row_needs_as = strcmp(original_column, colname) != 0; } diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 9d9226ce..b3f4d5e8 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -60,6 +60,54 @@ SELECT r['sepal.length'], r['filename'] SELECT * FROM read_csv('../../non-existing-file.csv'); ERROR: (PGDuckDB/CreatePlan) Prepared query returned an error: 'IO Error: No files found that match the pattern "../../non-existing-file.csv" +-- We override Postgres its default column name for subscript expressions. In +-- the following example the column would normally be named "r", which is +-- pretty non-descriptive especially when selecting multiple columns from the +-- same row. 
+-- +-- NOTE: Jelte tried to change this behaviour in upstream Postgres, but met +-- with some resistance: +-- https://www.postgresql.org/message-id/flat/CAGECzQRYAFHLnjjymsSPhL-9OExVyNfMQkZMc1hcoUQ6dDHo=Q@mail.gmail.com +SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1; + column00 +---------- + 1 +(1 row) + +-- If an explicit column name is given we use it. +SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1; + col1 +------ + 1 +(1 row) + +-- ...except if that explicit column name is the same as the default column +-- name. In which case we fail to recognize that fact. +-- XXX: It would be nice to fix this, but it's not a high priority. +SELECT r['column00'] AS r FROM read_csv('../../data/web_page.csv') r limit 1; + column00 +---------- + 1 +(1 row) + +-- If we use the same trick inside subqueries, then references to columns from +-- that subquery would not use that better name, and thus the query could not +-- be executed. To avoid that we simply don't rename a subscript expression +-- inside a subquery, and only do so in the outermost SELECT list (aka +-- targetlist). +SELECT * FROM (SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1); + r +--- + 1 +(1 row) + +-- If you give it a different alias then that alias is propagated through. +SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1); + col1 +------ + 1 +(1 row) + -- delta_scan SELECT duckdb.install_extension('delta'); install_extension diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index a9bd5851..be657413 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -23,6 +23,33 @@ SELECT r['sepal.length'], r['filename'] SELECT * FROM read_csv('../../non-existing-file.csv'); +-- We override Postgres its default column name for subscript expressions.
In +-- the following example the column would normally be named "r", which is +-- pretty non-descriptive especially when selecting multiple columns from the +-- same row. +-- +-- NOTE: Jelte tried to change this behaviour in upstream Postgres, but met +-- with some resistance: +-- https://www.postgresql.org/message-id/flat/CAGECzQRYAFHLnjjymsSPhL-9OExVyNfMQkZMc1hcoUQ6dDHo=Q@mail.gmail.com +SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1; + +-- If an explicit column name is given we use it. +SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1; + +-- ...except if that explicit column name is the same as the default column +-- name. In which case we fail to recognize that fact. +-- XXX: It would be nice to fix this, but it's not a high priority. +SELECT r['column00'] AS r FROM read_csv('../../data/web_page.csv') r limit 1; + +-- If we use the same trick inside subqueries, then references to columns from +-- that subquery would not use that better name, and thus the query could not +-- be executed. To avoid that we simply don't rename a subscript expression +-- inside a subquery, and only do so in the outermost SELECT list (aka +-- targetlist). +SELECT * FROM (SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1); + +-- If you give it a different alias then that alias is propagated through.
+SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1); -- delta_scan From aa3d72b37c6e855f4a368f4f9feebb652feb343b Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 15:52:44 +0100 Subject: [PATCH 12/40] Fix materialized views --- src/pgduckdb_ddl.cpp | 51 +++++++++++++++++-- .../regression/expected/materialized_view.out | 41 +++++++++------ 2 files changed, 71 insertions(+), 21 deletions(-) diff --git a/src/pgduckdb_ddl.cpp b/src/pgduckdb_ddl.cpp index bd491aff..2a4a0cb9 100644 --- a/src/pgduckdb_ddl.cpp +++ b/src/pgduckdb_ddl.cpp @@ -12,7 +12,9 @@ extern "C" { #include "funcapi.h" #include "nodes/print.h" #include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" #include "optimizer/optimizer.h" +#include "parser/analyze.h" #include "tcop/utility.h" #include "utils/builtins.h" #include "utils/syscache.h" @@ -45,6 +47,41 @@ static List *ctas_original_target_list = NIL; static bool top_level_ddl = true; static ProcessUtility_hook_type prev_process_utility_hook = NULL; +static Query * +WrapQueryInQueryCall(Query *query, List *target_list) { + char *duckdb_query_string = pgduckdb_get_querydef(query); + + StringInfo buf = makeStringInfo(); + appendStringInfo(buf, "SELECT "); + bool first = true; + foreach_node(TargetEntry, tle, target_list) { + if (!first) { + appendStringInfoString(buf, ", "); + } + + Oid type = exprType((Node *)tle->expr); + Oid typemod = exprTypmod((Node *)tle->expr); + first = false; + appendStringInfo(buf, "r[%s]::%s AS %s", quote_literal_cstr(tle->resname), + format_type_with_typemod(type, typemod), quote_identifier(tle->resname)); + } + + appendStringInfo(buf, " FROM duckdb.query(%s) r", quote_literal_cstr(duckdb_query_string)); + + List *parsetree_list = pg_parse_query(buf->data); + + /* + * We only allow a single user statement in a prepared statement. 
This is + * mainly to keep the protocol simple --- otherwise we'd need to worry + * about multiple result tupdescs and things like that. + */ + if (list_length(parsetree_list) > 1) + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), errmsg("BUG: pg_duckdb generated a command with multiple queries"))); + + return parse_analyze_fixedparams((RawStmt *)linitial(parsetree_list), buf->data, NULL, 0, NULL); +} + static void DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo params) { if (!pgduckdb::IsExtensionRegistered()) { @@ -82,8 +119,8 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para // here. The current hack doesn't work with materialized views yet. List *rewritten; - PlannedStmt *plan; Query *original_query = castNode(Query, stmt->query); + Query *query = (Query *)copyObjectImpl(original_query); /* * Parse analysis was done already, but we still have to run the rule @@ -99,10 +136,14 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para query = linitial_node(Query, rewritten); Assert(query->commandType == CMD_SELECT); - /* plan the query */ - plan = pg_plan_query(query, query_string, CURSOR_OPT_PARALLEL_OK, params); - ctas_original_target_list = original_query->targetList; - original_query->targetList = plan->planTree->targetlist; + Query *rewritten_query_copy = (Query *)copyObjectImpl(query); + + PlannedStmt *plan = pg_plan_query(query, query_string, CURSOR_OPT_PARALLEL_OK, params); + + List *target_list = plan->planTree->targetlist; + + stmt->query = (Node *)WrapQueryInQueryCall(rewritten_query_copy, target_list); + stmt->into->viewQuery = (Node *)copyObjectImpl(stmt->query); } else if (IsA(parsetree, CreateSchemaStmt) && !pgduckdb::doing_motherduck_sync) { auto stmt = castNode(CreateSchemaStmt, parsetree); diff --git a/test/regression/expected/materialized_view.out b/test/regression/expected/materialized_view.out index dd2a6053..88b98440 100644 --- 
a/test/regression/expected/materialized_view.out +++ b/test/regression/expected/materialized_view.out @@ -48,28 +48,37 @@ INSERT INTO t_csv VALUES (1,1),(2,2),(3,3); \set csv_file_path '\'' :pwd '/tmp_check/t_csv.csv' '\'' COPY t_csv TO :csv_file_path (FORMAT CSV, HEADER TRUE, DELIMITER ','); CREATE MATERIALIZED VIEW mv_csv AS SELECT * FROM read_csv(:csv_file_path); -ERROR: SELECT rule's target entry 1 has different type from column "a" -DETAIL: SELECT target entry has type duckdb."row", but column has type bigint. SELECT COUNT(*) FROM mv_csv; -ERROR: relation "mv_csv" does not exist -LINE 1: SELECT COUNT(*) FROM mv_csv; - ^ + count +------- + 3 +(1 row) + SELECT * FROM mv_csv; -ERROR: relation "mv_csv" does not exist -LINE 1: SELECT * FROM mv_csv; - ^ + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 +(3 rows) + INSERT INTO t_csv VALUES (4,4); COPY t_csv TO :csv_file_path (FORMAT CSV, HEADER TRUE, DELIMITER ','); REFRESH MATERIALIZED VIEW mv_csv; -ERROR: relation "mv_csv" does not exist SELECT COUNT(*) FROM mv_csv; -ERROR: relation "mv_csv" does not exist -LINE 1: SELECT COUNT(*) FROM mv_csv; - ^ + count +------- + 4 +(1 row) + SELECT * FROM mv_csv; -ERROR: relation "mv_csv" does not exist -LINE 1: SELECT * FROM mv_csv; - ^ + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 +(4 rows) + DROP MATERIALIZED VIEW mv_csv; -ERROR: materialized view "mv_csv" does not exist DROP TABLE t_csv; From 6055d593810e4a65f281955c753de7783240fd4f Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 16:02:56 +0100 Subject: [PATCH 13/40] Only do CTAS type detection using duckdb for unresolved types --- include/pgduckdb/pgduckdb_ruleutils.h | 1 + src/pgduckdb_ddl.cpp | 36 +++++++++++++++++++++------ src/pgduckdb_ruleutils.cpp | 15 +++++++++++ 3 files changed, 45 insertions(+), 7 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index 21e9a3b2..28233b04 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ 
b/include/pgduckdb/pgduckdb_ruleutils.h
@@ -11,6 +11,7 @@ bool pgduckdb_is_duckdb_row(Oid type_oid);
 bool pgduckdb_is_unresolved_type(Oid type_oid);
 bool pgduckdb_var_is_duckdb_row(Var *var);
 bool pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc);
+bool pgduckdb_target_list_contains_unresolved_type_or_row(List *target_list);
 
 Var *pgduckdb_duckdb_row_subscript_var(Expr *expr);
 List *pgduckdb_star_start_vars(List *target_list);
diff --git a/src/pgduckdb_ddl.cpp b/src/pgduckdb_ddl.cpp
index 2a4a0cb9..bae49a7b 100644
--- a/src/pgduckdb_ddl.cpp
+++ b/src/pgduckdb_ddl.cpp
@@ -47,8 +47,21 @@ static List *ctas_original_target_list = NIL;
 static bool top_level_ddl = true;
 static ProcessUtility_hook_type prev_process_utility_hook = NULL;
 
+/*
+ * WrapQueryInQueryCall takes a query and wraps it in a duckdb.query(...) call.
+ * It then explicitly references all the columns and the types from the
+ * original query's target list. So a query like this:
+ *
+ * SELECT r from read_csv('file.csv') r;
+ *
+ * Would expand to:
+ *
+ * SELECT r['id']::int AS id, r['name']::text AS name
+ * FROM duckdb.query('SELECT * from system.main.read_csv(''file.csv'')') r;
+ *
+ */
 static Query *
-WrapQueryInQueryCall(Query *query, List *target_list) {
+WrapQueryInDuckdbQueryCall(Query *query, List *target_list) {
 	char *duckdb_query_string = pgduckdb_get_querydef(query);
 
 	StringInfo buf = makeStringInfo();
@@ -113,13 +126,21 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para
 		return;
 	}
 
-	// TODO: Probably we should only do this if the targetlist actually
-	// contains some duckdb.unresolved_type or duckdb.row columns.
-	// XXX: This is a huge hack. Probably we should do something different
-	// here. The current hack doesn't work with materialized views yet.
+	Query *original_query = castNode(Query, stmt->query);
+	// We need to do hacky things if the targetlist contains
+	// duckdb.unresolved_type or duckdb.row columns.
In those cases we want + // to run the query through duckdb to get the actual result types for + // these queries. We also want to lock in those types creating a new + // query that will always retun them. + if (!pgduckdb_target_list_contains_unresolved_type_or_row(original_query->targetList)) { + // If the target list doesn't contain duckdb.row or + // duckdb.unresolved_type though, we are done now. + return; + } + + /* NOTE: The below code is mostly copied from ExecCreateTableAs */ List *rewritten; - Query *original_query = castNode(Query, stmt->query); Query *query = (Query *)copyObjectImpl(original_query); /* @@ -140,9 +161,10 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para PlannedStmt *plan = pg_plan_query(query, query_string, CURSOR_OPT_PARALLEL_OK, params); + /* This is where our custom code starts again */ List *target_list = plan->planTree->targetlist; - stmt->query = (Node *)WrapQueryInQueryCall(rewritten_query_copy, target_list); + stmt->query = (Node *)WrapQueryInDuckdbQueryCall(rewritten_query_copy, target_list); stmt->into->viewQuery = (Node *)copyObjectImpl(stmt->query); } else if (IsA(parsetree, CreateSchemaStmt) && !pgduckdb::doing_motherduck_sync) { diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index 071f2ded..99a66bdb 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -73,6 +73,21 @@ pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc) { return pgduckdb_is_duckdb_row(func_expr->funcresulttype); } +bool +pgduckdb_target_list_contains_unresolved_type_or_row(List *target_list) { + foreach_node(TargetEntry, tle, target_list) { + Oid type = exprType((Node *)tle->expr); + if (pgduckdb_is_unresolved_type(type)) { + return true; + } + + if (pgduckdb_is_duckdb_row(type)) { + return true; + } + } + return false; +} + /* * Returns NULL if the expression is not a subscript on a duckdb row. Returns * the Var of the duckdb row if it is. 
From 284795b0c94c8efc0be1eb7da39342622f2c14ae Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 16:50:32 +0100 Subject: [PATCH 14/40] Use C duckdb_only_function handler for approx_count_distinct --- sql/pg_duckdb--0.2.0--0.3.0.sql | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index 9c231484..ed91cccc 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql @@ -1,12 +1,8 @@ CREATE FUNCTION @extschema@.approx_count_distinct_sfunc(bigint, anyelement) -RETURNS bigint LANGUAGE 'plpgsql' -SET search_path = pg_catalog, pg_temp -AS -$func$ -BEGIN - RAISE EXCEPTION 'Aggregate `approx_count_distinct(ANYELEMENT)` only works with Duckdb execution.'; -END; -$func$; +RETURNS bigint +AS 'MODULE_PATHNAME', 'duckdb_only_function' +LANGUAGE C; + CREATE AGGREGATE @extschema@.approx_count_distinct(anyelement) ( From 3ce96f71493b69d8b9516ca0d3ba9ba7098e329f Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 14 Jan 2025 18:19:02 +0100 Subject: [PATCH 15/40] Support array/json indexing --- src/pgduckdb_options.cpp | 36 +++++++++++---------- src/vendor/pg_ruleutils_17.c | 20 +++++++++++- test/regression/expected/read_functions.out | 7 ++++ test/regression/sql/read_functions.sql | 3 ++ 4 files changed, 48 insertions(+), 18 deletions(-) diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index 98576c52..bcaa4b22 100644 --- a/src/pgduckdb_options.cpp +++ b/src/pgduckdb_options.cpp @@ -388,11 +388,7 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P elog(ERROR, "Subscripting duckdb.row with an empty subscript is not supported"); } - if (list_length(indirection) != 1) { - /* TODO: Fix this, if the column contains an array we should be able to subscript that type */ - elog(ERROR, "Subscripting duckdb.row is only supported with a single subscript"); - } - pprint(indirection); + bool first = true; 
// Transform each subscript expression foreach_node(A_Indices, subscript, indirection) { @@ -400,23 +396,29 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P Assert(subscript->uidx); Node *subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind); - pprint(subscript_expr); int expr_location = exprLocation(subscript->uidx); Oid subscript_expr_type = exprType(subscript_expr); - Node *coerced_expr = coerce_to_target_type(pstate, subscript_expr, subscript_expr_type, TEXTOID, -1, - COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, expr_location); - if (!coerced_expr) { - ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must have text type"), - parser_errposition(pstate, expr_location))); - } + /* The first subscript needs to be a TEXT constant, since it should be + * a column reference. But the subscripts after that can be anything, + * DuckDB should interpret those. */ + if (first) { + Node *coerced_expr = coerce_to_target_type(pstate, subscript_expr, subscript_expr_type, TEXTOID, -1, + COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, expr_location); + if (!coerced_expr) { + ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must have text type"), + parser_errposition(pstate, expr_location))); + } + + if (!IsA(subscript_expr, Const)) { + ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must be a constant"), + parser_errposition(pstate, expr_location))); + } - if (!IsA(subscript_expr, Const)) { - pprint(subscript_expr); - ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must be a constant"), - parser_errposition(pstate, expr_location))); + subscript_expr = coerced_expr; + first = false; } - sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, coerced_expr); + sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, subscript_expr); } /* TODO: Trigger cache population, probably we should do this 
somewhere else */ diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index ab25e072..d3de77e5 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -9129,9 +9129,15 @@ get_rule_expr(Node *node, deparse_context *context, get_rule_expr(refassgnexpr, context, showimplicit); } else if (IsA(sbsref->refexpr, Var) && pgduckdb_var_is_duckdb_row((Var*) sbsref->refexpr)) { + /* Subscript expressions into the duckdb.row type we want to + * change to regular column references in the DuckDB query. + * Both because it's generally more common and thus + * results in better optimized queries, and because + * iceberg_scan doesn't support the subscripting syntax on + * its results. + */ Assert(sbsref->refupperindexpr); Assert(!sbsref->reflowerindexpr); - Assert(list_length(sbsref->refupperindexpr) == 1); Var *var = (Var *) sbsref->refexpr; deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, @@ -9166,6 +9172,18 @@ get_rule_expr(Node *node, deparse_context *context, appendStringInfoChar(buf, '.'); } appendStringInfoString(context->buf, quote_identifier(extval)); + + if (list_length(sbsref->refupperindexpr) > 1) { + /* + * If there are any additional subscript expressions we + * should output them. Subscripts can be used in duckdb + * to index into arrays or json objects. 
+ */ + SubscriptingRef *shorter_sbsref = copyObject(sbsref); + /* strip the first subscript from the list */ + shorter_sbsref->refupperindexpr = list_delete_first(shorter_sbsref->refupperindexpr); + printSubscripts(shorter_sbsref, context); + } } else { diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index b3f4d5e8..35871439 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -29,6 +29,13 @@ SELECT r['sepal.length'], r['file_row_number'], r['filename'] 4.5 | 41 | ../../data/iris.parquet (5 rows) +-- Supports subscripts +SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable.parquet') r; + jsoncol[1] | arraycol[2] +------------+------------- + "d" | 22 +(1 row) + -- read_csv SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; count diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index be657413..c17c1d95 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -11,6 +11,9 @@ SELECT r['sepal.length'], r['file_row_number'], r['filename'] FROM read_parquet('../../data/iris.parquet', file_row_number => true, filename => true) r ORDER BY r['sepal.length'] LIMIT 5; +-- Supports subscripts +SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable.parquet') r; + -- read_csv SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; From 7678dbcd201cdcbc1e1fe1f1a92e2fa0ce562630 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 15 Jan 2025 11:07:34 +0100 Subject: [PATCH 16/40] Fix CTAS --- src/pgduckdb_ddl.cpp | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/pgduckdb_ddl.cpp b/src/pgduckdb_ddl.cpp index bae49a7b..6b2e0c8e 100644 --- a/src/pgduckdb_ddl.cpp +++ b/src/pgduckdb_ddl.cpp @@ -42,7 +42,6 @@ extern "C" { * tables we 
force this value to false. */ static bool ctas_skip_data = false; -static List *ctas_original_target_list = NIL; static bool top_level_ddl = true; static ProcessUtility_hook_type prev_process_utility_hook = NULL; @@ -118,24 +117,32 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para ctas_skip_data = stmt->into->skipData; stmt->into->skipData = true; } + bool is_create_materialized_view = stmt->into->viewQuery != NULL; bool skips_planning = stmt->into->skipData || is_create_materialized_view; if (!skips_planning) { // No need to plan here as well, because standard_ProcessUtility - // will already plan the query and thus get the correct columns. + // will already plan the query and thus get the correct columns and + // their types. return; } Query *original_query = castNode(Query, stmt->query); - // We need to do hacky things if the targetlist contains - // duckdb.unresolved_type or duckdb.row columns. In those cases we want - // to run the query through duckdb to get the actual result types for - // these queries. We also want to lock in those types creating a new - // query that will always retun them. + // For cases where Postgres does not usually plan the query, we need + // to do hacky things if the targetlist contains duckdb.unresolved_type + // or duckdb.row columns. In those cases we want to run the query + // through duckdb to get the actual result types for these queries, + // which won't happen automatically if the query is not planned by + // Postgres. So we will manually do it here. + // + // If we're doing that anyway we might as well slightly change the + // query so that it always returns the types that are expected by + // Postgres. This is especially useful for materialized views, since + // the query for is likely to be run many times. 
if (!pgduckdb_target_list_contains_unresolved_type_or_row(original_query->targetList)) { - // If the target list doesn't contain duckdb.row or - // duckdb.unresolved_type though, we are done now. + // ... but if the target list doesn't contain duckdb.row or + // duckdb.unresolved_type, there's no need to do any of that. return; } @@ -165,7 +172,16 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para List *target_list = plan->planTree->targetlist; stmt->query = (Node *)WrapQueryInDuckdbQueryCall(rewritten_query_copy, target_list); - stmt->into->viewQuery = (Node *)copyObjectImpl(stmt->query); + + if (is_create_materialized_view) { + /* + * If this is a materialized view we also need to update its view + * query. It's important not to set it for regular CTAS queries, + * otherwise Postgres code will assume it's a materialized view + * instead. + */ + stmt->into->viewQuery = (Node *)copyObjectImpl(stmt->query); + } } else if (IsA(parsetree, CreateSchemaStmt) && !pgduckdb::doing_motherduck_sync) { auto stmt = castNode(CreateSchemaStmt, parsetree); @@ -446,12 +462,10 @@ DECLARE_PG_FUNCTION(duckdb_create_table_trigger) { if (IsA(parsetree, CreateTableAsStmt) && !ctas_skip_data) { auto stmt = castNode(CreateTableAsStmt, parsetree); ctas_query = (Query *)stmt->query; - ctas_query->targetList = ctas_original_target_list; } pgduckdb::DuckDBQueryOrThrow(*connection, create_table_string); if (ctas_query) { - const char *ctas_query_string = pgduckdb_get_querydef(ctas_query); std::string insert_string = From 83acc3558d238b68079c8534a8d05d88127eb430 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 15 Jan 2025 11:13:09 +0100 Subject: [PATCH 17/40] Move extension registration to top of subscript transform fuction --- src/pgduckdb_options.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index bcaa4b22..52525d8e 100644 --- a/src/pgduckdb_options.cpp +++ 
b/src/pgduckdb_options.cpp @@ -376,6 +376,15 @@ DECLARE_PG_FUNCTION(pgduckdb_recycle_ddb) { void DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct ParseState *pstate, bool isSlice, bool isAssignment) { + /* + * We need to populate our cache for some of the code below. Normally this + * cache is populated at the start of our planner hook, but this function + * is being called from the parser. + */ + if (!pgduckdb::IsExtensionRegistered()) { + elog(ERROR, "BUG: Using duckdb.row but the pg_duckdb extension is not installed"); + } + if (isAssignment) { elog(ERROR, "Assignment to duckdb.row is not supported"); } @@ -421,9 +430,6 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, subscript_expr); } - /* TODO: Trigger cache population, probably we should do this somewhere else */ - pgduckdb::IsExtensionRegistered(); - // Set the result type of the subscripting operation sbsref->refrestype = pgduckdb::DuckdbUnresolvedTypeOid(); sbsref->reftypmod = -1; From a4cf3a1295c5f41d6269caf6508acc42ca5453aa Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 15 Jan 2025 11:13:43 +0100 Subject: [PATCH 18/40] Remove pprint --- src/pgduckdb_hooks.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pgduckdb_hooks.cpp b/src/pgduckdb_hooks.cpp index b901802f..b148bbba 100644 --- a/src/pgduckdb_hooks.cpp +++ b/src/pgduckdb_hooks.cpp @@ -190,7 +190,6 @@ DuckdbPlannerHook_Cpp(Query *parse, const char *query_string, int cursor_options if (pgduckdb::IsExtensionRegistered()) { if (NeedsDuckdbExecution(parse)) { IsAllowedStatement(parse, true); - pprint(parse); return DuckdbPlanNode(parse, query_string, cursor_options, bound_params, true); } else if (duckdb_force_execution && IsAllowedStatement(parse) && ContainsFromClause(parse)) { From adc717cd0c972f858864da218a80c771537e773c Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 15 Jan 2025 11:44:46 +0100 
Subject: [PATCH 19/40] More comments and tests

---
 src/pgduckdb_options.cpp                    | 26 +++++++++++++++++----
 test/regression/expected/read_functions.out | 21 +++++++++++++++++
 test/regression/sql/read_functions.sql      | 10 ++++++++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp
index 52525d8e..3a6b2e23 100644
--- a/src/pgduckdb_options.cpp
+++ b/src/pgduckdb_options.cpp
@@ -373,6 +373,12 @@ DECLARE_PG_FUNCTION(pgduckdb_recycle_ddb) {
 	PG_RETURN_BOOL(true);
 }
 
+/*
+ * DuckdbRowSubscriptTransform is called by the parser when a subscripting
+ * operation is performed on a duckdb.row. It has two main purposes:
+ * 1. Ensure that the row is being indexed using a string literal
+ * 2. Ensure that the return type of this index operation is duckdb.unresolved_type
+ */
 void
 DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct ParseState *pstate, bool isSlice,
                             bool isAssignment) {
@@ -424,6 +430,12 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P
 				                parser_errposition(pstate, expr_location)));
 			}
 
+			Const *subscript_const = castNode(Const, subscript_expr);
+			if (subscript_const->constisnull) {
+				ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("duckdb.row subscript cannot be NULL"),
+				                parser_errposition(pstate, expr_location)));
+			}
+
 			subscript_expr = coerced_expr;
 			first = false;
 		}
@@ -435,6 +447,12 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P
 	sbsref->reftypmod = -1;
 }
 
+/*
+ * DuckdbRowSubscriptExecSetup is called by the executor when a subscripting
+ * operation is performed on a duckdb.row. This should never happen, because
+ * any query that contains a duckdb.row should automatically use DuckDB
+ * execution.
+ */ void DuckdbRowSubscriptExecSetup(const SubscriptingRef * /*sbsref*/, SubscriptingRefState * /*sbsrefstate*/, SubscriptExecSteps * /*exprstate*/) { @@ -449,6 +467,10 @@ static SubscriptRoutines duckdb_subscript_routines = { .store_leakproof = true, }; +DECLARE_PG_FUNCTION(duckdb_row_subscript) { + PG_RETURN_POINTER(&duckdb_subscript_routines); +} + DECLARE_PG_FUNCTION(duckdb_row_in) { elog(ERROR, "Creating the duckdb.row type is not supported"); } @@ -465,10 +487,6 @@ DECLARE_PG_FUNCTION(duckdb_unresolved_type_out) { return textout(fcinfo); } -DECLARE_PG_FUNCTION(duckdb_row_subscript) { - PG_RETURN_POINTER(&duckdb_subscript_routines); -} - DECLARE_PG_FUNCTION(duckdb_unresolved_type_operator) { elog(ERROR, "Unresolved duckdb types cannot be used by the Postgres executor"); } diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 35871439..be84b12c 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -115,6 +115,27 @@ SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.c 1 (1 row) +-- Only simple string literals are supported as column names +SELECT r[NULL] FROM read_csv('../../data/web_page.csv') r limit 1; +ERROR: duckdb.row subscript cannot be NULL +LINE 1: SELECT r[NULL] FROM read_csv('../../data/web_page.csv') r li... + ^ +SELECT r[123] FROM read_csv('../../data/web_page.csv') r limit 1; +ERROR: duckdb.row subscript must have text type +LINE 1: SELECT r[123] FROM read_csv('../../data/web_page.csv') r lim... + ^ +SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r limit 1; +ERROR: duckdb.row subscript must have text type +LINE 1: SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r li... 
+ ^ +SELECT r[q.col] +FROM + read_csv('../../data/web_page.csv') r, + (SELECT 'abc'::text as col) q +LIMIT 1; +ERROR: duckdb.row subscript must be a constant +LINE 1: SELECT r[q.col] + ^ -- delta_scan SELECT duckdb.install_extension('delta'); install_extension diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index c17c1d95..ba3bc7f9 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -54,6 +54,16 @@ SELECT * FROM (SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r l -- If you give it a different alias then that alias is propegated though. SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1); +-- Only simple string literals are supported as column names +SELECT r[NULL] FROM read_csv('../../data/web_page.csv') r limit 1; +SELECT r[123] FROM read_csv('../../data/web_page.csv') r limit 1; +SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r limit 1; +SELECT r[q.col] +FROM + read_csv('../../data/web_page.csv') r, + (SELECT 'abc'::text as col) q +LIMIT 1; + -- delta_scan SELECT duckdb.install_extension('delta'); From 595547101f0e27022124a659aca5aa8d3578057b Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 15 Jan 2025 12:37:45 +0100 Subject: [PATCH 20/40] Even more comments and tests --- src/pgduckdb_ruleutils.cpp | 29 ++++- test/regression/expected/read_functions.out | 113 ++++++++++++++++++++ test/regression/sql/read_functions.sql | 76 +++++++++++++ 3 files changed, 217 insertions(+), 1 deletion(-) diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index 99a66bdb..433213b0 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -117,7 +117,34 @@ pgduckdb_duckdb_row_subscript_var(Expr *expr) { } /* - * Returns a list of duckdb + * In our DuckDB queries we sometimes want to use "SELECT *", when selecting + * from a function like read_parquet. 
That way DuckDB can figure out the actual
+ * columns that it should return. Sadly Postgres expands the * character from
+ * the original query to a list of columns. So we need to put a star, any time
+ * we want to replace duckdb.row columns with a "*" in the duckdb query.
+ *
+ * Since the original "*" might expand to many columns we need to remove all of
+ * those, when putting a "*" back. To do so we try to find runs of Vars from
+ * the same FROM entry, aka RangeTableEntry (RTE) that we expect were created
+ * with a *.
+ *
+ * This function tries to find the indexes of the first column for each of
+ * those runs. It does so using this heuristic:
+ *
+ * 1. Find a column with varattno == 1 (aka the first column from an RTE)
+ * 2. Find a consecutive run of columns from the same RTE with varattnos that
+ *    keep increasing by 1.
+ * 3. Once we find a duckdb.row column in any of those consecutive columns, we
+ *    assume this run was created using a star expression and we track the
+ *    initial index. Then we start at 1 again to find the next run.
+ *
+ * NOTE: This function does not find the end of such runs, that's left as an
+ * exercise for the caller. It should be pretty easy for the caller to do
+ * that, because they need to remove such columns anyway. The main reason this
+ * function exists is so that the caller doesn't have to scan backwards to
+ * find the start of a run once it finds a duckdb.row column. Scanning
+ * backwards is difficult for the caller because it wants to write out columns
+ * to the DuckDB query as it consumes them.
*/ List * pgduckdb_star_start_vars(List *target_list) { diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index be84b12c..1689428e 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -36,6 +36,119 @@ SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable "d" | 22 (1 row) +-- Subqueries correctly expand *, in case of multiple columns. +SELECT * FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix +-----------+--------------+-------------+--------------+-------------+---------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else +(1 row) + +-- NOTE: A single "r" is equivalent to a *. The prefix and postfix columns are +-- not explicitely selected, but still show up in the result. This is +-- considered a bug, but it's one that we cannot easily solve because the "r" +-- reference does not exist in the DuckDB query at all, so there's no way to +-- reference only the columns coming from that part of the subquery. Very few +-- people should be impacted by this though. +SELECT r FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix +-----------+--------------+-------------+--------------+-------------+---------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else +(1 row) + +-- ... but if you manually add the expected columns then they are merged into +-- the new star expansion. 
+SELECT prefix, r FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix +-----------+--------------+-------------+--------------+-------------+---------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else +(1 row) + +SELECT prefix, r, postfix FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix +-----------+--------------+-------------+--------------+-------------+---------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else +(1 row) + +-- This requires the prefix columns to be there though. If the prefix columns +-- are not there the postfix columns don't get merged into the new star +-- expansion automatically. +-- NOTE: Would be nice to fix this, but it's not a high priority. +SELECT r, postfix FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix | postfix +-----------+--------------+-------------+--------------+-------------+---------+----------------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else | something else +(1 row) + +-- If you swap them around, they will get duplicated though. 
For the +SELECT postfix, r, prefix FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + postfix | prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix | prefix +----------------+-----------+--------------+-------------+--------------+-------------+---------+----------------+----------- + something else | something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else | something +(1 row) + +-- Joining two subqueries a single * works as expected. +SELECT * +FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +) q1, ( + SELECT * FROM read_parquet('../../data/unsigned_types.parquet') r +) q2; + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix | utinyint | usmallint | uinteger +-----------+--------------+-------------+--------------+-------------+---------+----------------+----------+-----------+------------ + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else | 255 | 65535 | 4294967295 +(1 row) + +-- Combining multiple read_parquet calls in a single query also works as +-- expected. They are not expanded to multiple *'s. +-- BUG: This currently doesn't work correctly! 
+SELECT 'something' as prefix, *, 'something else' as postfix +FROM read_parquet('../../data/iris.parquet') r, + read_parquet('../../data/unsigned_types.parquet') r2 +LIMIT 1; + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | postfix +-----------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else +(1 row) + +-- And also when done in a subquery +-- BUG: Broken in the same way as the above query. +SELECT * FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r, + read_parquet('../../data/unsigned_types.parquet') r2 + LIMIT 1 +); + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | sepal.length_1 | sepal.width_1 | petal.length_1 | petal.width_1 | variety_1 | utinyint_1 | usmallint_1 | uinteger_1 | postfix +-----------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+----------------+---------------+----------------+---------------+-----------+------------+-------------+------------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else +(1 row) + -- read_csv SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; count diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index ba3bc7f9..11c18862 100644 --- a/test/regression/sql/read_functions.sql +++ 
b/test/regression/sql/read_functions.sql @@ -14,6 +14,82 @@ SELECT r['sepal.length'], r['file_row_number'], r['filename'] -- Supports subscripts SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable.parquet') r; +-- Subqueries correctly expand *, in case of multiple columns. +SELECT * FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + +-- NOTE: A single "r" is equivalent to a *. The prefix and postfix columns are +-- not explicitely selected, but still show up in the result. This is +-- considered a bug, but it's one that we cannot easily solve because the "r" +-- reference does not exist in the DuckDB query at all, so there's no way to +-- reference only the columns coming from that part of the subquery. Very few +-- people should be impacted by this though. +SELECT r FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + +-- ... but if you manually add the expected columns then they are merged into +-- the new star expansion. +SELECT prefix, r FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); +SELECT prefix, r, postfix FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + +-- This requires the prefix columns to be there though. If the prefix columns +-- are not there the postfix columns don't get merged into the new star +-- expansion automatically. +-- NOTE: Would be nice to fix this, but it's not a high priority. +SELECT r, postfix FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + +-- If you swap them around, they will get duplicated though. 
For the +SELECT postfix, r, prefix FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +); + +-- Joining two subqueries a single * works as expected. +SELECT * +FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r + LIMIT 1 +) q1, ( + SELECT * FROM read_parquet('../../data/unsigned_types.parquet') r +) q2; + +-- Combining multiple read_parquet calls in a single query also works as +-- expected. They are not expanded to multiple *'s. +-- BUG: This currently doesn't work correctly! +SELECT 'something' as prefix, *, 'something else' as postfix +FROM read_parquet('../../data/iris.parquet') r, + read_parquet('../../data/unsigned_types.parquet') r2 +LIMIT 1; + +-- And also when done in a subquery +-- BUG: Broken in the same way as the above query. +SELECT * FROM ( + SELECT 'something' as prefix, *, 'something else' as postfix + FROM read_parquet('../../data/iris.parquet') r, + read_parquet('../../data/unsigned_types.parquet') r2 + LIMIT 1 +); + -- read_csv SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; From c238af7d19fc0b1f553c000f55ccdc766cce0de2 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Wed, 15 Jan 2025 12:40:52 +0100 Subject: [PATCH 21/40] Add missing file --- test/regression/data/indexable.parquet | Bin 0 -> 390 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/regression/data/indexable.parquet diff --git a/test/regression/data/indexable.parquet b/test/regression/data/indexable.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d19e8cddebd749868c0df8a6f29f97ae3bddf145 GIT binary patch literal 390 zcmZ`$O-sW-5S^?`jJ4jJWg|VrK$ZxhG-5P|UX-3hqK3d5@W1-h~M0V-O%CV4?|dc*6jI89nL`ZsrD#o4Zf6J;OL4$`9=*=b8rG z&~$TWpSqwrrb1>vMj#LJRrZ#r&t3LXEE42^eUHiA*RH9vrt%a-Oj=3XChs(o0TUuF zc~y2xWTc_FMO7?{dg-|!m_4uq1q1Bc2=+z>nPB2t+$w7k*{4SCgyOe@A4 
zE8iA=Pm{hWX5wDW#**uBlIy7P7N${reC`RREb#pag`QkF>!R(-rgqlhFdUvbe!42F V+?k)pqx2#l#q$$T2Edd)@CEsNMEL*! literal 0 HcmV?d00001 From 09b7f56f0884b19cfcc4616456de32427fce5692 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Fri, 17 Jan 2025 10:18:52 +0100 Subject: [PATCH 22/40] Revert "Generate query that works for iceberg scans" This reverts commit a799f7ee0691ef9defa93332416343bb4c62283f. --- src/vendor/pg_ruleutils_17.c | 37 ++------------------- test/regression/expected/read_functions.out | 36 ++++++++++---------- 2 files changed, 19 insertions(+), 54 deletions(-) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index d3de77e5..60509067 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -7592,14 +7592,6 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) elog(ERROR, "invalid attnum %d for relation \"%s\"", attnum, rte->eref->aliasname); if (pgduckdb_var_is_duckdb_row(var)) { - if (rte->rtekind == RTE_FUNCTION) { - if (istoplevel) { - appendStringInfoChar(buf, '*'); - return NULL; - } - return NULL; - } - if (istoplevel) { /* If the duckdb row is at the top level target list of a * select, then we want to generate r.*, to unpack all the @@ -9138,29 +9130,6 @@ get_rule_expr(Node *node, deparse_context *context, */ Assert(sbsref->refupperindexpr); Assert(!sbsref->reflowerindexpr); - Var *var = (Var *) sbsref->refexpr; - - deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, - var->varlevelsup); - int varno; - - /* - * If we have a syntactic referent for the Var, and we're working from a - * parse tree, prefer to use the syntactic referent. Otherwise, fall back - * on the semantic referent. (Forcing use of the semantic referent when - * printing plan trees is a design choice that's perhaps more motivated by - * backwards compatibility than anything else. But it does have the - * advantage of making plans more explicit.) 
- */ - if (var->varnosyn > 0 && dpns->plan == NULL) - { - varno = var->varnosyn; - } - else - { - varno = var->varno; - } - RangeTblEntry* rte = rt_fetch(varno, dpns->rtable); Oid typoutput; bool typIsVarlena; Const *constval = castNode(Const, linitial(sbsref->refupperindexpr)); @@ -9168,10 +9137,8 @@ get_rule_expr(Node *node, deparse_context *context, &typoutput, &typIsVarlena); char *extval = OidOutputFunctionCall(typoutput, constval->constvalue); - if (rte->rtekind != RTE_FUNCTION) { - appendStringInfoChar(buf, '.'); - } - appendStringInfoString(context->buf, quote_identifier(extval)); + + appendStringInfo(context->buf, ".%s", quote_identifier(extval)); if (list_length(sbsref->refupperindexpr) > 1) { /* diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 1689428e..8d912d21 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -31,9 +31,9 @@ SELECT r['sepal.length'], r['file_row_number'], r['filename'] -- Supports subscripts SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable.parquet') r; - jsoncol[1] | arraycol[2] -------------+------------- - "d" | 22 + r.jsoncol[1] | r.arraycol[2] +--------------+--------------- + "d" | 22 (1 row) -- Subqueries correctly expand *, in case of multiple columns. 
@@ -131,9 +131,9 @@ SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r, read_parquet('../../data/unsigned_types.parquet') r2 LIMIT 1; - prefix | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | postfix ------------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+---------------- - something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else + prefix | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | postfix +-----------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else (1 row) -- And also when done in a subquery @@ -144,9 +144,9 @@ SELECT * FROM ( read_parquet('../../data/unsigned_types.parquet') r2 LIMIT 1 ); - prefix | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | sepal.length_1 | sepal.width_1 | petal.length_1 | petal.width_1 | variety_1 | utinyint_1 | usmallint_1 | uinteger_1 | postfix ------------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+----------------+---------------+----------------+---------------+-----------+------------+-------------+------------+---------------- - something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else + prefix | sepal.length | sepal.width | petal.length | petal.width | 
variety | utinyint | usmallint | uinteger | postfix +-----------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+---------------- + something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else (1 row) -- read_csv @@ -277,11 +277,10 @@ SELECT duckdb.install_extension('iceberg'); (1 row) SELECT COUNT(r['l_orderkey']) FROM iceberg_scan('../../data/lineitem_iceberg', allow_moved_paths => true) r; - count -------- - 51793 -(1 row) - +ERROR: (PGDuckDB/CreatePlan) Prepared query returned an error: 'Binder Error: Referenced table "r" not found! +Candidate tables: "iceberg_scan_data" +LINE 1: SELECT count(r.l_orderkey) AS count FROM system.main... + ^ -- TPCH query #6 SELECT sum(r['l_extendedprice'] * r['l_discount']) as revenue @@ -293,11 +292,10 @@ WHERE AND r['l_discount'] between 0.08 - 0.01 and 0.08 + 0.01 AND r['l_quantity'] < 25 LIMIT 1; - revenue --------------- - 1520873.7806 -(1 row) - +ERROR: (PGDuckDB/CreatePlan) Prepared query returned an error: 'Binder Error: Referenced table "r" not found! +Candidate tables: "iceberg_scan_data" +LINE 1: ...C allow_moved_paths => true) r WHERE ((r.l_shipdate >= '1997-01-01'::date) AND... 
+ ^ SELECT * FROM iceberg_snapshots('../../data/lineitem_iceberg'); sequence_number | snapshot_id | timestamp_ms | manifest_list -----------------+---------------------+------------------------------+------------------------------------------------------------------------------------------------ From 1c9dddaa606c0d5e1ec6c4f0bf588985e5866618 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Fri, 17 Jan 2025 14:31:43 +0100 Subject: [PATCH 23/40] Fix iceberg_scan in a better way --- src/vendor/pg_ruleutils_17.c | 14 ++++++++++++++ test/regression/expected/read_functions.out | 18 ++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index 60509067..f706d0c4 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -10615,6 +10615,16 @@ get_func_expr(FuncExpr *expr, deparse_context *context, nargs++; } + /* + * iceberg_scan needs to be wrapped in a subquery to resolve a bug where + * aliasses on iceberg_scan are ignored: + * https://github.com/duckdb/duckdb-iceberg/issues/44 + */ + bool is_iceberg_scan = strcmp(pgduckdb_function_name(funcoid), "system.main.iceberg_scan") == 0; + if (is_iceberg_scan) { + appendStringInfoString(buf, "(FROM "); + } + appendStringInfo(buf, "%s(", generate_function_name(funcoid, nargs, argnames, argtypes, @@ -10631,6 +10641,10 @@ get_func_expr(FuncExpr *expr, deparse_context *context, get_rule_expr((Node *) lfirst(l), context, true); } appendStringInfoChar(buf, ')'); + + if (is_iceberg_scan) { + appendStringInfoChar(buf, ')'); + } } /* diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 8d912d21..10d0807b 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -277,10 +277,11 @@ SELECT duckdb.install_extension('iceberg'); (1 row) SELECT COUNT(r['l_orderkey']) FROM iceberg_scan('../../data/lineitem_iceberg', 
allow_moved_paths => true) r; -ERROR: (PGDuckDB/CreatePlan) Prepared query returned an error: 'Binder Error: Referenced table "r" not found! -Candidate tables: "iceberg_scan_data" -LINE 1: SELECT count(r.l_orderkey) AS count FROM system.main... - ^ + count +------- + 51793 +(1 row) + -- TPCH query #6 SELECT sum(r['l_extendedprice'] * r['l_discount']) as revenue @@ -292,10 +293,11 @@ WHERE AND r['l_discount'] between 0.08 - 0.01 and 0.08 + 0.01 AND r['l_quantity'] < 25 LIMIT 1; -ERROR: (PGDuckDB/CreatePlan) Prepared query returned an error: 'Binder Error: Referenced table "r" not found! -Candidate tables: "iceberg_scan_data" -LINE 1: ...C allow_moved_paths => true) r WHERE ((r.l_shipdate >= '1997-01-01'::date) AND... - ^ + revenue +-------------- + 1520873.7806 +(1 row) + SELECT * FROM iceberg_snapshots('../../data/lineitem_iceberg'); sequence_number | snapshot_id | timestamp_ms | manifest_list -----------------+---------------------+------------------------------+------------------------------------------------------------------------------------------------ From 259e0f96d393ddda0e3ffb069a632c6134171e8f Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Fri, 17 Jan 2025 14:40:38 +0100 Subject: [PATCH 24/40] Update iceberg_scan fix --- src/vendor/pg_ruleutils_17.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index f706d0c4..23702362 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -10616,11 +10616,19 @@ get_func_expr(FuncExpr *expr, deparse_context *context, } /* - * iceberg_scan needs to be wrapped in a subquery to resolve a bug where - * aliasses on iceberg_scan are ignored: + * iceberg_scan needs to be wrapped in an additianol subquery to resolve a + * bug where aliasses on iceberg_scan are ignored: * https://github.com/duckdb/duckdb-iceberg/issues/44 + * + * By wrapping the iceberg_scan call the alias is given to the subquery, + * 
instead of th call. This subquery is easily optimized away by DuckDB, + * because it doesn't do anything. + * + * TODO: Probably check this in a bit more efficient way and move it to + * pgduckdb_ruleutils.cpp */ - bool is_iceberg_scan = strcmp(pgduckdb_function_name(funcoid), "system.main.iceberg_scan") == 0; + char *duckdb_function_name = pgduckdb_function_name(funcoid); + bool is_iceberg_scan = duckdb_function_name != NULL && strcmp(duckdb_function_name, "system.main.iceberg_scan") == 0; if (is_iceberg_scan) { appendStringInfoString(buf, "(FROM "); } From 13c7a0e60a6c0c04b87fd69176d3d098d78035eb Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Fri, 17 Jan 2025 15:05:34 +0100 Subject: [PATCH 25/40] Also wrap "query" in a subquery --- src/vendor/pg_ruleutils_17.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index 23702362..ff65c6dc 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -10624,12 +10624,19 @@ get_func_expr(FuncExpr *expr, deparse_context *context, * instead of th call. This subquery is easily optimized away by DuckDB, * because it doesn't do anything. * + * This problem is also true for the "query" function, which we use when + * creating materialized views and CTAS. 
+ * https://github.com/duckdb/duckdb/issues/15570#issuecomment-2598419474 + * * TODO: Probably check this in a bit more efficient way and move it to * pgduckdb_ruleutils.cpp */ char *duckdb_function_name = pgduckdb_function_name(funcoid); - bool is_iceberg_scan = duckdb_function_name != NULL && strcmp(duckdb_function_name, "system.main.iceberg_scan") == 0; - if (is_iceberg_scan) { + bool function_needs_subquery = duckdb_function_name != NULL && ( + strcmp(duckdb_function_name, "system.main.iceberg_scan") == 0 + || strcmp(duckdb_function_name, "system.main.query") == 0 + ); + if (function_needs_subquery) { appendStringInfoString(buf, "(FROM "); } @@ -10650,7 +10657,7 @@ get_func_expr(FuncExpr *expr, deparse_context *context, } appendStringInfoChar(buf, ')'); - if (is_iceberg_scan) { + if (function_needs_subquery) { appendStringInfoChar(buf, ')'); } } From 8700df4ac419c3ecc273ae11ddd4e297bc7578b2 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Sat, 18 Jan 2025 10:15:15 +0100 Subject: [PATCH 26/40] Update comment to reflect reality --- src/vendor/pg_ruleutils_17.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index ff65c6dc..926bcbb2 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -9123,10 +9123,9 @@ get_rule_expr(Node *node, deparse_context *context, else if (IsA(sbsref->refexpr, Var) && pgduckdb_var_is_duckdb_row((Var*) sbsref->refexpr)) { /* Subscript expressions into the duckdb.row type we want to * change to regular column references in the DuckDB query. - * Both because it's generally more common and thus - * results in better optimized queries, and because - * iceberg_scan doesn't support the subscripting syntax on - * its results. + * The main reason we do this is so that DuckDB generates + * nicer column names, i.e. 
without the square brackets: + * "mycolumn" instead of "r['mycolumn']" */ Assert(sbsref->refupperindexpr); Assert(!sbsref->reflowerindexpr); From b14a98e84310fc676e9ff6b1b0c37ec515374a9f Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 11:59:23 +0100 Subject: [PATCH 27/40] Simplify and comment --- include/pgduckdb/pgduckdb_ruleutils.h | 2 ++ src/pgduckdb_ruleutils.cpp | 47 +++++++++++++++++++++++++++ src/vendor/pg_ruleutils_17.c | 35 ++++++-------------- 3 files changed, 59 insertions(+), 25 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index 28233b04..724eb0e6 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -14,5 +14,7 @@ bool pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc); bool pgduckdb_target_list_contains_unresolved_type_or_row(List *target_list); Var *pgduckdb_duckdb_row_subscript_var(Expr *expr); List *pgduckdb_star_start_vars(List *target_list); +bool pgduckdb_function_needs_subquery(Oid function_oid); +int pgduckdb_show_type(Const *constval, int original_showtype); extern bool processed_targetlist; diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index 433213b0..d7dfaed3 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -184,6 +184,53 @@ pgduckdb_star_start_vars(List *target_list) { return star_start_indexes; } +/* + * iceberg_scan needs to be wrapped in an additianol subquery to resolve a + * bug where aliasses on iceberg_scan are ignored: + * https://github.com/duckdb/duckdb-iceberg/issues/44 + * + * By wrapping the iceberg_scan call the alias is given to the subquery, + * instead of th call. This subquery is easily optimized away by DuckDB, + * because it doesn't do anything. + * + * This problem is also true for the "query" function, which we use when + * creating materialized views and CTAS. 
+ * https://github.com/duckdb/duckdb/issues/15570#issuecomment-2598419474 + * + * TODO: Probably check this in a bit more efficient way and move it to + * pgduckdb_ruleutils.cpp + */ +bool +pgduckdb_function_needs_subquery(Oid function_oid) { + if (!pgduckdb::IsDuckdbOnlyFunction(function_oid)) { + return false; + } + + auto func_name = get_func_name(function_oid); + if (strcmp(func_name, "iceberg_scan") == 0) { + return true; + } + + if (strcmp(func_name, "query") == 0) { + return true; + } + return false; +} + +/* + * We never want to show the unresolved_type in DuckDB query. The + * unrosolved_type does not actually exist in DuckDB, we only use it to keep + * the Postgres parser happy. DuckDB can simply figure out the correct type + * itself without an explicit cast. + */ +int +pgduckdb_show_type(Const *constval, int original_showtype) { + if (pgduckdb_is_unresolved_type(constval->consttype)) { + return -1; + } + return original_showtype; +} + /* * Given a postgres schema name, this returns a list of two elements: the first * is the DuckDB database name and the second is the duckdb schema name. These diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index 926bcbb2..7b519221 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -10614,27 +10614,7 @@ get_func_expr(FuncExpr *expr, deparse_context *context, nargs++; } - /* - * iceberg_scan needs to be wrapped in an additianol subquery to resolve a - * bug where aliasses on iceberg_scan are ignored: - * https://github.com/duckdb/duckdb-iceberg/issues/44 - * - * By wrapping the iceberg_scan call the alias is given to the subquery, - * instead of th call. This subquery is easily optimized away by DuckDB, - * because it doesn't do anything. - * - * This problem is also true for the "query" function, which we use when - * creating materialized views and CTAS. 
- * https://github.com/duckdb/duckdb/issues/15570#issuecomment-2598419474 - * - * TODO: Probably check this in a bit more efficient way and move it to - * pgduckdb_ruleutils.cpp - */ - char *duckdb_function_name = pgduckdb_function_name(funcoid); - bool function_needs_subquery = duckdb_function_name != NULL && ( - strcmp(duckdb_function_name, "system.main.iceberg_scan") == 0 - || strcmp(duckdb_function_name, "system.main.query") == 0 - ); + bool function_needs_subquery = pgduckdb_function_needs_subquery(funcoid); if (function_needs_subquery) { appendStringInfoString(buf, "(FROM "); } @@ -11247,9 +11227,7 @@ get_const_expr(Const *constval, deparse_context *context, int showtype) char *extval; bool needlabel = false; - if (pgduckdb_is_unresolved_type(constval->consttype)) { - showtype = -1; - } + showtype = pgduckdb_show_type(constval, showtype); if (constval->constisnull) { @@ -12291,7 +12269,14 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) get_rte_alias(rte, varno, false, context); if (pgduckdb_func_returns_duckdb_row(rtfunc1)) { - /* Don't print column aliases for functions that return a duckdb.row */ + /* + * We never want to print column aliases for functions that return + * a duckdb.row. The common pattern is for people to not provide an + * explicit column alias (i.e. "r" becomes "r(r)"). This obviously + * is a naming collision and DuckDB resolves that in the opposite + * way that we want. Never adding column aliases for duckdb.row + * avoids this conflict. 
+ */ } /* Print the column definitions or aliases, if needed */ else if (rtfunc1 && rtfunc1->funccolnames != NIL) From fb4813359b36dc851ad9b48906180dd55e882c1a Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 14:57:28 +0100 Subject: [PATCH 28/40] Simplify star reconstruction logic and move it almost fully to pgduckdb_ruleutils.cpp --- include/pgduckdb/pgduckdb_ruleutils.h | 10 +- src/pgduckdb_ruleutils.cpp | 167 +++++++++++++++++++------- src/vendor/pg_ruleutils_17.c | 40 +----- 3 files changed, 136 insertions(+), 81 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index 724eb0e6..b56f34a8 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -1,4 +1,12 @@ #include "postgres.h" +#include "pgduckdb/vendor/pg_list.hpp" + +typedef struct StarReconstructionContext { + List *target_list; + int varno_star; + int varattno_star; + bool added_current_star; +} StarReconstructionContext; char *pgduckdb_relation_name(Oid relid); char *pgduckdb_function_name(Oid function_oid); @@ -13,7 +21,7 @@ bool pgduckdb_var_is_duckdb_row(Var *var); bool pgduckdb_func_returns_duckdb_row(RangeTblFunction *rtfunc); bool pgduckdb_target_list_contains_unresolved_type_or_row(List *target_list); Var *pgduckdb_duckdb_row_subscript_var(Expr *expr); -List *pgduckdb_star_start_vars(List *target_list); +bool pgduckdb_reconstruct_star_step(StarReconstructionContext *ctx, ListCell *tle_cell); bool pgduckdb_function_needs_subquery(Oid function_oid); int pgduckdb_show_type(Const *constval, int original_showtype); diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index d7dfaed3..d8dccabd 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -20,6 +20,7 @@ extern "C" { #include "storage/lockdefs.h" #include "pgduckdb/vendor/pg_ruleutils.h" +#include "pgduckdb/pgduckdb_ruleutils.h" } #include "pgduckdb/pgduckdb.h" @@ -116,6 +117,75 @@ 
pgduckdb_duckdb_row_subscript_var(Expr *expr) { return refexpr; } +/* + * pgduckdb_check_for_star_start tries to figure out if this is tle_cell + * contains a Var that is the start of a run of Vars that should be + * reconstructed as a star. If that's the case it sets the varno_star and + * varattno_star of the ctx. + */ +static void +pgduckdb_check_for_star_start(StarReconstructionContext *ctx, ListCell *tle_cell) { + TargetEntry *first_tle = (TargetEntry *)lfirst(tle_cell); + + if (!IsA(first_tle->expr, Var)) { + /* Not a Var so we're not at the start of a run of Vars. */ + return; + } + + Var *first_var = (Var *)first_tle->expr; + + if (first_var->varattno != 1) { + /* If we don't have varattno 1, then we are not at a run of Vars */ + return; + } + + /* + * We found a Var that could potentially be the first of a run of Vars for + * which we have to reconstruct the star. To check if this is indeed the + * case we see if we can find a duckdb.row in this list of Vars. + */ + int varno = first_var->varno; + int varattno = first_var->varattno; + + do { + TargetEntry *tle = (TargetEntry *)lfirst(tle_cell); + + if (!IsA(tle->expr, Var)) { + /* + * We found the end of this run of Vars, by finding something else + * than a Var. + */ + return; + } + + Var *var = (Var *)tle->expr; + + if (var->varno != varno) { + /* A Var from a different RTE */ + return; + } + + if (var->varattno != varattno) { + /* Not a consecutive Var */ + return; + } + if (pgduckdb_var_is_duckdb_row(var)) { + /* + * If we have a duckdb.row, then we found a run of Vars that we + * have to reconstruct the star for. + */ + + ctx->varno_star = varno; + ctx->varattno_star = first_var->varattno; + ctx->added_current_star = false; + return; + } + + /* Look for the next Var in the run */ + varattno++; + } while ((tle_cell = lnext(ctx->target_list, tle_cell))); +} + /* * In our DuckDB queries we sometimes want to use "SELECT *", when selecting * from a function like read_parquet. 
That way DuckDB can figure out the actual @@ -128,60 +198,69 @@ pgduckdb_duckdb_row_subscript_var(Expr *expr) { * the same FROM entry, aka RangeTableEntry (RTE) that we expect were created * with a *. * - * This function tries to find the indexes of the first column for each of - * those runs. It does so using this heuristic: - * - * 1. Find a column with varattno == 1 (aka the first column from an RTE) - * 2. Find a consecutive run of columns from the same RTE with varattnos that - * keep increasing by 1. - * 3. Once we find a duckdb.row column in any of those consecutive columns, we - * assume this run was created using a star expression and we track the - * initial index. Then we start at 1 again to find the next run. - * - * NOTE: This function does not find the end of such runs, that's left as an - * excersice for the caller. It should be pretty easy for the caller to do - * that, because they need to remove such columns anyway. The main reason this - * function existis is so that the caller doesn't have to scan backwards to - * find the start of a run once it finds a duckdb.row column. Scanning - * backwards is difficult for the caller because it wants to write out columns - * to the DuckDB query as it consumes them. + * This function returns true if we should skip writing this tle_cell to the + * DuckDB query because it is part of a run of Vars that will be reconstructed + * as a star. */ -List * -pgduckdb_star_start_vars(List *target_list) { - List *star_start_indexes = NIL; - Var *possible_star_start_var = NULL; - int possible_star_start_var_index = 0; +bool +pgduckdb_reconstruct_star_step(StarReconstructionContext *ctx, ListCell *tle_cell) { + /* Detect start of a Var run that should be reconstructed to a star */ + pgduckdb_check_for_star_start(ctx, tle_cell); - int i = 0; + /* + * If we're not currently reconstructing a star we don't need to do + * anything. 
+ */ + if (!ctx->varno_star) { + return false; + } - foreach_node(TargetEntry, tle, target_list) { - i++; + TargetEntry *tle = (TargetEntry *)lfirst(tle_cell); - if (!IsA(tle->expr, Var)) { - possible_star_start_var = NULL; - possible_star_start_var_index = 0; - continue; - } + /* + * Find out if this target entry is the next element in the run of Vars for + * the star we're currently reconstructing. + */ + if (tle->expr && IsA(tle->expr, Var)) { + Var *var = castNode(Var, tle->expr); - Var *var = (Var *)tle->expr; + if (var->varno == ctx->varno_star && var->varattno == ctx->varattno_star) { + /* + * We're still in the run of Vars, increment the varattno to look + * for the next Var on the next call. + */ + ctx->varattno_star++; - if (var->varattno == 1) { - possible_star_start_var = var; - possible_star_start_var_index = i; - } else if (possible_star_start_var) { - if (var->varno != possible_star_start_var->varno || var->varno == i - possible_star_start_var_index + 1) { - possible_star_start_var = NULL; - possible_star_start_var_index = 0; + /* If we already added star we skip writing this target entry */ + if (ctx->added_current_star) { + return true; } - } - if (pgduckdb_var_is_duckdb_row(var)) { - star_start_indexes = lappend_int(star_start_indexes, possible_star_start_var_index); - possible_star_start_var = NULL; - possible_star_start_var_index = 0; + /* + * If it's not a duckdb row we skip this target entry too. The way + * we add a single star is by expanding the first duckdb.row torget + * entry, which we've defined to expand to a star. So we need to + * skip any non duckdb.row Vars that precede the first duckdb.row. + */ + if (!pgduckdb_var_is_duckdb_row(var)) { + return true; + } + + ctx->added_current_star = true; + return false; } } - return star_start_indexes; + + /* + * If it was not, that means we've successfully expanded this star and we + * should start looking for the next star start. So reset all the state + * used for this star reconstruction. 
+ */ + ctx->varno_star = 0; + ctx->varattno_star = 0; + ctx->added_current_star = false; + + return false; } /* diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index 7b519221..45b105a8 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -6041,55 +6041,23 @@ get_target_list(List *targetList, deparse_context *context, sep = " "; colno = 0; - List *star_starts = pgduckdb_star_start_vars(targetList); - ListCell *current_star_start = list_head(star_starts); - int varno_star = 0; - int varattno_star = 0; - bool added_star = false; + StarReconstructionContext star_reconstruction_context = {0}; + star_reconstruction_context.target_list = targetList; bool outermost_targetlist = !processed_targetlist; processed_targetlist = true; - int i = 0; foreach(l, targetList) { TargetEntry *tle = (TargetEntry *) lfirst(l); char *colname; char *attname; - i++; - - if (current_star_start && lfirst_int(current_star_start) == i) { - Var *var = castNode(Var, tle->expr); - varno_star = var->varno; - varattno_star = var->varattno; - } - - if (varno_star) { - bool reset = true; - if (tle->expr && IsA(tle->expr, Var)) { - Var *var = castNode(Var, tle->expr); - - if (var->varno == varno_star && var->varattno == varattno_star) { - varattno_star++; - reset = false; - if (added_star || !pgduckdb_var_is_duckdb_row(var)) { - continue; - } - added_star = true; - } - } - if (reset) { - varno_star = 0; - varattno_star = 0; - current_star_start = lnext(star_starts, current_star_start); - added_star = false; - } + if (pgduckdb_reconstruct_star_step(&star_reconstruction_context, l)) { + continue; } - - if (tle->resjunk) continue; /* ignore junk entries */ From f4f5d17a1ea2025cbe9508fc8b219650bde8106f Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 15:10:17 +0100 Subject: [PATCH 29/40] Move some more logic to pgduckdb_ruleutils.cpp --- include/pgduckdb/pgduckdb_ruleutils.h | 1 + src/pgduckdb_ruleutils.cpp | 28 +++++++++++++++ 
src/vendor/pg_ruleutils_17.c | 49 +++++++++------------------ 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index b56f34a8..ded54cbc 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -24,5 +24,6 @@ Var *pgduckdb_duckdb_row_subscript_var(Expr *expr); bool pgduckdb_reconstruct_star_step(StarReconstructionContext *ctx, ListCell *tle_cell); bool pgduckdb_function_needs_subquery(Oid function_oid); int pgduckdb_show_type(Const *constval, int original_showtype); +bool pgduckdb_subscript_has_custom_alias(Plan *plan, List *rtable, Var *subscript_var, char *colname); extern bool processed_targetlist; diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index d8dccabd..c975730a 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -10,6 +10,7 @@ extern "C" { #include "commands/dbcommands.h" #include "nodes/nodeFuncs.h" #include "lib/stringinfo.h" +#include "parser/parsetree.h" #include "utils/builtins.h" #include "utils/guc.h" #include "utils/lsyscache.h" @@ -310,6 +311,33 @@ pgduckdb_show_type(Const *constval, int original_showtype) { return original_showtype; } +bool +pgduckdb_subscript_has_custom_alias(Plan *plan, List *rtable, Var *subscript_var, char *colname) { + /* The first bit of this logic is taken from get_variable() */ + int varno; + int varattno; + + /* + * If we have a syntactic referent for the Var, and we're working from a + * parse tree, prefer to use the syntactic referent. Otherwise, fall back + * on the semantic referent. (See comments in get_variable().) 
+ */ + if (subscript_var->varnosyn > 0 && plan == NULL) { + varno = subscript_var->varnosyn; + varattno = subscript_var->varattnosyn; + } else { + varno = subscript_var->varno; + varattno = subscript_var->varattno; + } + + RangeTblEntry *rte = rt_fetch(varno, rtable); + + /* Custom code starts here */ + char *original_column = strVal(list_nth(rte->eref->colnames, varattno - 1)); + + return strcmp(original_column, colname) != 0; +} + /* * Given a postgres schema name, this returns a list of two elements: the first * is the DuckDB database name and the second is the duckdb schema name. These diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index 45b105a8..b1326b81 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -6115,46 +6115,29 @@ get_target_list(List *targetList, deparse_context *context, * This makes sure we don't add Postgres its bad default alias to the * duckdb.row type. */ - bool duckdb_row_needs_as = !pgduckdb_var_is_duckdb_row(var); + bool duckdb_skip_as = pgduckdb_var_is_duckdb_row(var); /* - * For r['abc'] we don't want the column name to be r, but instead we - * want it to be "abc". We can only to do this for the target list of - * the outside most query though to make sure references to the column - * name are still valid. + * For r['abc'] expressions we don't want the column name to be r, but + * instead we want it to be "abc". We can only to do this for the + * target list of the outside most query though to make sure references + * to the column name are still valid. */ - Var *subscript_var = pgduckdb_duckdb_row_subscript_var(tle->expr); - if (outermost_targetlist && subscript_var) { - deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, - subscript_var->varlevelsup); - int varno; - int varattno; - - /* - * If we have a syntactic referent for the Var, and we're working from a - * parse tree, prefer to use the syntactic referent. 
Otherwise, fall back - * on the semantic referent. (Forcing use of the semantic referent when - * printing plan trees is a design choice that's perhaps more motivated by - * backwards compatibility than anything else. But it does have the - * advantage of making plans more explicit.) - */ - if (subscript_var->varnosyn > 0 && dpns->plan == NULL) - { - varno = subscript_var->varnosyn; - varattno = subscript_var->varattnosyn; - } - else - { - varno = subscript_var->varno; - varattno = subscript_var->varattno; + if (!duckdb_skip_as && outermost_targetlist) { + Var *subscript_var = pgduckdb_duckdb_row_subscript_var(tle->expr); + if (subscript_var) { + /* + * This cannot be moved to pgduckdb_ruleutils, because of + * the reliance on the non-public deparse_namespace type. + */ + deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, + subscript_var->varlevelsup); + duckdb_skip_as = !pgduckdb_subscript_has_custom_alias(dpns->plan, dpns->rtable, subscript_var, colname); } - RangeTblEntry* rte = rt_fetch(varno, dpns->rtable); - char *original_column = strVal(list_nth(rte->eref->colnames, varattno - 1)); - duckdb_row_needs_as = strcmp(original_column, colname) != 0; } /* Show AS unless the column's name is correct as-is */ - if (colname && duckdb_row_needs_as) + if (colname && !duckdb_skip_as) { if (attname == NULL || strcmp(attname, colname) != 0) appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); From 2410c1020ba9c78243713ca45614338f138a78fb Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 15:49:49 +0100 Subject: [PATCH 30/40] Move subscript to columnref conversion to pgduckdb_ruleutils.cpp --- include/pgduckdb/pgduckdb_ruleutils.h | 1 + src/pgduckdb_ruleutils.cpp | 40 +++++++++++++++++++++++++++ src/vendor/pg_ruleutils_17.c | 34 ++--------------------- 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index 
ded54cbc..c226bae9 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -25,5 +25,6 @@ bool pgduckdb_reconstruct_star_step(StarReconstructionContext *ctx, ListCell *tl bool pgduckdb_function_needs_subquery(Oid function_oid); int pgduckdb_show_type(Const *constval, int original_showtype); bool pgduckdb_subscript_has_custom_alias(Plan *plan, List *rtable, Var *subscript_var, char *colname); +SubscriptingRef *pgduckdb_strip_first_subscript(SubscriptingRef *sbsref, StringInfo buf); extern bool processed_targetlist; diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index c975730a..ea20cf7c 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -22,6 +22,7 @@ extern "C" { #include "pgduckdb/vendor/pg_ruleutils.h" #include "pgduckdb/pgduckdb_ruleutils.h" +#include "pgduckdb/vendor/pg_list.hpp" } #include "pgduckdb/pgduckdb.h" @@ -338,6 +339,45 @@ pgduckdb_subscript_has_custom_alias(Plan *plan, List *rtable, Var *subscript_var return strcmp(original_column, colname) != 0; } +/* + * Subscript expressions that index into the duckdb.row type need to be changed + * to regular column references in the DuckDB query. The main reason we do this + * is so that DuckDB generates nicer column names, i.e. 
without the square + * brackets: "mycolumn" instead of "r['mycolumn']" + */ +SubscriptingRef * +pgduckdb_strip_first_subscript(SubscriptingRef *sbsref, StringInfo buf) { + if (!IsA(sbsref->refexpr, Var)) { + return sbsref; + } + + if (!pgduckdb_var_is_duckdb_row((Var *)sbsref->refexpr)) { + return sbsref; + } + + Assert(sbsref->refupperindexpr); + Assert(!sbsref->reflowerindexpr); + Oid typoutput; + bool typIsVarlena; + Const *constval = castNode(Const, linitial(sbsref->refupperindexpr)); + getTypeOutputInfo(constval->consttype, &typoutput, &typIsVarlena); + + char *extval = OidOutputFunctionCall(typoutput, constval->constvalue); + + appendStringInfo(buf, ".%s", quote_identifier(extval)); + + /* + * If there are any additional subscript expressions we should output them. + * Subscripts can be used in duckdb to index into arrays or json objects. + * It's fine if this results in an empty List, because printSubscripts + * handles that case correctly. + */ + SubscriptingRef *shorter_sbsref = (SubscriptingRef *)copyObjectImpl(sbsref); + /* strip the first subscript from the list */ + shorter_sbsref->refupperindexpr = list_delete_first(shorter_sbsref->refupperindexpr); + return shorter_sbsref; +} + /* * Given a postgres schema name, this returns a list of two elements: the first * is the DuckDB database name and the second is the duckdb schema name. These diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index b1326b81..b3f9684e 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -9071,41 +9071,11 @@ get_rule_expr(Node *node, deparse_context *context, appendStringInfoString(buf, " := "); get_rule_expr(refassgnexpr, context, showimplicit); } - else if (IsA(sbsref->refexpr, Var) && pgduckdb_var_is_duckdb_row((Var*) sbsref->refexpr)) { - /* Subscript expressions into the duckdb.row type we want to - * change to regular column references in the DuckDB query. 
- * The main reason we do this is so that DuckDB generates - * nicer column names, i.e. without the square brackets: - * "mycolumn" instead of "r['mycolumn']" - */ - Assert(sbsref->refupperindexpr); - Assert(!sbsref->reflowerindexpr); - Oid typoutput; - bool typIsVarlena; - Const *constval = castNode(Const, linitial(sbsref->refupperindexpr)); - getTypeOutputInfo(constval->consttype, - &typoutput, &typIsVarlena); - - char *extval = OidOutputFunctionCall(typoutput, constval->constvalue); - - appendStringInfo(context->buf, ".%s", quote_identifier(extval)); - - if (list_length(sbsref->refupperindexpr) > 1) { - /* - * If there are any additional subscript expressions we - * should output them. Subscripts can be used in duckdb - * to index into arrays or json objects. - */ - SubscriptingRef *shorter_sbsref = copyObject(sbsref); - /* strip the first subscript from the list */ - shorter_sbsref->refupperindexpr = list_delete_first(shorter_sbsref->refupperindexpr); - printSubscripts(shorter_sbsref, context); - } - } else { + SubscriptingRef *new_sbsref = pgduckdb_strip_first_subscript(sbsref, context->buf); /* Just an ordinary container fetch, so print subscripts */ - printSubscripts(sbsref, context); + printSubscripts(new_sbsref, context); } } break; From 855ce1859eb911a6165d5868e62ac65bb74f0339 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 16:59:42 +0100 Subject: [PATCH 31/40] Move duckdb.row refname generation to pgduckdb_ruleutils.cpp --- include/pgduckdb/pgduckdb_ruleutils.h | 1 + src/pgduckdb_ruleutils.cpp | 29 ++++++++++++++++++ src/vendor/pg_ruleutils_17.c | 43 ++++++++++----------------- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/include/pgduckdb/pgduckdb_ruleutils.h b/include/pgduckdb/pgduckdb_ruleutils.h index c226bae9..4ca2bd38 100644 --- a/include/pgduckdb/pgduckdb_ruleutils.h +++ b/include/pgduckdb/pgduckdb_ruleutils.h @@ -26,5 +26,6 @@ bool pgduckdb_function_needs_subquery(Oid function_oid); int 
pgduckdb_show_type(Const *constval, int original_showtype); bool pgduckdb_subscript_has_custom_alias(Plan *plan, List *rtable, Var *subscript_var, char *colname); SubscriptingRef *pgduckdb_strip_first_subscript(SubscriptingRef *sbsref, StringInfo buf); +char *pgduckdb_write_row_refname(StringInfo buf, char *refname, bool is_top_level); extern bool processed_targetlist; diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index ea20cf7c..249f2681 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -378,6 +378,35 @@ pgduckdb_strip_first_subscript(SubscriptingRef *sbsref, StringInfo buf) { return shorter_sbsref; } +/* + * Writes the refname to the buf in a way that results in the correct output + * for the duckdb.row type. + * + * Returns the "attname" that should be passed back to the caller of + * get_variable(). + */ +char * +pgduckdb_write_row_refname(StringInfo buf, char *refname, bool is_top_level) { + appendStringInfoString(buf, quote_identifier(refname)); + + if (is_top_level) { + /* + * If the duckdb.row is at the top level target list of a select, then + * we want to generate r.*, to unpack all the columns instead of + * returning a STRUCT from the query. + * + * Since we use .* there is no attname. + */ + appendStringInfoString(buf, ".*"); + return NULL; + } + + /* + * In any other case, we want to simply use the alias of the TargetEntry. + */ + return refname; +} + /* * Given a postgres schema name, this returns a list of two elements: the first * is the DuckDB database name and the second is the duckdb schema name. 
These diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index b3f9684e..26ec23ca 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -7542,35 +7542,22 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) if (attnum > colinfo->num_cols) elog(ERROR, "invalid attnum %d for relation \"%s\"", attnum, rte->eref->aliasname); - if (pgduckdb_var_is_duckdb_row(var)) { - if (istoplevel) { - /* If the duckdb row is at the top level target list of a - * select, then we want to generate r.*, to unpack all the - * columns instead of returning a STRUCT from the query. */ - attname = NULL; - } else { - /* In any other case, we want to simply use the alias of the - * TargetEntry, without the column name that postgres gave to - * it. Because even though according to the Postgres parser we - * are referencing a column of type "duckdb.row", that column - * won't actually exist in the DuckDB query. - */ - attname = refname; - refname = NULL; - } - } else { - attname = colinfo->colnames[attnum - 1]; - /* - * If we find a Var referencing a dropped column, it seems better to - * print something (anything) than to fail. In general this should - * not happen, but it used to be possible for some cases involving - * functions returning named composite types, and perhaps there are - * still bugs out there. - */ - if (attname == NULL) - attname = "?dropped?column?"; + if (pgduckdb_var_is_duckdb_row(var)) { + return pgduckdb_write_row_refname(context->buf, refname, istoplevel); } + + attname = colinfo->colnames[attnum - 1]; + + /* + * If we find a Var referencing a dropped column, it seems better to + * print something (anything) than to fail. In general this should + * not happen, but it used to be possible for some cases involving + * functions returning named composite types, and perhaps there are + * still bugs out there. 
+ */ + if (attname == NULL) + attname = "?dropped?column?"; } else { @@ -7588,7 +7575,7 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) else { appendStringInfoChar(buf, '*'); - if (istoplevel && !pgduckdb_var_is_duckdb_row(var)) + if (istoplevel) appendStringInfo(buf, "::%s", format_type_with_typemod(var->vartype, var->vartypmod)); From 2c3fae59f5541359f28504ef9156024c3a142cd1 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:03:16 +0100 Subject: [PATCH 32/40] Improve subscript support --- sql/pg_duckdb--0.2.0--0.3.0.sql | 4 +- src/pgduckdb_options.cpp | 96 +++++++++++++++++++-- src/pgduckdb_ruleutils.cpp | 4 +- test/regression/expected/read_functions.out | 30 ++++++- test/regression/sql/read_functions.sql | 16 +++- 5 files changed, 137 insertions(+), 13 deletions(-) diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql index ed91cccc..722a8cc0 100644 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql @@ -30,10 +30,12 @@ CREATE TYPE duckdb.row ( CREATE FUNCTION duckdb.unresolved_type_in(cstring) RETURNS duckdb.unresolved_type AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_in' LANGUAGE C IMMUTABLE STRICT; CREATE FUNCTION duckdb.unresolved_type_out(duckdb.unresolved_type) RETURNS cstring AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_out' LANGUAGE C IMMUTABLE STRICT; +CREATE FUNCTION duckdb.unresolved_type_subscript(internal) RETURNS internal AS 'MODULE_PATHNAME', 'duckdb_unresolved_type_subscript' LANGUAGE C IMMUTABLE STRICT; CREATE TYPE duckdb.unresolved_type ( INTERNALLENGTH = VARIABLE, INPUT = duckdb.unresolved_type_in, - OUTPUT = duckdb.unresolved_type_out + OUTPUT = duckdb.unresolved_type_out, + SUBSCRIPT = duckdb.unresolved_type_subscript ); -- Dummy functions for binary operators with unresolved type on the lefthand diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index 3a6b2e23..93d92762 100644 --- a/src/pgduckdb_options.cpp +++ 
b/src/pgduckdb_options.cpp @@ -395,10 +395,6 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P elog(ERROR, "Assignment to duckdb.row is not supported"); } - if (isSlice) { - elog(ERROR, "Creating a slice out of duckdb.row is not supported"); - } - if (indirection == NIL) { elog(ERROR, "Subscripting duckdb.row with an empty subscript is not supported"); } @@ -407,17 +403,21 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P // Transform each subscript expression foreach_node(A_Indices, subscript, indirection) { - Assert(!subscript->is_slice); Assert(subscript->uidx); Node *subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind); - int expr_location = exprLocation(subscript->uidx); - Oid subscript_expr_type = exprType(subscript_expr); /* The first subscript needs to be a TEXT constant, since it should be * a column reference. But the subscripts after that can be anything, * DuckDB should interpret those. 
 		 */
 		if (first) {
+			int expr_location = exprLocation(subscript->uidx);
+			Oid subscript_expr_type = exprType(subscript_expr);
+
+			if (subscript->lidx) {
+				elog(ERROR, "Creating a slice out of duckdb.row is not supported");
+			}
+
 			Node *coerced_expr =
 			    coerce_to_target_type(pstate, subscript_expr, subscript_expr_type, TEXTOID, -1, COERCION_IMPLICIT,
 			                          COERCE_IMPLICIT_CAST, expr_location);
 			if (!coerced_expr) {
@@ -440,6 +440,13 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P
 			first = false;
 		}
 		sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, subscript_expr);
+		if (isSlice) {
+			Node *lower_subscript_expr = NULL;
+			if (subscript->uidx) {
+				lower_subscript_expr = transformExpr(pstate, subscript->lidx, pstate->p_expr_kind);
+			}
+			sbsref->reflowerindexpr = lappend(sbsref->reflowerindexpr, lower_subscript_expr);
+		}
 	}
 
 	// Set the result type of the subscripting operation
@@ -459,7 +466,7 @@ DuckdbRowSubscriptExecSetup(const SubscriptingRef * /*sbsref*/, SubscriptingRefS
 	elog(ERROR, "Subscripting duckdb.row is not supported in the Postgres Executor");
 }
 
-static SubscriptRoutines duckdb_subscript_routines = {
+static SubscriptRoutines duckdb_row_subscript_routines = {
 	.transform = DuckdbRowSubscriptTransform,
 	.exec_setup = DuckdbRowSubscriptExecSetup,
 	.fetch_strict = false,
@@ -468,7 +475,78 @@ static SubscriptRoutines duckdb_subscript_routines = {
 };
 
 DECLARE_PG_FUNCTION(duckdb_row_subscript) {
-	PG_RETURN_POINTER(&duckdb_subscript_routines);
+	PG_RETURN_POINTER(&duckdb_row_subscript_routines);
+}
+
+/*
+ * DuckdbUnresolvedTypeSubscriptTransform is called by the parser when a
+ * subscripting operation is performed on a duckdb.unresolved_type. All this
+ * does is ensure that any subscript on duckdb.unresolved_type returns an
+ * unresolved type again.
+ */
+void
+DuckdbUnresolvedTypeSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct ParseState *pstate,
+                                       bool isSlice, bool isAssignment) {
+	/*
+	 * We need to populate our cache for some of the code below. Normally this
+	 * cache is populated at the start of our planner hook, but this function
+	 * is being called from the parser.
+	 */
+	if (!pgduckdb::IsExtensionRegistered()) {
+		elog(ERROR, "BUG: Using duckdb.unresolved_type but the pg_duckdb extension is not installed");
+	}
+
+	if (isAssignment) {
+		elog(ERROR, "Assignment to duckdb.unresolved_type is not supported");
+	}
+
+	if (indirection == NIL) {
+		elog(ERROR, "Subscripting duckdb.row with an empty subscript is not supported");
+	}
+
+	// Transform each subscript expression
+	foreach_node(A_Indices, subscript, indirection) {
+		Assert(subscript->uidx);
+
+		Node *subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind);
+
+		sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, subscript_expr);
+		if (isSlice) {
+			Node *lower_subscript_expr = NULL;
+			if (subscript->uidx) {
+				lower_subscript_expr = transformExpr(pstate, subscript->lidx, pstate->p_expr_kind);
+			}
+			sbsref->reflowerindexpr = lappend(sbsref->reflowerindexpr, lower_subscript_expr);
+		}
+	}
+
+	// Set the result type of the subscripting operation
+	sbsref->refrestype = pgduckdb::DuckdbUnresolvedTypeOid();
+	sbsref->reftypmod = -1;
+}
+
+/*
+ * DuckdbUnresolvedTypeSubscriptExecSetup is called by the executor when a
+ * subscripting operation is performed on a duckdb.unresolved_type. This should
+ * never happen, because any query that contains a duckdb.unresolved_type should
+ * automatically use DuckDB execution.
+ */ +void +DuckdbUnresolvedTypeSubscriptExecSetup(const SubscriptingRef * /*sbsref*/, SubscriptingRefState * /*sbsrefstate*/, + SubscriptExecSteps * /*exprstate*/) { + elog(ERROR, "Subscripting duckdb.unresolved_type is not supported in the Postgres Executor"); +} + +static SubscriptRoutines duckdb_unresolved_type_subscript_routines = { + .transform = DuckdbUnresolvedTypeSubscriptTransform, + .exec_setup = DuckdbUnresolvedTypeSubscriptExecSetup, + .fetch_strict = false, + .fetch_leakproof = true, + .store_leakproof = true, +}; + +DECLARE_PG_FUNCTION(duckdb_unresolved_type_subscript) { + PG_RETURN_POINTER(&duckdb_unresolved_type_subscript_routines); } DECLARE_PG_FUNCTION(duckdb_row_in) { diff --git a/src/pgduckdb_ruleutils.cpp b/src/pgduckdb_ruleutils.cpp index 249f2681..cb51c568 100644 --- a/src/pgduckdb_ruleutils.cpp +++ b/src/pgduckdb_ruleutils.cpp @@ -356,7 +356,6 @@ pgduckdb_strip_first_subscript(SubscriptingRef *sbsref, StringInfo buf) { } Assert(sbsref->refupperindexpr); - Assert(!sbsref->reflowerindexpr); Oid typoutput; bool typIsVarlena; Const *constval = castNode(Const, linitial(sbsref->refupperindexpr)); @@ -375,6 +374,9 @@ pgduckdb_strip_first_subscript(SubscriptingRef *sbsref, StringInfo buf) { SubscriptingRef *shorter_sbsref = (SubscriptingRef *)copyObjectImpl(sbsref); /* strip the first subscript from the list */ shorter_sbsref->refupperindexpr = list_delete_first(shorter_sbsref->refupperindexpr); + if (shorter_sbsref->reflowerindexpr) { + shorter_sbsref->reflowerindexpr = list_delete_first(shorter_sbsref->reflowerindexpr); + } return shorter_sbsref; } diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 10d0807b..f75dbb1b 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -29,13 +29,39 @@ SELECT r['sepal.length'], r['file_row_number'], r['filename'] 4.5 | 41 | ../../data/iris.parquet (5 rows) --- Supports subscripts +-- Further 
subscripting is supported on duckdb.row SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable.parquet') r; r.jsoncol[1] | r.arraycol[2] --------------+--------------- "d" | 22 (1 row) +-- And not just on duckdb.row, but also on duckdb.unresolved_type (the parenth +SELECT jsoncol[1], arraycol[2] FROM ( + SELECT r['jsoncol'] jsoncol, r['arraycol'] arraycol + FROM read_parquet('../../data/indexable.parquet') r +); + jsoncol | arraycol +---------+---------- + "d" | 22 +(1 row) + +-- And the same for slice subscripts +SELECT r['arraycol'][1:2] FROM read_parquet('../../data/indexable.parquet') r; + r.arraycol[1:2] +----------------- + {11,22} +(1 row) + +SELECT arraycol[1:2] FROM ( + SELECT r['arraycol'] arraycol + FROM read_parquet('../../data/indexable.parquet') r +); + arraycol +---------- + {11,22} +(1 row) + -- Subqueries correctly expand *, in case of multiple columns. SELECT * FROM ( SELECT 'something' as prefix, *, 'something else' as postfix @@ -241,6 +267,8 @@ SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r limit 1; ERROR: duckdb.row subscript must have text type LINE 1: SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r li... 
^ +SELECT r['abc':'abc'] FROM read_csv('../../data/web_page.csv') r limit 1; +ERROR: Creating a slice out of duckdb.row is not supported SELECT r[q.col] FROM read_csv('../../data/web_page.csv') r, diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index 11c18862..4b1600aa 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -11,9 +11,22 @@ SELECT r['sepal.length'], r['file_row_number'], r['filename'] FROM read_parquet('../../data/iris.parquet', file_row_number => true, filename => true) r ORDER BY r['sepal.length'] LIMIT 5; --- Supports subscripts +-- Further subscripting is supported on duckdb.row SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable.parquet') r; +-- And not just on duckdb.row, but also on duckdb.unresolved_type (the parenth +SELECT jsoncol[1], arraycol[2] FROM ( + SELECT r['jsoncol'] jsoncol, r['arraycol'] arraycol + FROM read_parquet('../../data/indexable.parquet') r +); + +-- And the same for slice subscripts +SELECT r['arraycol'][1:2] FROM read_parquet('../../data/indexable.parquet') r; +SELECT arraycol[1:2] FROM ( + SELECT r['arraycol'] arraycol + FROM read_parquet('../../data/indexable.parquet') r +); + -- Subqueries correctly expand *, in case of multiple columns. 
SELECT * FROM ( SELECT 'something' as prefix, *, 'something else' as postfix @@ -134,6 +147,7 @@ SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.c SELECT r[NULL] FROM read_csv('../../data/web_page.csv') r limit 1; SELECT r[123] FROM read_csv('../../data/web_page.csv') r limit 1; SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r limit 1; +SELECT r['abc':'abc'] FROM read_csv('../../data/web_page.csv') r limit 1; SELECT r[q.col] FROM read_csv('../../data/web_page.csv') r, From 1b9f7d5bce7a347c8a74f75d1d6d9e9de8f39b28 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:16:26 +0100 Subject: [PATCH 33/40] Port to PG16 ruleutils --- src/vendor/pg_ruleutils_16.c | 71 ++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/src/vendor/pg_ruleutils_16.c b/src/vendor/pg_ruleutils_16.c index 7ed7ae78..601863f3 100644 --- a/src/vendor/pg_ruleutils_16.c +++ b/src/vendor/pg_ruleutils_16.c @@ -6027,12 +6027,23 @@ get_target_list(List *targetList, deparse_context *context, sep = " "; colno = 0; + + StarReconstructionContext star_reconstruction_context = {0}; + star_reconstruction_context.target_list = targetList; + + bool outermost_targetlist = !processed_targetlist; + processed_targetlist = true; + foreach(l, targetList) { TargetEntry *tle = (TargetEntry *) lfirst(l); char *colname; char *attname; + if (pgduckdb_reconstruct_star_step(&star_reconstruction_context, l)) { + continue; + } + if (tle->resjunk) continue; /* ignore junk entries */ @@ -6057,8 +6068,10 @@ get_target_list(List *targetList, deparse_context *context, * directly so that we can tell it to do the right thing, and so that * we can get the attribute name which is the default AS label. 
*/ + Var *var = NULL; if (tle->expr && (IsA(tle->expr, Var))) { + var = (Var *) tle->expr; attname = get_variable((Var *) tle->expr, 0, true, context); } else @@ -6084,8 +6097,33 @@ get_target_list(List *targetList, deparse_context *context, else colname = tle->resname; + /* + * This makes sure we don't add Postgres its bad default alias to the + * duckdb.row type. + */ + bool duckdb_skip_as = pgduckdb_var_is_duckdb_row(var); + + /* + * For r['abc'] expressions we don't want the column name to be r, but + * instead we want it to be "abc". We can only to do this for the + * target list of the outside most query though to make sure references + * to the column name are still valid. + */ + if (!duckdb_skip_as && outermost_targetlist) { + Var *subscript_var = pgduckdb_duckdb_row_subscript_var(tle->expr); + if (subscript_var) { + /* + * This cannot be moved to pgduckdb_ruleutils, because of + * the reliance on the non-public deparse_namespace type. + */ + deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, + subscript_var->varlevelsup); + duckdb_skip_as = !pgduckdb_subscript_has_custom_alias(dpns->plan, dpns->rtable, subscript_var, colname); + } + } + /* Show AS unless the column's name is correct as-is */ - if (colname) /* resname could be NULL */ + if (colname && !duckdb_skip_as) /* resname could be NULL */ { if (attname == NULL || strcmp(attname, colname) != 0) appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); @@ -7448,6 +7486,11 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) if (attnum > colinfo->num_cols) elog(ERROR, "invalid attnum %d for relation \"%s\"", attnum, rte->eref->aliasname); + + if (pgduckdb_var_is_duckdb_row(var)) { + return pgduckdb_write_row_refname(context->buf, refname, istoplevel); + } + attname = colinfo->colnames[attnum - 1]; /* @@ -8743,8 +8786,9 @@ get_rule_expr(Node *node, deparse_context *context, } else { + SubscriptingRef *new_sbsref = 
pgduckdb_strip_first_subscript(sbsref, context->buf); /* Just an ordinary container fetch, so print subscripts */ - printSubscripts(sbsref, context); + printSubscripts(new_sbsref, context); } } break; @@ -10075,6 +10119,11 @@ get_func_expr(FuncExpr *expr, deparse_context *context, nargs++; } + bool function_needs_subquery = pgduckdb_function_needs_subquery(funcoid); + if (function_needs_subquery) { + appendStringInfoString(buf, "(FROM "); + } + appendStringInfo(buf, "%s(", generate_function_name(funcoid, nargs, argnames, argtypes, @@ -10091,6 +10140,10 @@ get_func_expr(FuncExpr *expr, deparse_context *context, get_rule_expr((Node *) lfirst(l), context, true); } appendStringInfoChar(buf, ')'); + + if (function_needs_subquery) { + appendStringInfoChar(buf, ')'); + } } /* @@ -10669,6 +10722,8 @@ get_const_expr(Const *constval, deparse_context *context, int showtype) char *extval; bool needlabel = false; + showtype = pgduckdb_show_type(constval, showtype); + if (constval->constisnull) { /* @@ -11463,8 +11518,18 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) /* Print the relation alias, if needed */ get_rte_alias(rte, varno, false, context); + if (pgduckdb_func_returns_duckdb_row(rtfunc1)) { + /* + * We never want to print column aliases for functions that return + * a duckdb.row. The common pattern is for people to not provide an + * explicit column alias (i.e. "r" becomes "r(r)"). This obviously + * is a naming collision and DuckDB resolves that in the opposite + * way that we want. Never adding column aliases for duckdb.row + * avoids this conflict. 
+ */ + } /* Print the column definitions or aliases, if needed */ - if (rtfunc1 && rtfunc1->funccolnames != NIL) + else if (rtfunc1 && rtfunc1->funccolnames != NIL) { /* Reconstruct the columndef list, which is also the aliases */ get_from_clause_coldeflist(rtfunc1, colinfo, context); From bdcc965cd97674bff66c30e7cfecc5645205a021 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:23:05 +0100 Subject: [PATCH 34/40] Port to PG14 ruleutils --- src/vendor/pg_ruleutils_14.c | 71 ++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/src/vendor/pg_ruleutils_14.c b/src/vendor/pg_ruleutils_14.c index 572954ea..f9d59c47 100644 --- a/src/vendor/pg_ruleutils_14.c +++ b/src/vendor/pg_ruleutils_14.c @@ -5987,12 +5987,23 @@ get_target_list(List *targetList, deparse_context *context, sep = " "; colno = 0; + + StarReconstructionContext star_reconstruction_context = {0}; + star_reconstruction_context.target_list = targetList; + + bool outermost_targetlist = !processed_targetlist; + processed_targetlist = true; + foreach(l, targetList) { TargetEntry *tle = (TargetEntry *) lfirst(l); char *colname; char *attname; + if (pgduckdb_reconstruct_star_step(&star_reconstruction_context, l)) { + continue; + } + if (tle->resjunk) continue; /* ignore junk entries */ @@ -6017,8 +6028,10 @@ get_target_list(List *targetList, deparse_context *context, * directly so that we can tell it to do the right thing, and so that * we can get the attribute name which is the default AS label. */ + Var *var = NULL; if (tle->expr && (IsA(tle->expr, Var))) { + var = (Var *) tle->expr; attname = get_variable((Var *) tle->expr, 0, true, context); } else @@ -6044,8 +6057,33 @@ get_target_list(List *targetList, deparse_context *context, else colname = tle->resname; + /* + * This makes sure we don't add Postgres its bad default alias to the + * duckdb.row type. 
+ */ + bool duckdb_skip_as = pgduckdb_var_is_duckdb_row(var); + + /* + * For r['abc'] expressions we don't want the column name to be r, but + * instead we want it to be "abc". We can only to do this for the + * target list of the outside most query though to make sure references + * to the column name are still valid. + */ + if (!duckdb_skip_as && outermost_targetlist) { + Var *subscript_var = pgduckdb_duckdb_row_subscript_var(tle->expr); + if (subscript_var) { + /* + * This cannot be moved to pgduckdb_ruleutils, because of + * the reliance on the non-public deparse_namespace type. + */ + deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, + subscript_var->varlevelsup); + duckdb_skip_as = !pgduckdb_subscript_has_custom_alias(dpns->plan, dpns->rtable, subscript_var, colname); + } + } + /* Show AS unless the column's name is correct as-is */ - if (colname) /* resname could be NULL */ + if (colname && !duckdb_skip_as) /* resname could be NULL */ { if (attname == NULL || strcmp(attname, colname) != 0) appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); @@ -7285,6 +7323,11 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) if (attnum > colinfo->num_cols) elog(ERROR, "invalid attnum %d for relation \"%s\"", attnum, rte->eref->aliasname); + + if (pgduckdb_var_is_duckdb_row(var)) { + return pgduckdb_write_row_refname(context->buf, refname, istoplevel); + } + attname = colinfo->colnames[attnum - 1]; /* @@ -8598,8 +8641,9 @@ get_rule_expr(Node *node, deparse_context *context, } else { + SubscriptingRef *new_sbsref = pgduckdb_strip_first_subscript(sbsref, context->buf); /* Just an ordinary container fetch, so print subscripts */ - printSubscripts(sbsref, context); + printSubscripts(new_sbsref, context); } } break; @@ -9882,6 +9926,11 @@ get_func_expr(FuncExpr *expr, deparse_context *context, nargs++; } + bool function_needs_subquery = pgduckdb_function_needs_subquery(funcoid); + if 
(function_needs_subquery) { + appendStringInfoString(buf, "(FROM "); + } + appendStringInfo(buf, "%s(", generate_function_name(funcoid, nargs, argnames, argtypes, @@ -9898,6 +9947,10 @@ get_func_expr(FuncExpr *expr, deparse_context *context, get_rule_expr((Node *) lfirst(l), context, true); } appendStringInfoChar(buf, ')'); + + if (function_needs_subquery) { + appendStringInfoChar(buf, ')'); + } } /* @@ -10414,6 +10467,8 @@ get_const_expr(Const *constval, deparse_context *context, int showtype) char *extval; bool needlabel = false; + showtype = pgduckdb_show_type(constval, showtype); + if (constval->constisnull) { /* @@ -11056,8 +11111,18 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) /* Print the relation alias, if needed */ get_rte_alias(rte, varno, false, context); + if (pgduckdb_func_returns_duckdb_row(rtfunc1)) { + /* + * We never want to print column aliases for functions that return + * a duckdb.row. The common pattern is for people to not provide an + * explicit column alias (i.e. "r" becomes "r(r)"). This obviously + * is a naming collision and DuckDB resolves that in the opposite + * way that we want. Never adding column aliases for duckdb.row + * avoids this conflict. 
+ */ + } /* Print the column definitions or aliases, if needed */ - if (rtfunc1 && rtfunc1->funccolnames != NIL) + else if (rtfunc1 && rtfunc1->funccolnames != NIL) { /* Reconstruct the columndef list, which is also the aliases */ get_from_clause_coldeflist(rtfunc1, colinfo, context); From ac2bdf9ab8e9ea4d9bdfa524b88acec5207effb5 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:23:44 +0100 Subject: [PATCH 35/40] Add back removed comment --- src/vendor/pg_ruleutils_17.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vendor/pg_ruleutils_17.c b/src/vendor/pg_ruleutils_17.c index 26ec23ca..31a17e99 100644 --- a/src/vendor/pg_ruleutils_17.c +++ b/src/vendor/pg_ruleutils_17.c @@ -6137,7 +6137,7 @@ get_target_list(List *targetList, deparse_context *context, } /* Show AS unless the column's name is correct as-is */ - if (colname && !duckdb_skip_as) + if (colname && !duckdb_skip_as) /* resname could be NULL */ { if (attname == NULL || strcmp(attname, colname) != 0) appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); From f24fc1996c5a693f99f97cdbec53e8896c28f2ab Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:29:59 +0100 Subject: [PATCH 36/40] Port to PG15 ruleutils --- src/vendor/pg_ruleutils_15.c | 71 ++++++++++++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 3 deletions(-) diff --git a/src/vendor/pg_ruleutils_15.c b/src/vendor/pg_ruleutils_15.c index 42cabfc1..ff58ef00 100644 --- a/src/vendor/pg_ruleutils_15.c +++ b/src/vendor/pg_ruleutils_15.c @@ -6068,12 +6068,23 @@ get_target_list(List *targetList, deparse_context *context, sep = " "; colno = 0; + + StarReconstructionContext star_reconstruction_context = {0}; + star_reconstruction_context.target_list = targetList; + + bool outermost_targetlist = !processed_targetlist; + processed_targetlist = true; + foreach(l, targetList) { TargetEntry *tle = (TargetEntry *) lfirst(l); char *colname; char *attname; + if 
(pgduckdb_reconstruct_star_step(&star_reconstruction_context, l)) { + continue; + } + if (tle->resjunk) continue; /* ignore junk entries */ @@ -6098,8 +6109,10 @@ get_target_list(List *targetList, deparse_context *context, * directly so that we can tell it to do the right thing, and so that * we can get the attribute name which is the default AS label. */ + Var *var = NULL; if (tle->expr && (IsA(tle->expr, Var))) { + var = (Var *) tle->expr; attname = get_variable((Var *) tle->expr, 0, true, context); } else @@ -6125,8 +6138,33 @@ get_target_list(List *targetList, deparse_context *context, else colname = tle->resname; + /* + * This makes sure we don't add Postgres its bad default alias to the + * duckdb.row type. + */ + bool duckdb_skip_as = pgduckdb_var_is_duckdb_row(var); + + /* + * For r['abc'] expressions we don't want the column name to be r, but + * instead we want it to be "abc". We can only to do this for the + * target list of the outside most query though to make sure references + * to the column name are still valid. + */ + if (!duckdb_skip_as && outermost_targetlist) { + Var *subscript_var = pgduckdb_duckdb_row_subscript_var(tle->expr); + if (subscript_var) { + /* + * This cannot be moved to pgduckdb_ruleutils, because of + * the reliance on the non-public deparse_namespace type. 
+ */ + deparse_namespace* dpns = (deparse_namespace *) list_nth(context->namespaces, + subscript_var->varlevelsup); + duckdb_skip_as = !pgduckdb_subscript_has_custom_alias(dpns->plan, dpns->rtable, subscript_var, colname); + } + } + /* Show AS unless the column's name is correct as-is */ - if (colname) /* resname could be NULL */ + if (colname && !duckdb_skip_as) /* resname could be NULL */ { if (attname == NULL || strcmp(attname, colname) != 0) appendStringInfo(&targetbuf, " AS %s", quote_identifier(colname)); @@ -7488,6 +7526,11 @@ get_variable(Var *var, int levelsup, bool istoplevel, deparse_context *context) if (attnum > colinfo->num_cols) elog(ERROR, "invalid attnum %d for relation \"%s\"", attnum, rte->eref->aliasname); + + if (pgduckdb_var_is_duckdb_row(var)) { + return pgduckdb_write_row_refname(context->buf, refname, istoplevel); + } + attname = colinfo->colnames[attnum - 1]; /* @@ -8801,8 +8844,9 @@ get_rule_expr(Node *node, deparse_context *context, } else { + SubscriptingRef *new_sbsref = pgduckdb_strip_first_subscript(sbsref, context->buf); /* Just an ordinary container fetch, so print subscripts */ - printSubscripts(sbsref, context); + printSubscripts(new_sbsref, context); } } break; @@ -10084,6 +10128,11 @@ get_func_expr(FuncExpr *expr, deparse_context *context, nargs++; } + bool function_needs_subquery = pgduckdb_function_needs_subquery(funcoid); + if (function_needs_subquery) { + appendStringInfoString(buf, "(FROM "); + } + appendStringInfo(buf, "%s(", generate_function_name(funcoid, nargs, argnames, argtypes, @@ -10100,6 +10149,10 @@ get_func_expr(FuncExpr *expr, deparse_context *context, get_rule_expr((Node *) lfirst(l), context, true); } appendStringInfoChar(buf, ')'); + + if (function_needs_subquery) { + appendStringInfoChar(buf, ')'); + } } /* @@ -10616,6 +10669,8 @@ get_const_expr(Const *constval, deparse_context *context, int showtype) char *extval; bool needlabel = false; + showtype = pgduckdb_show_type(constval, showtype); + if 
(constval->constisnull) { /* @@ -11258,8 +11313,18 @@ get_from_clause_item(Node *jtnode, Query *query, deparse_context *context) /* Print the relation alias, if needed */ get_rte_alias(rte, varno, false, context); + if (pgduckdb_func_returns_duckdb_row(rtfunc1)) { + /* + * We never want to print column aliases for functions that return + * a duckdb.row. The common pattern is for people to not provide an + * explicit column alias (i.e. "r" becomes "r(r)"). This obviously + * is a naming collision and DuckDB resolves that in the opposite + * way that we want. Never adding column aliases for duckdb.row + * avoids this conflict. + */ + } /* Print the column definitions or aliases, if needed */ - if (rtfunc1 && rtfunc1->funccolnames != NIL) + else if (rtfunc1 && rtfunc1->funccolnames != NIL) { /* Reconstruct the columndef list, which is also the aliases */ get_from_clause_coldeflist(rtfunc1, colinfo, context); From 89e2849cd4317a075bf2fb24ece7be208218e5fd Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:32:24 +0100 Subject: [PATCH 37/40] Add missing include for PG below 17 --- src/pgduckdb_options.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index 93d92762..dba6e306 100644 --- a/src/pgduckdb_options.cpp +++ b/src/pgduckdb_options.cpp @@ -29,6 +29,8 @@ extern "C" { #include "utils/snapmgr.h" #include "nodes/print.h" + +#include "pgduckdb/vendor/pg_list.hpp" } #include "pgduckdb/pgduckdb_options.hpp" From c41ea06ee37daabbc6d2638efbae74a6ce06b502 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:41:19 +0100 Subject: [PATCH 38/40] Add some PG14 version specific code --- src/pgduckdb_ddl.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/pgduckdb_ddl.cpp b/src/pgduckdb_ddl.cpp index 6b2e0c8e..eb755809 100644 --- a/src/pgduckdb_ddl.cpp +++ b/src/pgduckdb_ddl.cpp @@ -91,7 +91,11 @@ WrapQueryInDuckdbQueryCall(Query *query, List *target_list) { 
ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("BUG: pg_duckdb generated a command with multiple queries"))); +#if PG_VERSION_NUM >= 150000 return parse_analyze_fixedparams((RawStmt *)linitial(parsetree_list), buf->data, NULL, 0, NULL); +#else + return parse_analyze((RawStmt *)linitial(parsetree_list), buf->data, NULL, 0, NULL); +#endif } static void From 1b2d1cf187b8879b76a462a429b7d4d3004ff8bc Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Tue, 21 Jan 2025 20:46:33 +0100 Subject: [PATCH 39/40] Add subquery aliases to make PG15 & PG14 happy --- test/regression/expected/read_functions.out | 22 ++++++++++----------- test/regression/sql/read_functions.sql | 22 ++++++++++----------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index f75dbb1b..61dfa755 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -40,7 +40,7 @@ SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable SELECT jsoncol[1], arraycol[2] FROM ( SELECT r['jsoncol'] jsoncol, r['arraycol'] arraycol FROM read_parquet('../../data/indexable.parquet') r -); +) q; jsoncol | arraycol ---------+---------- "d" | 22 @@ -56,7 +56,7 @@ SELECT r['arraycol'][1:2] FROM read_parquet('../../data/indexable.parquet') r; SELECT arraycol[1:2] FROM ( SELECT r['arraycol'] arraycol FROM read_parquet('../../data/indexable.parquet') r -); +) q; arraycol ---------- {11,22} @@ -67,7 +67,7 @@ SELECT * FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix -----------+--------------+-------------+--------------+-------------+---------+---------------- something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else @@ -83,7 +83,7 @@ SELECT r FROM ( SELECT 'something' as 
prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix -----------+--------------+-------------+--------------+-------------+---------+---------------- something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else @@ -95,7 +95,7 @@ SELECT prefix, r FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix -----------+--------------+-------------+--------------+-------------+---------+---------------- something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else @@ -105,7 +105,7 @@ SELECT prefix, r, postfix FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix -----------+--------------+-------------+--------------+-------------+---------+---------------- something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else @@ -119,7 +119,7 @@ SELECT r, postfix FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix | postfix -----------+--------------+-------------+--------------+-------------+---------+----------------+---------------- something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else | something else @@ -130,7 +130,7 @@ SELECT postfix, r, prefix FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; postfix | prefix | sepal.length | sepal.width | petal.length | petal.width | variety | postfix | prefix 
----------------+-----------+--------------+-------------+--------------+-------------+---------+----------------+----------- something else | something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | something else | something @@ -169,7 +169,7 @@ SELECT * FROM ( FROM read_parquet('../../data/iris.parquet') r, read_parquet('../../data/unsigned_types.parquet') r2 LIMIT 1 -); +) q; prefix | sepal.length | sepal.width | petal.length | petal.width | variety | utinyint | usmallint | uinteger | postfix -----------+--------------+-------------+--------------+-------------+---------+----------+-----------+------------+---------------- something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else @@ -241,14 +241,14 @@ SELECT r['column00'] AS r FROM read_csv('../../data/web_page.csv') r limit 1; -- be executed. To avoid that we simply don't rename a subscript expression -- inside a subquery, and only do so in the outermost SELECT list (aka -- targetlist). -SELECT * FROM (SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1); +SELECT * FROM (SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1) q; r --- 1 (1 row) -- If you give it a different alias then that alias is propegated though. 
-SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1); +SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1) q; col1 ------ 1 diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index 4b1600aa..82d32db5 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -18,21 +18,21 @@ SELECT r['jsoncol'][1], r['arraycol'][2] FROM read_parquet('../../data/indexable SELECT jsoncol[1], arraycol[2] FROM ( SELECT r['jsoncol'] jsoncol, r['arraycol'] arraycol FROM read_parquet('../../data/indexable.parquet') r -); +) q; -- And the same for slice subscripts SELECT r['arraycol'][1:2] FROM read_parquet('../../data/indexable.parquet') r; SELECT arraycol[1:2] FROM ( SELECT r['arraycol'] arraycol FROM read_parquet('../../data/indexable.parquet') r -); +) q; -- Subqueries correctly expand *, in case of multiple columns. SELECT * FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; -- NOTE: A single "r" is equivalent to a *. The prefix and postfix columns are -- not explicitely selected, but still show up in the result. This is @@ -44,7 +44,7 @@ SELECT r FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; -- ... but if you manually add the expected columns then they are merged into -- the new star expansion. @@ -52,12 +52,12 @@ SELECT prefix, r FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; SELECT prefix, r, postfix FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; -- This requires the prefix columns to be there though. 
If the prefix columns -- are not there the postfix columns don't get merged into the new star @@ -67,14 +67,14 @@ SELECT r, postfix FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; -- If you swap them around, they will get duplicated though. For the SELECT postfix, r, prefix FROM ( SELECT 'something' as prefix, *, 'something else' as postfix FROM read_parquet('../../data/iris.parquet') r LIMIT 1 -); +) q; -- Joining two subqueries a single * works as expected. SELECT * @@ -101,7 +101,7 @@ SELECT * FROM ( FROM read_parquet('../../data/iris.parquet') r, read_parquet('../../data/unsigned_types.parquet') r2 LIMIT 1 -); +) q; -- read_csv @@ -138,10 +138,10 @@ SELECT r['column00'] AS r FROM read_csv('../../data/web_page.csv') r limit 1; -- be executed. To avoid that we simply don't rename a subscript expression -- inside a subquery, and only do so in the outermost SELECT list (aka -- targetlist). -SELECT * FROM (SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1); +SELECT * FROM (SELECT r['column00'] FROM read_csv('../../data/web_page.csv') r limit 1) q; -- If you give it a different alias then that alias is propegated though. 
-SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1); +SELECT * FROM (SELECT r['column00'] AS col1 FROM read_csv('../../data/web_page.csv') r limit 1) q; -- Only simple string literals are supported as column names SELECT r[NULL] FROM read_csv('../../data/web_page.csv') r limit 1; From c890f35f77a94bcdc4ef22206b1497730b21ffd9 Mon Sep 17 00:00:00 2001 From: Jelte Fennema-Nio Date: Thu, 23 Jan 2025 18:38:25 +0100 Subject: [PATCH 40/40] Address review --- src/pgduckdb_ddl.cpp | 7 +- src/pgduckdb_hooks.cpp | 17 ++- src/pgduckdb_options.cpp | 129 ++++++++++++-------- test/regression/expected/read_functions.out | 23 ++++ test/regression/sql/read_functions.sql | 9 ++ 5 files changed, 131 insertions(+), 54 deletions(-) diff --git a/src/pgduckdb_ddl.cpp b/src/pgduckdb_ddl.cpp index eb755809..8a85a6ed 100644 --- a/src/pgduckdb_ddl.cpp +++ b/src/pgduckdb_ddl.cpp @@ -151,8 +151,6 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para } /* NOTE: The below code is mostly copied from ExecCreateTableAs */ - List *rewritten; - Query *query = (Query *)copyObjectImpl(original_query); /* * Parse analysis was done already, but we still have to run the rule @@ -160,11 +158,12 @@ DuckdbHandleDDL(PlannedStmt *pstmt, const char *query_string, ParamListInfo para * either came straight from the parser, or suitable locks were * acquired by plancache.c. 
*/ - rewritten = QueryRewrite(query); + List *rewritten = QueryRewrite(query); /* SELECT should never rewrite to more or less than one SELECT query */ if (list_length(rewritten) != 1) - elog(ERROR, "unexpected rewrite result for CREATE TABLE AS SELECT"); + elog(ERROR, "unexpected rewrite result for CREATE TABLE AS SELECT, contains %d queries", + list_length(rewritten)); query = linitial_node(Query, rewritten); Assert(query->commandType == CMD_SELECT); diff --git a/src/pgduckdb_hooks.cpp b/src/pgduckdb_hooks.cpp index b148bbba..e9e0d1cb 100644 --- a/src/pgduckdb_hooks.cpp +++ b/src/pgduckdb_hooks.cpp @@ -362,13 +362,28 @@ DuckdbEmitLogHook(ErrorData *edata) { if (edata->elevel == ERROR && edata->sqlerrcode == ERRCODE_UNDEFINED_COLUMN && pgduckdb::IsExtensionRegistered()) { /* * XXX: It would be nice if we could check if the query contained any - * of the functions, but we don't have access to the query string here. + * of the functions. We could probably check the debug_query_string + * global for this. For now we don't consider that too important though. * So instead we simply always add this HINT for this specific error if * the pg_duckdb extension is installed. */ edata->hint = pstrdup( "If you use DuckDB functions like read_parquet, you need to use the r['colname'] syntax to use columns. If " "you're already doing that, maybe you forgot to to give the function the r alias."); + } else if (edata->elevel == ERROR && edata->sqlerrcode == ERRCODE_SYNTAX_ERROR && + pgduckdb::IsExtensionRegistered() && + strcmp(edata->message_id, + "a column definition list is only allowed for functions returning \"record\"") == 0) { + /* + * NOTE: We can probably remove this hint after a few releases once + * we've updated all known blogposts that still used the old syntax. + * + * Similarly to the other hint, this could check for actual usages of + * the relevant DuckDB functions. 
+ */ + edata->hint = pstrdup( + "If you use DuckDB functions like read_parquet, you need to use the r['colname'] syntax introduced " + "in pg_duckdb 0.3.0. It seems like you might be using the outdated \"AS (colname coltype, ...)\" syntax"); } } diff --git a/src/pgduckdb_options.cpp b/src/pgduckdb_options.cpp index dba6e306..00f34683 100644 --- a/src/pgduckdb_options.cpp +++ b/src/pgduckdb_options.cpp @@ -375,6 +375,79 @@ DECLARE_PG_FUNCTION(pgduckdb_recycle_ddb) { PG_RETURN_BOOL(true); } +Node * +CoerceRowSubscriptToText(struct ParseState *pstate, A_Indices *subscript) { + if (!subscript->uidx) { + elog(ERROR, "Creating a slice out of duckdb.row is not supported"); + } + + Node *subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind); + int expr_location = exprLocation(subscript->uidx); + Oid subscript_expr_type = exprType(subscript_expr); + + if (subscript->lidx) { + elog(ERROR, "Creating a slice out of duckdb.row is not supported"); + } + + Node *coerced_expr = coerce_to_target_type(pstate, subscript_expr, subscript_expr_type, TEXTOID, -1, + COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, expr_location); + if (!coerced_expr) { + ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must have text type"), + parser_errposition(pstate, expr_location))); + } + + if (!IsA(subscript_expr, Const)) { + ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must be a constant"), + parser_errposition(pstate, expr_location))); + } + + Const *subscript_const = castNode(Const, subscript_expr); + if (subscript_const->constisnull) { + ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("duckdb.row subscript cannot be NULL"), + parser_errposition(pstate, expr_location))); + } + + return coerced_expr; +} + +/* + * In Postgres all index operations in a row ar all slices or all plain + * index operations. If you mix them, all are converted to slices. 
+ * There's no difference in representation possible between + * "col[1:2][1]" and "col[1:2][1:]". If you want this seperation you + * need to use parenthesis to seperate: "(col[1:2])[1]" + * This might seem like fairly strange behaviour, but Postgres uses + * this to be able to slice in multi-dimensional arrays and thtis + * behaviour is documented here: + * https://www.postgresql.org/docs/current/arrays.html#ARRAYS-ACCESSING + * + * This is different from DuckDB, but there's not much we can do about + * that. So we'll have this same behaviour by, which means we need to always + * add the lower subscript to the slice. The lower subscript will be NULL in + * that case. + * + * See also comments on SubscriptingRef in nodes/subscripting.h + */ +void +AddSubscriptExpressions(SubscriptingRef *sbsref, struct ParseState *pstate, A_Indices *subscript, bool isSlice) { + Assert(isSlice || subscript->uidx); + + Node *upper_subscript_expr = NULL; + if (subscript->uidx) { + upper_subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind); + } + + sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, upper_subscript_expr); + + if (isSlice) { + Node *lower_subscript_expr = NULL; + if (subscript->uidx) { + lower_subscript_expr = transformExpr(pstate, subscript->lidx, pstate->p_expr_kind); + } + sbsref->reflowerindexpr = lappend(sbsref->reflowerindexpr, lower_subscript_expr); + } +} + /* * DuckdbRowSubscriptTransform is called by the parser when a subscripting * operation is performed on a duckdb.row. It has two main puprposes: @@ -405,50 +478,19 @@ DuckdbRowSubscriptTransform(SubscriptingRef *sbsref, List *indirection, struct P // Transform each subscript expression foreach_node(A_Indices, subscript, indirection) { - Assert(subscript->uidx); - - Node *subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind); - /* The first subscript needs to be a TEXT constant, since it should be * a column reference. 
But the subscripts after that can be anything, * DuckDB should interpret those. */ if (first) { - int expr_location = exprLocation(subscript->uidx); - Oid subscript_expr_type = exprType(subscript_expr); - - if (subscript->lidx) { - elog(ERROR, "Creating a slice out of duckdb.row is not supported"); + sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, CoerceRowSubscriptToText(pstate, subscript)); + if (isSlice) { + sbsref->reflowerindexpr = lappend(sbsref->reflowerindexpr, NULL); } - - Node *coerced_expr = coerce_to_target_type(pstate, subscript_expr, subscript_expr_type, TEXTOID, -1, - COERCION_IMPLICIT, COERCE_IMPLICIT_CAST, expr_location); - if (!coerced_expr) { - ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must have text type"), - parser_errposition(pstate, expr_location))); - } - - if (!IsA(subscript_expr, Const)) { - ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("duckdb.row subscript must be a constant"), - parser_errposition(pstate, expr_location))); - } - - Const *subscript_const = castNode(Const, subscript_expr); - if (subscript_const->constisnull) { - ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("duckdb.row subscript cannot be NULL"), - parser_errposition(pstate, expr_location))); - } - - subscript_expr = coerced_expr; first = false; + continue; } - sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, subscript_expr); - if (isSlice) { - Node *lower_subscript_expr = NULL; - if (subscript->uidx) { - lower_subscript_expr = transformExpr(pstate, subscript->lidx, pstate->p_expr_kind); - } - sbsref->reflowerindexpr = lappend(sbsref->reflowerindexpr, lower_subscript_expr); - } + + AddSubscriptExpressions(sbsref, pstate, subscript, isSlice); } // Set the result type of the subscripting operation @@ -508,18 +550,7 @@ DuckdbUnresolvedTypeSubscriptTransform(SubscriptingRef *sbsref, List *indirectio // Transform each subscript expression foreach_node(A_Indices, subscript, indirection) { 
- Assert(subscript->uidx); - - Node *subscript_expr = transformExpr(pstate, subscript->uidx, pstate->p_expr_kind); - - sbsref->refupperindexpr = lappend(sbsref->refupperindexpr, subscript_expr); - if (isSlice) { - Node *lower_subscript_expr = NULL; - if (subscript->uidx) { - lower_subscript_expr = transformExpr(pstate, subscript->lidx, pstate->p_expr_kind); - } - sbsref->reflowerindexpr = lappend(sbsref->reflowerindexpr, lower_subscript_expr); - } + AddSubscriptExpressions(sbsref, pstate, subscript, isSlice); } // Set the result type of the subscripting operation diff --git a/test/regression/expected/read_functions.out b/test/regression/expected/read_functions.out index 61dfa755..4684c971 100644 --- a/test/regression/expected/read_functions.out +++ b/test/regression/expected/read_functions.out @@ -62,6 +62,21 @@ SELECT arraycol[1:2] FROM ( {11,22} (1 row) +SELECT r['arraycol'][:] FROM read_parquet('../../data/indexable.parquet') r; + r.arraycol[:] +--------------- + {11,22,33} +(1 row) + +SELECT arraycol[:] FROM ( + SELECT r['arraycol'] arraycol + FROM read_parquet('../../data/indexable.parquet') r +) q; + arraycol +------------ + {11,22,33} +(1 row) + -- Subqueries correctly expand *, in case of multiple columns. SELECT * FROM ( SELECT 'something' as prefix, *, 'something else' as postfix @@ -175,6 +190,12 @@ SELECT * FROM ( something | 5.1 | 3.5 | 1.4 | 0.2 | Setosa | 255 | 65535 | 4294967295 | something else (1 row) +-- We show a hint for the new syntax when someone uses the old syntax. +SELECT count("sepal.length") FROM read_parquet('../../data/iris.parquet') AS ("sepal.length" FLOAT); +ERROR: a column definition list is only allowed for functions returning "record" +LINE 1: ... FROM read_parquet('../../data/iris.parquet') AS ("sepal.len... + ^ +HINT: If you use DuckDB functions like read_parquet, you need to use the r['colname'] syntax introduced in pg_duckdb 0.3.0. 
It seems like you might be using the outdated "AS (colname coltype, ...)" syntax -- read_csv SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; count @@ -269,6 +290,8 @@ LINE 1: SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r li... ^ SELECT r['abc':'abc'] FROM read_csv('../../data/web_page.csv') r limit 1; ERROR: Creating a slice out of duckdb.row is not supported +SELECT r[:] FROM read_csv('../../data/web_page.csv') r limit 1; +ERROR: Creating a slice out of duckdb.row is not supported SELECT r[q.col] FROM read_csv('../../data/web_page.csv') r, diff --git a/test/regression/sql/read_functions.sql b/test/regression/sql/read_functions.sql index 82d32db5..3b9633bb 100644 --- a/test/regression/sql/read_functions.sql +++ b/test/regression/sql/read_functions.sql @@ -26,6 +26,11 @@ SELECT arraycol[1:2] FROM ( SELECT r['arraycol'] arraycol FROM read_parquet('../../data/indexable.parquet') r ) q; +SELECT r['arraycol'][:] FROM read_parquet('../../data/indexable.parquet') r; +SELECT arraycol[:] FROM ( + SELECT r['arraycol'] arraycol + FROM read_parquet('../../data/indexable.parquet') r +) q; -- Subqueries correctly expand *, in case of multiple columns. SELECT * FROM ( @@ -103,6 +108,9 @@ SELECT * FROM ( LIMIT 1 ) q; +-- We show a hint for the new syntax when someone uses the old syntax. +SELECT count("sepal.length") FROM read_parquet('../../data/iris.parquet') AS ("sepal.length" FLOAT); + -- read_csv SELECT count(r['sepal.length']) FROM read_csv('../../data/iris.csv') r; @@ -148,6 +156,7 @@ SELECT r[NULL] FROM read_csv('../../data/web_page.csv') r limit 1; SELECT r[123] FROM read_csv('../../data/web_page.csv') r limit 1; SELECT r[3.14] FROM read_csv('../../data/web_page.csv') r limit 1; SELECT r['abc':'abc'] FROM read_csv('../../data/web_page.csv') r limit 1; +SELECT r[:] FROM read_csv('../../data/web_page.csv') r limit 1; SELECT r[q.col] FROM read_csv('../../data/web_page.csv') r,