diff --git a/dlt/cli/init_command.py b/dlt/cli/init_command.py index 75cb74952c..531204e76f 100644 --- a/dlt/cli/init_command.py +++ b/dlt/cli/init_command.py @@ -11,7 +11,7 @@ from dlt.common.configuration import is_secret_hint, DOT_DLT, make_dot_dlt_path from dlt.common.configuration.providers import CONFIG_TOML, SECRETS_TOML, ConfigTomlProvider, SecretsTomlProvider from dlt.version import DLT_PKG_NAME, __version__ -from dlt.common.normalizers.names.snake_case import normalize_schema_name +from dlt.common.normalizers.names.snake_case import normalize_identifier from dlt.common.destination import DestinationReference from dlt.common.reflection.utils import creates_func_def_name_node, rewrite_python_script from dlt.common.schema.exceptions import InvalidSchemaName @@ -131,7 +131,7 @@ def init_command(pipeline_name: str, destination_name: str, use_generic_template init_script_name = pipeline_script # normalize source name - norm_source_name = normalize_schema_name(pipeline_name) + norm_source_name = normalize_identifier(pipeline_name) if norm_source_name != pipeline_name: raise InvalidSchemaName(pipeline_name, norm_source_name) dest_pipeline_script = norm_source_name + ".py" diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index dbfec8d84c..10693be69f 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -63,8 +63,8 @@ def _flatten(schema: Schema, table: str, dict_row: TDataItemRow, _r_lvl: int) -> def norm_row_dicts(dict_row: StrAny, __r_lvl: int, parent_name: Optional[str]) -> None: for k, v in dict_row.items(): - corrected_k = schema.normalize_column_name(k) - child_name = corrected_k if not parent_name else schema.normalize_make_path(parent_name, corrected_k) + corrected_k = schema.naming.normalize_identifier(k) + child_name = corrected_k if not parent_name else schema.naming.normalize_make_path(parent_name, corrected_k) # for lists and dicts we must check if type is possibly complex if isinstance(v, (dict, list)): if not _is_complex_type(schema, table, child_name, __r_lvl): @@ -143,7 +143,7 @@ def _normalize_list( yield from _normalize_row(schema, v, extend, table, parent_table, parent_row_id, idx, _r_lvl) elif isinstance(v, list): # normalize lists of lists, we assume all lists in the list have the same type so they should go to the same table - list_table_name = schema.normalize_make_path(table, "list") + list_table_name = schema.naming.normalize_make_path(table, "list") yield from _normalize_list(schema, v, extend, list_table_name, parent_table, parent_row_id, _r_lvl + 1) else: # list of simple types @@ -197,11 +197,11 @@ def _normalize_row( # normalize and yield lists for k, list_content in lists.items(): - yield from _normalize_list(schema, list_content, extend, schema.normalize_make_path(table, k), table, row_id, _r_lvl + 1) + yield from _normalize_list(schema, list_content, extend, schema.naming.normalize_make_path(table, k), table, row_id, _r_lvl + 1) def _validate_normalizer_config(schema: Schema, config: RelationalNormalizerConfig) -> None: - validate_dict(RelationalNormalizerConfig, config, "./normalizers/json/config", validator_f=column_name_validator(schema.normalize_column_name)) + validate_dict(RelationalNormalizerConfig, config, "./normalizers/json/config", validator_f=column_name_validator(schema.naming)) def update_normalizer_config(schema: Schema, config: RelationalNormalizerConfig) -> None: @@ -247,4 +247,4 @@ def normalize_data_item(schema: Schema, item: TDataItem, 
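For context, the renamed function keeps the old contract in `init_command`: the pipeline name must already be in normalized form, otherwise `InvalidSchemaName` is raised. A quick sketch of the check, assuming the snake_case module is imported directly:

from dlt.common.normalizers.names.snake_case import normalize_identifier

assert normalize_identifier("my_pipeline") == "my_pipeline"   # unchanged, so accepted
assert normalize_identifier("MyPipeline") == "my_pipeline"    # differs from the input, so init_command raises InvalidSchemaName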
load_id: str, table_nam row = cast(TDataItemRowRoot, item) # identify load id if loaded data must be processed after loading incrementally row["_dlt_load_id"] = load_id - yield from _normalize_row(schema, cast(TDataItemRowChild, row), {}, schema.normalize_table_name(table_name)) + yield from _normalize_row(schema, cast(TDataItemRowChild, row), {}, schema.naming.normalize_identifier(table_name)) diff --git a/dlt/common/normalizers/names/__init__.py b/dlt/common/normalizers/names/__init__.py index d7c9a76eec..6d774c95b5 100644 --- a/dlt/common/normalizers/names/__init__.py +++ b/dlt/common/normalizers/names/__init__.py @@ -1,8 +1,2 @@ -from typing import Callable, Sequence +from .typing import NamingConvention -# function signature to normalize names -TNormalizeNameFunc = Callable[[str], str] -# function signature to make paths -TNormalizeMakePath = Callable[..., str] -# function signature to break path into components -TNormalizeBreakPath = Callable[[str], Sequence[str]] diff --git a/dlt/common/normalizers/names/snake_case.py b/dlt/common/normalizers/names/snake_case.py index f7569ee61f..6e0706d81e 100644 --- a/dlt/common/normalizers/names/snake_case.py +++ b/dlt/common/normalizers/names/snake_case.py @@ -18,43 +18,41 @@ PATH_SEPARATOR = "__" -# fix a name so it's acceptable as database table name +def camel_to_snake(name: str) -> str: + name = SNAKE_CASE_BREAK_1.sub(r'\1_\2', name) + return SNAKE_CASE_BREAK_2.sub(r'\1_\2', name).lower() + + @lru_cache(maxsize=None) -def normalize_table_name(name: str) -> str: - if not name: - raise ValueError(name) +def normalize_path(path: str) -> str: + """Breaks path into identifiers using PATH_SEPARATOR, normalizes components and reconstitutes the path""" + return normalize_make_path(*map(normalize_identifier, normalize_break_path(path))) - def camel_to_snake(name: str) -> str: - name = SNAKE_CASE_BREAK_1.sub(r'\1_\2', name) - return SNAKE_CASE_BREAK_2.sub(r'\1_\2', name).lower() +# fix a name so it's an acceptable name for a database column +@lru_cache(maxsize=None) +def normalize_identifier(name: str) -> str: + """Normalizes the identifier according to naming convention represented by this function""" + if not name: + raise ValueError(name) # all characters that are not letters digits or a few special chars are replaced with underscore # then convert to snake case name = camel_to_snake(RE_NON_ALPHANUMERIC.sub("_", name)) # leading digits will be prefixed if RE_LEADING_DIGITS.match(name): name = "_" + name - # max 2 consecutive underscores are allowed - return RE_DOUBLE_UNDERSCORES.sub("__", name) - -# fix a name so it's an acceptable name for a database column -@lru_cache(maxsize=None) -def normalize_column_name(name: str) -> str: # replace consecutive underscores with single one to prevent name clashes with PATH_SEPARATOR - return RE_UNDERSCORES.sub("_", normalize_table_name(name)) - - -# fix a name so it is acceptable as schema name -def normalize_schema_name(name: str) -> str: - return normalize_column_name(name) + return RE_UNDERSCORES.sub("_", name) -# build full db dataset (dataset) name out of (normalized) default dataset and schema name def normalize_make_dataset_name(dataset_name: str, default_schema_name: str, schema_name: str) -> str: + """Builds full db dataset (dataset) name out of (normalized) default dataset and schema name""" if not schema_name: - raise ValueError("schema_name is None") - norm_name = normalize_schema_name(dataset_name) + raise ValueError("schema_name is None or empty") + if not dataset_name: + raise 
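The relational normalizer now reaches every naming helper through `schema.naming`. A sketch of how nested table names are composed under the default snake_case convention, using only the module-level functions shown above (the resulting `issues__labels` name matches the GitHub test schema further down):

from dlt.common.normalizers.names import snake_case

root = snake_case.normalize_identifier("Issues")   # 'issues'
# a child table name is the parent path plus the normalized key
child = snake_case.normalize_make_path(root, snake_case.normalize_identifier("labels"))
assert child == "issues__labels"
# lists of lists get a synthetic "list" component, as in _normalize_list
assert snake_case.normalize_make_path(child, "list") == "issues__labels__list"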
ValueError("dataset_name is None or empty") + norm_name = normalize_identifier(dataset_name) if norm_name != dataset_name: raise InvalidDatasetName(dataset_name, norm_name) # if default schema is None then suffix is not added @@ -64,13 +62,13 @@ def normalize_make_dataset_name(dataset_name: str, default_schema_name: str, sch return norm_name -# this function builds path out of path elements using PATH_SEPARATOR -def normalize_make_path(*elems: Any) -> str: - return PATH_SEPARATOR.join(elems) +def normalize_make_path(*identifiers: Any) -> str: + """Builds path out of path identifiers using PATH_SEPARATOR. Identifiers are not normalized""" + return PATH_SEPARATOR.join(identifiers) -# this function break path into elements def normalize_break_path(path: str) -> Sequence[str]: + """Breaks path into sequence of identifiers""" return path.split(PATH_SEPARATOR) diff --git a/dlt/common/normalizers/names/typing.py b/dlt/common/normalizers/names/typing.py new file mode 100644 index 0000000000..7440d66963 --- /dev/null +++ b/dlt/common/normalizers/names/typing.py @@ -0,0 +1,25 @@ +from typing import Any, Protocol, Sequence + + +class NamingConvention(Protocol): + PATH_SEPARATOR: str + + def normalize_identifier(self, name: str) -> str: + """Normalizes the identifier according to the naming convention""" + ... + + def normalize_path(self, path: str) -> str: + """Breaks path into identifiers using PATH_SEPARATOR, normalizes components and reconstitutes the path""" + ... + + def normalize_make_path(self, *identifiers: Any) -> str: + """Builds path out of path identifiers using PATH_SEPARATOR. Identifiers are not normalized""" + ... + + def normalize_break_path(self, path: str) -> Sequence[str]: + """Breaks path into sequence of identifiers""" + ...
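For reference, the reworked snake_case functions behave as follows; the expected values come from the updated tests near the end of this diff:

from dlt.common.normalizers.names.snake_case import (
    normalize_identifier, normalize_path, normalize_break_path, normalize_make_path
)

# special characters become "_", camelCase breaks into snake_case, leading digits are prefixed
assert normalize_identifier("event-.!:<>value") == "event_value"
assert normalize_identifier("123BaNaNa") == "_123_ba_na_na"
# runs of "_" collapse, so a single identifier can never contain the "__" path separator
assert normalize_identifier("event__value") == "event_value"
# paths split on "__", each component is normalized, then the path is rebuilt
assert normalize_break_path("issues__labels") == ["issues", "labels"]
assert normalize_path("Small__Love__Potion") == "small__love__potion"
assert normalize_make_path("issues", "labels") == "issues__labels"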
+ + def normalize_make_dataset_name(self, dataset_name: str, default_schema_name: str, schema_name: str) -> str: + """Builds full db dataset (dataset) name out of (normalized) default dataset and schema name""" + ... diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 16f0543f27..a3dcceaa4a 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -4,7 +4,7 @@ from dlt.common import json from dlt.common.typing import DictStrAny, StrAny, REPattern, SupportsVariant, VARIANT_FIELD_FORMAT -from dlt.common.normalizers.names import TNormalizeBreakPath, TNormalizeMakePath, TNormalizeNameFunc +from dlt.common.normalizers.names import NamingConvention from dlt.common.normalizers.json import TNormalizeJSONFunc from dlt.common.schema.typing import (SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, TNormalizersConfig, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TDataType, @@ -19,12 +19,7 @@ class Schema: ENGINE_VERSION: ClassVar[int] = SCHEMA_ENGINE_VERSION # name normalization functions - normalize_table_name: TNormalizeNameFunc - normalize_column_name: TNormalizeNameFunc - normalize_schema_name: TNormalizeNameFunc - normalize_make_dataset_name: TNormalizeMakePath - normalize_make_path: TNormalizeMakePath - normalize_break_path: TNormalizeBreakPath + naming: NamingConvention # json normalization function normalize_data_item: TNormalizeJSONFunc @@ -117,18 +112,18 @@ def _exclude(path: str, excludes: Sequence[REPattern], includes: Sequence[REPatt return is_excluded and not is_included # break table name in components - branch = self.normalize_break_path(table_name) + branch = self.naming.normalize_break_path(table_name) # check if any of the rows is excluded by rules in any of the tables for i in range(len(branch), 0, -1): # stop is exclusive in `range` # start at the top level table - c_t = self.normalize_make_path(*branch[:i]) + c_t = self.naming.normalize_make_path(*branch[:i]) excludes = self._compiled_excludes.get(c_t) # only if there's possibility to exclude, continue if excludes: includes = self._compiled_includes.get(c_t) or [] for field_name in list(row.keys()): - path = self.normalize_make_path(*branch[i:], field_name) + path = self.naming.normalize_make_path(*branch[i:], field_name) if _exclude(path, excludes, includes): # TODO: copy to new instance del row[field_name] # type: ignore @@ -228,14 +223,14 @@ def merge_hints(self, new_hints: Mapping[TColumnHint, Sequence[TSimpleRegex]]) - def normalize_table_identifiers(self, table: TTableSchema) -> TTableSchema: # normalize all identifiers in table according to name normalizer of the schema - table["name"] = self.normalize_table_name(table["name"]) + table["name"] = self.naming.normalize_path(table["name"]) parent = table.get("parent") if parent: - table["parent"] = self.normalize_table_name(parent) + table["parent"] = self.naming.normalize_path(parent) columns = table.get("columns") if columns: for c in columns.values(): - c["name"] = self.normalize_column_name(c["name"]) + c["name"] = self.naming.normalize_path(c["name"]) # re-index columns as the name changed table["columns"] = {c["name"]:c for c in columns.values()} return table @@ -353,7 +348,7 @@ def _coerce_non_null_value(self, table_columns: TTableSchemaColumns, table_name: # otherwise we must create variant extension to the table # pass final=True so no more auto-variants can be created recursively # TODO: generate callback so DLT
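There is nothing to inherit from the Protocol; it is satisfied structurally. `import_normalizers` (see `utils.py` below) still returns the imported module, and `Schema` stores that module as `self.naming`, so any module whose top-level functions match these signatures acts as a `NamingConvention`. A sketch; the `type: ignore` is needed only because `import_module` is typed as returning `ModuleType`:

from importlib import import_module
from dlt.common.normalizers.names import NamingConvention

naming: NamingConvention = import_module("dlt.common.normalizers.names.snake_case")  # type: ignore[assignment]
assert naming.PATH_SEPARATOR == "__"
assert naming.normalize_identifier("BANaNa") == "ba_na_na"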
user can decide what to do - variant_col_name = self.normalize_make_path(col_name, VARIANT_FIELD_FORMAT % py_type) + variant_col_name = self.naming.normalize_make_path(col_name, VARIANT_FIELD_FORMAT % py_type) return self._coerce_non_null_value(table_columns, table_name, variant_col_name, v, final=True) # if coerced value is variant, then extract variant value @@ -362,7 +357,7 @@ def _coerce_non_null_value(self, table_columns: TTableSchemaColumns, table_name: coerced_v = coerced_v() if isinstance(coerced_v, tuple): # variant recovered so call recursively with variant column name and variant value - variant_col_name = self.normalize_make_path(col_name, VARIANT_FIELD_FORMAT % coerced_v[0]) + variant_col_name = self.naming.normalize_make_path(col_name, VARIANT_FIELD_FORMAT % coerced_v[0]) return self._coerce_non_null_value(table_columns, table_name, variant_col_name, coerced_v[1], final=True) if not existing_column: @@ -408,12 +403,7 @@ def _configure_normalizers(self) -> None: # import desired modules naming_module, json_module = utils.import_normalizers(self._normalizers_config) # name normalization functions - self.normalize_table_name = naming_module.normalize_table_name - self.normalize_column_name = naming_module.normalize_column_name - self.normalize_schema_name = naming_module.normalize_schema_name - self.normalize_make_dataset_name = naming_module.normalize_make_dataset_name - self.normalize_make_path = naming_module.normalize_make_path - self.normalize_break_path = naming_module.normalize_break_path + self.naming = naming_module # data item normalization function self.normalize_data_item = json_module.normalize_data_item json_module.extend_schema(self) @@ -434,12 +424,7 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None, norma self._type_detections: Sequence[TTypeDetections] = None self._normalizers_config: TNormalizersConfig = normalizers - self.normalize_table_name: TNormalizeNameFunc = None - self.normalize_column_name: TNormalizeNameFunc = None - self.normalize_schema_name: TNormalizeNameFunc = None - self.normalize_make_dataset_name: TNormalizeMakePath = None - self.normalize_make_path: TNormalizeMakePath = None - self.normalize_break_path: TNormalizeBreakPath = None + self.naming = None # json normalization function self.normalize_data_item: TNormalizeJSONFunc = None @@ -470,7 +455,7 @@ def _from_stored_schema(self, stored_schema: TStoredSchema) -> None: self._compile_settings() def _set_schema_name(self, name: str, normalize_name: bool) -> None: - normalized_name = self.normalize_schema_name(name) + normalized_name = self.naming.normalize_identifier(name) if name != normalized_name: if normalize_name: name = normalized_name diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 7bf42f8c2f..9e77f3f9f0 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -15,7 +15,7 @@ from dlt.common.json._simplejson import custom_encode as json_custom_encode from dlt.common.arithmetics import InvalidOperation from dlt.common.exceptions import DictValidationException -from dlt.common.normalizers.names import TNormalizeNameFunc +from dlt.common.normalizers.names import NamingConvention from dlt.common.typing import DictStrAny, REPattern from dlt.common.time import parse_iso_like_datetime from dlt.common.utils import map_nested_in_place, str2bool @@ -129,13 +129,16 @@ def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: return False -def column_name_validator(normalize_func: TNormalizeNameFunc) -> 
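Note the deliberate switch to `naming.normalize_path` in `normalize_table_identifiers`: stored table and column names may already contain the `__` separator, and only the path-aware variant leaves them intact. The values below follow from the snake_case rules above:

from dlt.common.normalizers.names.snake_case import normalize_identifier, normalize_path

# plain identifier normalization would collapse the separator of an already-normalized name
assert normalize_identifier("user__login") == "user_login"
# path normalization respects "__" and is idempotent, which the new schema test below relies on
assert normalize_path("user__login") == "user__login"
assert normalize_path(normalize_path("User__Login")) == "user__login"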
TCustomValidator: +def column_name_validator(naming: NamingConvention) -> TCustomValidator: def validator(path: str, pk: str, pv: Any, t: Any) -> bool: if t is TColumnName: if not isinstance(pv, str): raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while str is expected", path, pk, pv) - if normalize_func(pv) != pv: + try: + if naming.normalize_path(pv) != pv: + raise DictValidationException(f"In {path}: field {pk}: {pv} is not a valid column name", path, pk, pv) + except ValueError: raise DictValidationException(f"In {path}: field {pk}: {pv} is not a valid column name", path, pk, pv) return True else: @@ -597,7 +600,7 @@ def default_normalizers() -> TNormalizersConfig: } -def import_normalizers(normalizers_config: TNormalizersConfig) -> Tuple[ModuleType, ModuleType]: +def import_normalizers(normalizers_config: TNormalizersConfig) -> Tuple[NamingConvention, ModuleType]: # TODO: type the modules with protocols naming_module = import_module(normalizers_config["names"]) json_module = import_module(normalizers_config["json"]["module"]) diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/bigquery/bigquery.py index 690dab9b5b..d9102f85a9 100644 --- a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/bigquery/bigquery.py @@ -93,7 +93,7 @@ class BigQueryClient(SqlJobClientBase): def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None: sql_client = BigQuerySqlClient( - schema.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), + schema.naming.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), config.credentials ) super().__init__(schema, config, sql_client) diff --git a/dlt/destinations/duckdb/duck.py b/dlt/destinations/duckdb/duck.py index 889e1b4645..45df44c63d 100644 --- a/dlt/destinations/duckdb/duck.py +++ b/dlt/destinations/duckdb/duck.py @@ -44,7 +44,7 @@ class DuckDbClient(InsertValuesJobClient): def __init__(self, schema: Schema, config: DuckDbClientConfiguration) -> None: sql_client = DuckDbSqlClient( - schema.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), + schema.naming.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), config.credentials ) super().__init__(schema, config, sql_client) diff --git a/dlt/destinations/postgres/postgres.py b/dlt/destinations/postgres/postgres.py index c237eface5..68484f4fe9 100644 --- a/dlt/destinations/postgres/postgres.py +++ b/dlt/destinations/postgres/postgres.py @@ -54,7 +54,7 @@ class PostgresClient(InsertValuesJobClient): def __init__(self, schema: Schema, config: PostgresClientConfiguration) -> None: sql_client = Psycopg2SqlClient( - schema.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), + schema.naming.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), config.credentials ) super().__init__(schema, config, sql_client) diff --git a/dlt/destinations/redshift/redshift.py b/dlt/destinations/redshift/redshift.py index 0550c5b76d..431b8909a5 100644 --- a/dlt/destinations/redshift/redshift.py +++ b/dlt/destinations/redshift/redshift.py @@ -74,7 +74,7 @@ class RedshiftClient(InsertValuesJobClient): def __init__(self, schema: Schema, config: RedshiftClientConfiguration) -> None: sql_client = RedshiftSqlClient ( - schema.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), + 
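The added `try`/`except ValueError` in `column_name_validator` handles paths whose components normalize to an empty identifier. Sketched with the snake_case functions:

from dlt.common.normalizers.names.snake_case import normalize_break_path, normalize_path

# two or three underscores still break into valid components ...
assert normalize_break_path("snake__case") == ["snake", "case"]
assert normalize_break_path("snake___case") == ["snake", "_case"]
# ... but four produce an empty component; normalizing it raises ValueError,
# which the validator converts into DictValidationException
assert normalize_break_path("snake____case") == ["snake", "", "case"]
try:
    normalize_path("snake____case")
except ValueError:
    print("empty path component rejected")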
schema.naming.normalize_make_dataset_name(config.dataset_name, config.default_schema_name, schema.name), config.credentials ) super().__init__(schema, config, sql_client) diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 75de3b6ea3..9204b23859 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -58,7 +58,7 @@ def _write_item(table_name: str, item: TDataItems) -> None: # note: normalize function should be cached so there's almost no penalty on frequent calling # note: column schema is not required for jsonl writer used here # event.pop(DLT_METADATA_FIELD, None) # type: ignore - storage.write_data_item(extract_id, schema.name, schema.normalize_table_name(table_name), item, None) + storage.write_data_item(extract_id, schema.name, schema.naming.normalize_identifier(table_name), item, None) def _write_dynamic_table(resource: DltResource, item: TDataItem) -> None: table_name = resource._table_name_hint_fun(item) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 1f0d7e3893..95d455a9d0 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -865,7 +865,7 @@ def _make_schema_with_default_name(self) -> Schema: return Schema(self.pipeline_name, normalize_name=True) def _validate_dataset_name(self, dataset_name: str) -> None: - normalized_name = self._default_naming.normalize_schema_name(dataset_name) + normalized_name = self._default_naming.normalize_identifier(dataset_name) if normalized_name != dataset_name: raise InvalidDatasetName(dataset_name, normalized_name) @@ -873,7 +873,7 @@ def _set_dataset_name(self, dataset_name: str) -> None: if not dataset_name: if not self.dataset_name: # set default dataset name from pipeline name - dataset_name = self._default_naming.normalize_schema_name(self.pipeline_name) + dataset_name = self._default_naming.normalize_identifier(self.pipeline_name) else: return diff --git a/pyproject.toml b/pyproject.toml index 22054baffb..13c7e38c8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "python-dlt" -version = "0.2.0a20" +version = "0.2.0a21" description = "DLT is an open-source python-native scalable data loading framework that does not require any devops efforts to run." authors = ["dltHub Inc. 
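All four destination clients now build their dataset name through `schema.naming.normalize_make_dataset_name` in exactly the same way. The suffixing branch is unchanged and therefore not visible in this diff, so the second case below is an assumption: the dataset name must already be normalized (otherwise `InvalidDatasetName` is raised), and a non-default schema name is presumably appended as a suffix to keep datasets apart:

from dlt.common.normalizers.names.snake_case import normalize_make_dataset_name

# default schema: the normalized dataset name is used as-is
assert normalize_make_dataset_name("github_data", "event", "event") == "github_data"
# non-default schema (assumed suffix rule): expected something like "github_data_aux"
# normalize_make_dataset_name("github_data", "event", "aux")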
"] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] diff --git a/tests/common/cases/schemas/github/issues.schema.json b/tests/common/cases/schemas/github/issues.schema.json new file mode 100644 index 0000000000..2760a20db0 --- /dev/null +++ b/tests/common/cases/schemas/github/issues.schema.json @@ -0,0 +1,1321 @@ +{ + "version": 2, + "version_hash": "IeCTkq8epwbjSy1O3jdkPPUkTPCt4hLj6RYo8uZ02JI=", + "engine_version": 5, + "name": "event", + "tables": { + "_dlt_version": { + "name": "_dlt_version", + "columns": { + "version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "version", + "data_type": "bigint", + "nullable": false + }, + "engine_version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "engine_version", + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "inserted_at", + "data_type": "timestamp", + "nullable": false + }, + "schema_name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "schema_name", + "data_type": "text", + "nullable": false + }, + "version_hash": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "version_hash", + "data_type": "text", + "nullable": false + }, + "schema": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "schema", + "data_type": "text", + "nullable": false + } + }, + "write_disposition": "skip", + "description": "Created by DLT. Tracks schema updates" + }, + "_dlt_loads": { + "name": "_dlt_loads", + "columns": { + "load_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "load_id", + "data_type": "text", + "nullable": false + }, + "schema_name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "schema_name", + "data_type": "text", + "nullable": true + }, + "status": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "status", + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "inserted_at", + "data_type": "timestamp", + "nullable": false + } + }, + "write_disposition": "skip", + "description": "Created by DLT. 
Tracks completed loads" + }, + "issues": { + "name": "issues", + "columns": { + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "url", + "data_type": "text", + "nullable": true + }, + "repository_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "repository_url", + "data_type": "text", + "nullable": true + }, + "labels_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "labels_url", + "data_type": "text", + "nullable": true + }, + "comments_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "comments_url", + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "events_url", + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "html_url", + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "id", + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "node_id", + "data_type": "text", + "nullable": true + }, + "number": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "number", + "data_type": "bigint", + "nullable": true + }, + "title": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "title", + "data_type": "text", + "nullable": true + }, + "user__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__login", + "data_type": "text", + "nullable": true + }, + "user__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__id", + "data_type": "bigint", + "nullable": true + }, + "user__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__node_id", + "data_type": "text", + "nullable": true + }, + "user__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__avatar_url", + "data_type": "text", + "nullable": true + }, + "user__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__gravatar_id", + "data_type": "text", + "nullable": true + }, + "user__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__url", + "data_type": "text", + "nullable": true + }, + "user__html_url": { + "partition": false, + 
"cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__html_url", + "data_type": "text", + "nullable": true + }, + "user__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__followers_url", + "data_type": "text", + "nullable": true + }, + "user__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__following_url", + "data_type": "text", + "nullable": true + }, + "user__gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__gists_url", + "data_type": "text", + "nullable": true + }, + "user__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__starred_url", + "data_type": "text", + "nullable": true + }, + "user__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__subscriptions_url", + "data_type": "text", + "nullable": true + }, + "user__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__organizations_url", + "data_type": "text", + "nullable": true + }, + "user__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__repos_url", + "data_type": "text", + "nullable": true + }, + "user__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__events_url", + "data_type": "text", + "nullable": true + }, + "user__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__received_events_url", + "data_type": "text", + "nullable": true + }, + "user__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__type", + "data_type": "text", + "nullable": true + }, + "user__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "user__site_admin", + "data_type": "bool", + "nullable": true + }, + "state": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "state", + "data_type": "text", + "nullable": true + }, + "locked": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "locked", + "data_type": "bool", + "nullable": true + }, + "assignee__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__login", + "data_type": "text", + "nullable": true + }, + "assignee__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__id", + "data_type": "bigint", + 
"nullable": true + }, + "assignee__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__node_id", + "data_type": "text", + "nullable": true + }, + "assignee__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__avatar_url", + "data_type": "text", + "nullable": true + }, + "assignee__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__gravatar_id", + "data_type": "text", + "nullable": true + }, + "assignee__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__url", + "data_type": "text", + "nullable": true + }, + "assignee__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__html_url", + "data_type": "text", + "nullable": true + }, + "assignee__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__followers_url", + "data_type": "text", + "nullable": true + }, + "assignee__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__following_url", + "data_type": "text", + "nullable": true + }, + "assignee__gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__gists_url", + "data_type": "text", + "nullable": true + }, + "assignee__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__starred_url", + "data_type": "text", + "nullable": true + }, + "assignee__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__subscriptions_url", + "data_type": "text", + "nullable": true + }, + "assignee__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__organizations_url", + "data_type": "text", + "nullable": true + }, + "assignee__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__repos_url", + "data_type": "text", + "nullable": true + }, + "assignee__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__events_url", + "data_type": "text", + "nullable": true + }, + "assignee__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__received_events_url", + "data_type": "text", + "nullable": true + }, + "assignee__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__type", + "data_type": "text", + 
"nullable": true + }, + "assignee__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "assignee__site_admin", + "data_type": "bool", + "nullable": true + }, + "comments": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "comments", + "data_type": "bigint", + "nullable": true + }, + "created_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "created_at", + "data_type": "timestamp", + "nullable": true + }, + "updated_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "updated_at", + "data_type": "timestamp", + "nullable": true + }, + "closed_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "closed_at", + "data_type": "timestamp", + "nullable": true + }, + "author_association": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "author_association", + "data_type": "text", + "nullable": true + }, + "body": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "body", + "data_type": "text", + "nullable": true + }, + "reactions__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions__url", + "data_type": "text", + "nullable": true + }, + "reactions__total_count": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions__total_count", + "data_type": "bigint", + "nullable": true + }, + "reactions___1": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions___1", + "data_type": "bigint", + "nullable": true + }, + "reactions__laugh": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions__laugh", + "data_type": "bigint", + "nullable": true + }, + "reactions__hooray": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions__hooray", + "data_type": "bigint", + "nullable": true + }, + "reactions__confused": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions__confused", + "data_type": "bigint", + "nullable": true + }, + "reactions__heart": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions__heart", + "data_type": "bigint", + "nullable": true + }, + "reactions__rocket": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "reactions__rocket", + "data_type": "bigint", + "nullable": true + }, + "reactions__eyes": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + 
"name": "reactions__eyes", + "data_type": "bigint", + "nullable": true + }, + "timeline_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "timeline_url", + "data_type": "text", + "nullable": true + }, + "state_reason": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "state_reason", + "data_type": "text", + "nullable": true + }, + "_dlt_load_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "_dlt_load_id", + "data_type": "text", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "_dlt_id", + "data_type": "text", + "nullable": false + }, + "draft": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "draft", + "data_type": "bool", + "nullable": true + }, + "pull_request__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "pull_request__url", + "data_type": "text", + "nullable": true + }, + "pull_request__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "pull_request__html_url", + "data_type": "text", + "nullable": true + }, + "pull_request__diff_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "pull_request__diff_url", + "data_type": "text", + "nullable": true + }, + "pull_request__patch_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "pull_request__patch_url", + "data_type": "text", + "nullable": true + }, + "pull_request__merged_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "pull_request__merged_at", + "data_type": "timestamp", + "nullable": true + } + }, + "write_disposition": "append" + }, + "issues__labels": { + "name": "issues__labels", + "columns": { + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "id", + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "node_id", + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "url", + "data_type": "text", + "nullable": true + }, + "name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "name", + "data_type": "text", + "nullable": true + }, + "color": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "color", + "data_type": "text", + "nullable": true + }, + "default": { + "partition": false, + "cluster": false, + "unique": false, + "sort": 
false, + "primary_key": false, + "foreign_key": false, + "name": "default", + "data_type": "bool", + "nullable": true + }, + "description": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "description", + "data_type": "text", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": true, + "name": "_dlt_parent_id", + "data_type": "text", + "nullable": false + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "_dlt_list_idx", + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "_dlt_id", + "data_type": "text", + "nullable": false + } + }, + "parent": "issues" + }, + "issues__assignees": { + "name": "issues__assignees", + "columns": { + "login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "login", + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "id", + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "node_id", + "data_type": "text", + "nullable": true + }, + "avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "avatar_url", + "data_type": "text", + "nullable": true + }, + "gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "gravatar_id", + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "url", + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "html_url", + "data_type": "text", + "nullable": true + }, + "followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "followers_url", + "data_type": "text", + "nullable": true + }, + "following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "following_url", + "data_type": "text", + "nullable": true + }, + "gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "gists_url", + "data_type": "text", + "nullable": true + }, + "starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "starred_url", + "data_type": "text", + "nullable": true + }, + "subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, 
+ "foreign_key": false, + "name": "subscriptions_url", + "data_type": "text", + "nullable": true + }, + "organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "organizations_url", + "data_type": "text", + "nullable": true + }, + "repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "repos_url", + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "events_url", + "data_type": "text", + "nullable": true + }, + "received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "received_events_url", + "data_type": "text", + "nullable": true + }, + "type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "type", + "data_type": "text", + "nullable": true + }, + "site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "site_admin", + "data_type": "bool", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": true, + "name": "_dlt_parent_id", + "data_type": "text", + "nullable": false + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "_dlt_list_idx", + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "foreign_key": false, + "name": "_dlt_id", + "data_type": "text", + "nullable": false + } + }, + "parent": "issues" + } + }, + "settings": { + "detections": [ + "timestamp", + "iso_timestamp" + ], + "default_hints": { + "not_null": [ + "_dlt_id", + "_dlt_root_id", + "_dlt_parent_id", + "_dlt_list_idx", + "_dlt_load_id" + ], + "foreign_key": [ + "_dlt_parent_id" + ], + "unique": [ + "_dlt_id" + ] + } + }, + "normalizers": { + "names": "dlt.common.normalizers.names.snake_case", + "json": { + "module": "dlt.common.normalizers.json.relational" + } + } +} \ No newline at end of file diff --git a/tests/common/normalizers/test_names_snake_case.py b/tests/common/normalizers/test_names_snake_case.py index 21421134d5..aa112a909f 100644 --- a/tests/common/normalizers/test_names_snake_case.py +++ b/tests/common/normalizers/test_names_snake_case.py @@ -1,33 +1,33 @@ import pytest -from dlt.common.normalizers.names.snake_case import normalize_column_name, normalize_table_name, normalize_make_dataset_name, RE_NON_ALPHANUMERIC +from dlt.common.normalizers.names.snake_case import normalize_identifier, normalize_path, normalize_make_dataset_name, RE_NON_ALPHANUMERIC from dlt.common.schema.exceptions import InvalidDatasetName -def test_normalize_column_name() -> None: - assert normalize_column_name("event_value") == "event_value" - assert normalize_column_name("event value") == "event_value" - assert normalize_column_name("event-.!:<>value") == "event_value" +def test_normalize_identifier() -> None: + assert normalize_identifier("event_value") == "event_value" + assert 
normalize_identifier("event value") == "event_value" + assert normalize_identifier("event-.!:<>value") == "event_value" # prefix leading digits - assert normalize_column_name("1event_n'") == "_1event_n_" - assert normalize_column_name("123event_n'") == "_123event_n_" + assert normalize_identifier("1event_n'") == "_1event_n_" + assert normalize_identifier("123event_n'") == "_123event_n_" # all lowercase and converted to snake - assert normalize_column_name("123BaNaNa") == "_123_ba_na_na" + assert normalize_identifier("123BaNaNa") == "_123_ba_na_na" # consecutive capital letters - assert normalize_column_name("BANANA") == "banana" - assert normalize_column_name("BAN_ANA") == "ban_ana" - assert normalize_column_name("BANaNA") == "ba_na_na" + assert normalize_identifier("BANANA") == "banana" + assert normalize_identifier("BAN_ANA") == "ban_ana" + assert normalize_identifier("BANaNA") == "ba_na_na" # handling spaces - assert normalize_column_name("Small Love Potion") == "small_love_potion" + assert normalize_identifier("Small Love Potion") == "small_love_potion" -def test_normalize_table_name() -> None: - assert normalize_table_name("small_love_potion") == "small_love_potion" - assert normalize_table_name("small__love__potion") == "small__love__potion" - assert normalize_table_name("Small_Love_Potion") == "small_love_potion" - assert normalize_table_name("Small__Love__Potion") == "small__love__potion" - assert normalize_table_name("Small Love Potion") == "small_love_potion" - assert normalize_table_name("Small Love Potion") == "small_love_potion" +def test_normalize_path() -> None: + assert normalize_path("small_love_potion") == "small_love_potion" + assert normalize_path("small__love__potion") == "small__love__potion" + assert normalize_path("Small_Love_Potion") == "small_love_potion" + assert normalize_path("Small__Love__Potion") == "small__love__potion" + assert normalize_path("Small Love Potion") == "small_love_potion" + assert normalize_path("Small Love Potion") == "small_love_potion" def test_normalize_non_alpha_single_underscore() -> None: @@ -37,8 +37,8 @@ def test_normalize_non_alpha_single_underscore() -> None: def test_normalizes_underscores() -> None: - assert normalize_column_name("event__value_value2____") == "event_value_value2_" - assert normalize_table_name("e_vent__value_value2____") == "e_vent__value_value2__" + assert normalize_identifier("event__value_value2____") == "event_value_value2_" + assert normalize_path("e_vent__value_value2___") == "e_vent__value_value2___" def test_normalize_make_dataset_name() -> None: diff --git a/tests/common/schema/custom_normalizers.py b/tests/common/schema/custom_normalizers.py index a69b22df1f..1bb1a17da4 100644 --- a/tests/common/schema/custom_normalizers.py +++ b/tests/common/schema/custom_normalizers.py @@ -4,18 +4,16 @@ from dlt.common.typing import TDataItem -def normalize_table_name(name: str) -> str: - return name.capitalize() +def normalize_path(path: str) -> str: + return normalize_make_path(*map(normalize_identifier, normalize_break_path(path))) -def normalize_column_name(name: str) -> str: +def normalize_identifier(name: str) -> str: + if name.startswith("column_"): + return name return "column_" + name.lower() -def normalize_schema_name(name: str) -> str: - return name.lower() - - def extend_schema(schema: Schema) -> None: json_config = schema._normalizers_config["json"]["config"] d_h = schema._settings.setdefault("default_hints", {}) diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 
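The `custom_normalizers` test module above doubles as a template for user-defined conventions: any importable module with these top-level functions satisfies `NamingConvention` structurally. A hypothetical minimal example (`my_project.upper_naming` is an invented module path; `normalize_make_dataset_name` is omitted for brevity):

# my_project/upper_naming.py
from typing import Any, Sequence

PATH_SEPARATOR = "__"


def normalize_identifier(name: str) -> str:
    # reject empty names, as the built-in snake_case convention does
    if not name:
        raise ValueError(name)
    return name.strip().upper()


def normalize_make_path(*identifiers: Any) -> str:
    return PATH_SEPARATOR.join(identifiers)


def normalize_break_path(path: str) -> Sequence[str]:
    return path.split(PATH_SEPARATOR)


def normalize_path(path: str) -> str:
    return normalize_make_path(*map(normalize_identifier, normalize_break_path(path)))

It would be referenced from a schema's normalizers config just like the fixture above: {"names": "my_project.upper_naming", "json": {"module": "dlt.common.normalizers.json.relational"}}.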
71ef9a4366..3cecbd01c6 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -1,3 +1,4 @@ +from copy import deepcopy import os from typing import List, Sequence import pytest @@ -38,7 +39,7 @@ def schema() -> Schema: @pytest.fixture def cn_schema() -> Schema: - return Schema("default", { + return Schema("column_default", { "names": "tests.common.schema.custom_normalizers", "json": { "module": "tests.common.schema.custom_normalizers", @@ -50,13 +51,13 @@ def cn_schema() -> Schema: def test_normalize_schema_name(schema: Schema) -> None: - assert schema.normalize_schema_name("BAN_ANA") == "ban_ana" - assert schema.normalize_schema_name("event-.!:value") == "event_value" - assert schema.normalize_schema_name("123event-.!:value") == "_123event_value" + assert schema.naming.normalize_identifier("BAN_ANA") == "ban_ana" + assert schema.naming.normalize_identifier("event-.!:value") == "event_value" + assert schema.naming.normalize_identifier("123event-.!:value") == "_123event_value" with pytest.raises(ValueError): - assert schema.normalize_schema_name("") + assert schema.naming.normalize_identifier("") with pytest.raises(ValueError): - schema.normalize_schema_name(None) + schema.naming.normalize_identifier(None) def test_new_schema(schema: Schema) -> None: @@ -99,16 +100,25 @@ def test_load_corrupted_schema() -> None: def test_column_name_validator(schema: Schema) -> None: - assert utils.column_name_validator(schema.normalize_column_name)(".", "k", "v", str) is False - assert utils.column_name_validator(schema.normalize_column_name)(".", "k", "v", TColumnName) is True - - assert utils.column_name_validator(schema.normalize_column_name)(".", "k", "snake_case", TColumnName) is True + assert utils.column_name_validator(schema.naming)(".", "k", "v", str) is False + assert utils.column_name_validator(schema.naming)(".", "k", "v", TColumnName) is True + + assert utils.column_name_validator(schema.naming)(".", "k", "snake_case", TColumnName) is True + # double underscores are accepted + assert utils.column_name_validator(schema.naming)(".", "k", "snake__case", TColumnName) is True + # triple underscores are accepted + assert utils.column_name_validator(schema.naming)(".", "k", "snake___case", TColumnName) is True + # quadruple underscores generate empty identifier + with pytest.raises(DictValidationException) as e: + utils.column_name_validator(schema.naming)(".", "k", "snake____case", TColumnName) + assert "not a valid column name" in str(e.value) + # this name is invalid with pytest.raises(DictValidationException) as e: - utils.column_name_validator(schema.normalize_column_name)(".", "k", "1snake_case", TColumnName) + utils.column_name_validator(schema.naming)(".", "k", "1snake_case", TColumnName) assert "not a valid column name" in str(e.value) # expected str as base type with pytest.raises(DictValidationException): - utils.column_name_validator(schema.normalize_column_name)(".", "k", 1, TColumnName) + utils.column_name_validator(schema.naming)(".", "k", 1, TColumnName) def test_invalid_schema_name() -> None: @@ -188,7 +198,7 @@ def test_save_store_schema(schema: Schema, schema_storage: SchemaStorage) -> Non def test_save_store_schema_custom_normalizers(cn_schema: Schema, schema_storage: SchemaStorage) -> None: schema_storage.save_schema(cn_schema) - schema_copy = schema_storage.load_schema("default") + schema_copy = schema_storage.load_schema(cn_schema.name) assert_new_schema_values_custom_normalizers(schema_copy) @@ -416,6 +426,17 @@ def test_compare_columns() -> 
None: assert utils.compare_column(table["columns"]["col3"], table["columns"]["col4"]) is True +def test_normalize_table_identifiers() -> None: + schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") + schema = Schema.from_dict(schema_dict) + # assert column generated from "reactions/+1" and "-1", it is a valid identifier even with three underscores + assert "reactions___1" in schema._schema_tables["issues"]["columns"] + issues_table = deepcopy(schema._schema_tables["issues"]) + # this schema is already normalized so normalization is idempotent + assert schema._schema_tables["issues"] == schema.normalize_table_identifiers(issues_table) + assert schema._schema_tables["issues"] == schema.normalize_table_identifiers(schema.normalize_table_identifiers(issues_table)) + + def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: # check normalizers config assert schema._normalizers_config["names"] == "tests.common.schema.custom_normalizers" @@ -423,12 +444,12 @@ def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: # check if schema was extended by json normalizer assert ["fake_id"] == schema.settings["default_hints"]["not_null"] # call normalizers - assert schema.normalize_column_name("a") == "column_a" - assert schema.normalize_table_name("a__b") == "A__b" - assert schema.normalize_schema_name("1A_b") == "1a_b" + assert schema.naming.normalize_identifier("a") == "column_a" + assert schema.naming.normalize_path("a__b") == "column_a__column_b" + assert schema.naming.normalize_identifier("1A_b") == "column_1a_b" # assumes elements are normalized - assert schema.normalize_make_path("A", "B", "!C") == "A__B__!C" - assert schema.normalize_break_path("A__B__!C") == ["A", "B", "!C"] + assert schema.naming.normalize_make_path("A", "B", "!C") == "A__B__!C" + assert schema.naming.normalize_break_path("A__B__!C") == ["A", "B", "!C"] row = list(schema.normalize_data_item(schema, {"bool": True}, "load_id", "a_table")) assert row[0] == (("a_table", None), {"bool": True}) @@ -448,12 +469,13 @@ def assert_new_schema_values(schema: Schema) -> None: # check if schema was extended by json normalizer assert set(["_dlt_id", "_dlt_root_id", "_dlt_parent_id", "_dlt_list_idx", "_dlt_load_id"]).issubset(schema.settings["default_hints"]["not_null"]) # call normalizers - assert schema.normalize_column_name("A") == "a" - assert schema.normalize_table_name("A__B") == "a__b" - assert schema.normalize_schema_name("1A_b") == "_1_a_b" + assert schema.naming.normalize_identifier("A") == "a" + assert schema.naming.normalize_path("A__B") == "a__b" + assert schema.naming.normalize_identifier("1A_b") == "_1_a_b" # assumes elements are normalized - assert schema.normalize_make_path("A", "B", "!C") == "A__B__!C" - assert schema.normalize_break_path("A__B__!C") == ["A", "B", "!C"] + assert schema.naming.normalize_make_path("A", "B", "!C") == "A__B__!C" + assert schema.naming.normalize_break_path("A__B__!C") == ["A", "B", "!C"] + assert schema.naming.normalize_break_path("reactions___1") == ["reactions", "_1"] schema.normalize_data_item(schema, {}, "load_id", schema.name) # check default tables tables = schema.tables diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 6e4cb42545..2fb0188d74 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -233,10 +233,10 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No def 
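The `reactions___1` assertions pin down a subtle case: GitHub's `+1`/`-1` reaction keys normalize to `_1`, so the flattened column carries three consecutive underscores and still round-trips as a two-component path:

from dlt.common.normalizers.names.snake_case import normalize_identifier, normalize_make_path, normalize_break_path

assert normalize_identifier("+1") == "_1"
assert normalize_make_path("reactions", "_1") == "reactions___1"
assert normalize_break_path("reactions___1") == ["reactions", "_1"]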
diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 6e4cb42545..2fb0188d74 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -233,10 +233,10 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No def test_save_store_schema(storage: SchemaStorage) -> None: d_n = default_normalizers() d_n["names"] = "tests.common.schema.custom_normalizers" - schema = Schema("event", normalizers=d_n) + schema = Schema("column_event", normalizers=d_n) storage.save_schema(schema) - assert storage.storage.has_file(SchemaStorage.NAMED_SCHEMA_FILE_PATTERN % ("event", "json")) - loaded_schema = storage.load_schema("event") + assert storage.storage.has_file(SchemaStorage.NAMED_SCHEMA_FILE_PATTERN % ("column_event", "json")) + loaded_schema = storage.load_schema("column_event") assert loaded_schema.to_dict()["tables"]["_dlt_loads"] == schema.to_dict()["tables"]["_dlt_loads"] assert loaded_schema.to_dict() == schema.to_dict() diff --git a/tests/extract/test_extract_pipe.py b/tests/extract/test_extract_pipe.py index d1884a3add..4cd5ed2213 100644 --- a/tests/extract/test_extract_pipe.py +++ b/tests/extract/test_extract_pipe.py @@ -9,7 +9,7 @@ from dlt.common import sleep from dlt.common.typing import TDataItems from dlt.extract.typing import DataItemWithMeta -from dlt.extract.pipe import FilterItem, Pipe, PipeItem, PipeIterator +from dlt.extract.pipe import FilterItem, ManagedPipeIterator, Pipe, PipeItem, PipeIterator # from tests.utils import preserve_environ @@ -272,19 +272,109 @@ def test_pipe_copy_on_fork() -> None: assert elems[0].item is not elems[1].item -@pytest.mark.skip("Not implemented") -def test_async_pipe_exception() -> None: - pass +close_pipe_got_exit = False +close_pipe_yielding = False -@pytest.mark.skip("Not implemented") -def test_thread_pipe_exception() -> None: - pass +def test_close_on_async_exception() -> None: + def long_gen(): + global close_pipe_got_exit, close_pipe_yielding + async def _next_item(p: int) -> int: + return p -@pytest.mark.skip("Not implemented") -def test_sync_pipe_exception() -> None: - pass + # will be closed by PipeIterator + try: + close_pipe_yielding = True + for i in range(0, 10000): + yield _next_item(i) + close_pipe_yielding = False + except GeneratorExit: + close_pipe_got_exit = True + + # async transform, raises while being awaited + async def raise_gen(item: int): + if item == 10: + raise RuntimeError("we fail") + return item + + assert_pipes_closed(raise_gen, long_gen) + + +def test_close_on_thread_pool_exception() -> None: + def long_gen(): + global close_pipe_got_exit, close_pipe_yielding + + @dlt.defer + def _next_item(p: int) -> int: + return p + + # will be closed by PipeIterator + try: + close_pipe_yielding = True + for i in range(0, 10000): + yield _next_item(i) + close_pipe_yielding = False + except GeneratorExit: + close_pipe_got_exit = True + + # execute in a thread + @dlt.defer + def raise_gen(item: int): + if item == 10: + raise RuntimeError("we fail") + return item + + assert_pipes_closed(raise_gen, long_gen) + + +def test_close_on_sync_exception() -> None: + + def long_gen(): + global close_pipe_got_exit, close_pipe_yielding + + # will be closed by PipeIterator + try: + close_pipe_yielding = True + yield from range(0, 10000) + close_pipe_yielding = False + except GeneratorExit: + close_pipe_got_exit = True + + def raise_gen(item: int): + if item == 10: + raise RuntimeError("we fail") + yield item + + assert_pipes_closed(raise_gen, long_gen) + + +def assert_pipes_closed(raise_gen, long_gen) -> None: + global close_pipe_got_exit, close_pipe_yielding + + close_pipe_got_exit = False + close_pipe_yielding = False + + print("START PIPE") + pit: PipeIterator = None + with PipeIterator.from_pipe(Pipe.from_data("failing", raise_gen, parent=Pipe.from_data("endless", long_gen())))
as pit: + with pytest.raises(RuntimeError): + list(pit) + # it got closed + assert pit._sources == [] + assert close_pipe_got_exit is True + # while long gen was still yielding + assert close_pipe_yielding is True + + close_pipe_got_exit = False + close_pipe_yielding = False + pit = ManagedPipeIterator.from_pipe(Pipe.from_data("failing", raise_gen, parent=Pipe.from_data("endless", long_gen()))) + with pytest.raises(RuntimeError): + list(pit) + assert pit._sources == [] + assert close_pipe_got_exit is True + # while long gen was still yielding + assert close_pipe_yielding is True def _f_items(pipe_items: Sequence[PipeItem]) -> List[TDataItems]: diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index bfb596fc7e..f6389aa302 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -335,7 +335,7 @@ def test_write_dispositions(client: SqlJobClientBase, write_disposition: str, fi client.schema.update_schema( new_table(table_name, write_disposition=write_disposition, columns=TABLE_UPDATE) ) - child_table = client.schema.normalize_make_path(table_name, "child") + child_table = client.schema.naming.normalize_make_path(table_name, "child") # add child table without write disposition so it will be inferred from the parent client.schema.update_schema( new_table(child_table, columns=TABLE_UPDATE, parent_table_name=table_name) diff --git a/tests/normalize/cases/github.issues.load_page_5_duck.json b/tests/normalize/cases/github.issues.load_page_5_duck.json new file mode 100644 index 0000000000..9afbb8c042 --- /dev/null +++ b/tests/normalize/cases/github.issues.load_page_5_duck.json @@ -0,0 +1,8061 @@ +[ + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/71", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/71/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/71/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/71/events", + "html_url": "https://github.com/duckdb/duckdb/issues/71", + "id": 388089021, + "node_id": "MDU6SXNzdWUzODgwODkwMjE=", + "number": 71, + "title": "Timestamp type + functions", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002067, + "node_id": "MDU6TGFiZWw5NzYwMDIwNjc=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/good%20first%20issue", + "name": "good first issue", + "color": "7057ff", + "default": true, + "description": "Good for newcomers" + }, + { + "id": 1153696619, + "node_id": 
"MDU6TGFiZWwxMTUzNjk2NjE5", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Type", + "name": "Type", + "color": "f9d0c4", + "default": false, + "description": "Deals with the type system" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "diegomestre2", + "id": 1593103, + "node_id": "MDQ6VXNlcjE1OTMxMDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1593103?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/diegomestre2", + "html_url": "https://github.com/diegomestre2", + "followers_url": "https://api.github.com/users/diegomestre2/followers", + "following_url": "https://api.github.com/users/diegomestre2/following{/other_user}", + "gists_url": "https://api.github.com/users/diegomestre2/gists{/gist_id}", + "starred_url": "https://api.github.com/users/diegomestre2/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/diegomestre2/subscriptions", + "organizations_url": "https://api.github.com/users/diegomestre2/orgs", + "repos_url": "https://api.github.com/users/diegomestre2/repos", + "events_url": "https://api.github.com/users/diegomestre2/events{/privacy}", + "received_events_url": "https://api.github.com/users/diegomestre2/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "diegomestre2", + "id": 1593103, + "node_id": "MDQ6VXNlcjE1OTMxMDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1593103?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/diegomestre2", + "html_url": "https://github.com/diegomestre2", + "followers_url": "https://api.github.com/users/diegomestre2/followers", + "following_url": "https://api.github.com/users/diegomestre2/following{/other_user}", + "gists_url": "https://api.github.com/users/diegomestre2/gists{/gist_id}", + "starred_url": "https://api.github.com/users/diegomestre2/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/diegomestre2/subscriptions", + "organizations_url": "https://api.github.com/users/diegomestre2/orgs", + "repos_url": "https://api.github.com/users/diegomestre2/repos", + "events_url": "https://api.github.com/users/diegomestre2/events{/privacy}", + "received_events_url": "https://api.github.com/users/diegomestre2/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 8, + "created_at": "2018-12-06T07:29:53Z", + "updated_at": "2021-04-21T07:24:41Z", + "closed_at": "2019-10-23T13:05:30Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "For the DATE functions I just implemented them from the Postgres list of functions. 
The same should be done for TIMESTAMPs.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/71/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/71/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/46", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/46/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/46/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/46/events", + "html_url": "https://github.com/duckdb/duckdb/issues/46", + "id": 387724689, + "node_id": "MDU6SXNzdWUzODc3MjQ2ODk=", + "number": 46, + "title": "Support for FOREIGN KEY", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 1153697062, + "node_id": "MDU6TGFiZWwxMTUzNjk3MDYy", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Catalog", + "name": "Catalog", + "color": "77d664", + "default": false, + "description": "Deals with the catalog management" + }, + { + "id": 3551237865, + "node_id": "LA_kwDOCEU65s7Tq5bp", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/feature", + "name": "feature", + "color": "3FD3C4", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 8, + "created_at": "2018-12-05T12:00:23Z", + "updated_at": "2022-03-18T07:26:35Z", + "closed_at": "2022-03-18T07:26:35Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/46/reactions", + "total_count": 2, + "+1": 2, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/46/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/10", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/10/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/10/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/10/events", + 
"html_url": "https://github.com/duckdb/duckdb/issues/10", + "id": 350377010, + "node_id": "MDU6SXNzdWUzNTAzNzcwMTA=", + "number": 10, + "title": "TPC-H Working", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 8, + "created_at": "2018-08-14T10:51:35Z", + "updated_at": "2018-11-21T22:26:44Z", + "closed_at": "2018-11-21T22:26:44Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/10/reactions", + 
"total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/10/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6349", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6349/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/6349/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6349/events", + "html_url": "https://github.com/duckdb/duckdb/issues/6349", + "id": 1589061034, + "node_id": "I_kwDOCEU65s5etyWq", + "number": 6349, + "title": "Julia: No TimeZone config", + "user": { + "login": "xcaptain", + "id": 4054836, + "node_id": "MDQ6VXNlcjQwNTQ4MzY=", + "avatar_url": "https://avatars.githubusercontent.com/u/4054836?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/xcaptain", + "html_url": "https://github.com/xcaptain", + "followers_url": "https://api.github.com/users/xcaptain/followers", + "following_url": "https://api.github.com/users/xcaptain/following{/other_user}", + "gists_url": "https://api.github.com/users/xcaptain/gists{/gist_id}", + "starred_url": "https://api.github.com/users/xcaptain/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/xcaptain/subscriptions", + "organizations_url": "https://api.github.com/users/xcaptain/orgs", + "repos_url": "https://api.github.com/users/xcaptain/repos", + "events_url": "https://api.github.com/users/xcaptain/events{/privacy}", + "received_events_url": "https://api.github.com/users/xcaptain/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-17T09:52:12Z", + "updated_at": "2023-02-21T02:13:39Z", + "closed_at": "2023-02-21T02:13:39Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "### What happens?\n\nI found that the juliapkg doesn't contains a TimeZone config, which means when using julia, the timezone is always `UTC` and can't change to another one.\r\n\r\nMy test code:\r\n```julia\r\nusing DBInterface\r\nusing DuckDB\r\n\r\ncon = DBInterface.connect(DuckDB.DB, \":memory:\")\r\nDBInterface.execute(con, \"select current_timestamp;\")\r\n```\r\n\r\ngot\r\n\r\n```plain\r\n1×1 DataFrame\r\n Row │ get_current_timestamp() \r\n │ DateTime?\r\n─────┼─────────────────────────\r\n 1 │ 2023-02-17T09:39:07.618\r\n```\r\n\r\nwhile using CLI, execute the query `select current_timestamp` I got\r\n\r\n```plain\r\nv0.7.0 f7827396d7\r\nEnter \".help\" for usage hints.\r\nD select current_timestamp;\r\n┌────────────────────────────┐\r\n│ get_current_timestamp() │\r\n│ timestamp with time zone │\r\n├────────────────────────────┤\r\n│ 2023-02-17 17:34:31.361+08 │\r\n└────────────────────────────┘\r\n```\n\n### To Reproduce\n\nI tried to set timezone for the current connection session using the following code:\r\n\r\n```julia\r\nconfig = DuckDB.Config()\r\nconfig = DuckDB.set_config(config, \"TimeZone\", \"Asia/Shanghai\")\r\n```\r\n\r\ngot an error:\r\n```\r\nERROR: Unrecognized configuration option \"TimeZone\"\r\nStacktrace:\r\n [1] set_config(config::DuckDB.Config, name::String, option::String)\r\n @ DuckDB 
C:\\Users\\capta\\.julia\\packages\\DuckDB\\B2yPk\\src\\config.jl:27\r\n [2] top-level scope\r\n @ REPL[32]:1\r\n```\n\n### OS:\n\nWindows\n\n### DuckDB Version:\n\nv0.7.0 f7827396d7\n\n### DuckDB Client:\n\njulia\n\n### Full Name:\n\nJoey\n\n### Affiliation:\n\nSelf-Employed\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6349/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6349/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6344", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6344/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/6344/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6344/events", + "html_url": "https://github.com/duckdb/duckdb/issues/6344", + "id": 1588435488, + "node_id": "I_kwDOCEU65s5erZog", + "number": 6344, + "title": "ODBC: SSIS cannot read VARCHAR", + "user": { + "login": "sdmcallister", + "id": 16430997, + "node_id": "MDQ6VXNlcjE2NDMwOTk3", + "avatar_url": "https://avatars.githubusercontent.com/u/16430997?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/sdmcallister", + "html_url": "https://github.com/sdmcallister", + "followers_url": "https://api.github.com/users/sdmcallister/followers", + "following_url": "https://api.github.com/users/sdmcallister/following{/other_user}", + "gists_url": "https://api.github.com/users/sdmcallister/gists{/gist_id}", + "starred_url": "https://api.github.com/users/sdmcallister/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/sdmcallister/subscriptions", + "organizations_url": "https://api.github.com/users/sdmcallister/orgs", + "repos_url": "https://api.github.com/users/sdmcallister/repos", + "events_url": "https://api.github.com/users/sdmcallister/events{/privacy}", + "received_events_url": "https://api.github.com/users/sdmcallister/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-16T22:06:29Z", + "updated_at": "2023-02-21T10:10:32Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nUisng 64 bit driver. \r\n\r\nSSIS can read numeric fields like INT, BIGINT, DOUBLE, but fails when trying to read VARCHAR with the following error:\r\n\r\nArithmetic operation resulted in an overflow.\r\n\r\n\r\n\r\n\r\n### To Reproduce\r\n\r\nLoad NYC Taxi Data from here:\r\nhttps://www.kaggle.com/datasets/elemento/nyc-yellow-taxi-trip-data?resource=download\r\n\r\nCREATE TABLE taxi AS SELECT * FROM \r\n\r\nIN SSIS create the connection. 
\r\n\r\nCreate a query with this command:\r\n\r\nselect store_and_fwd_flag FROM taxi LIMIT 4;\r\n\r\nThis also does results in same error:\r\n\r\nselect CAST(store_and_fwd_flag AS VARCHAR(50)) FROM taxi LIMIT 4;\r\n\r\n### OS:\r\n\r\nWindows 10\r\n\r\n### DuckDB Version:\r\n\r\n0.7\r\n\r\n### DuckDB Client:\r\n\r\nODBC\r\n\r\n### Full Name:\r\n\r\nSteve M\r\n\r\n### Affiliation:\r\n\r\nNone\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6344/reactions", + "total_count": 1, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6344/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6315", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6315/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/6315/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6315/events", + "html_url": "https://github.com/duckdb/duckdb/issues/6315", + "id": 1587029053, + "node_id": "I_kwDOCEU65s5emCQ9", + "number": 6315, + "title": "python: segfault when accessing `description` after fetching rows", + "user": { + "login": "cldellow", + "id": 193185, + "node_id": "MDQ6VXNlcjE5MzE4NQ==", + "avatar_url": "https://avatars.githubusercontent.com/u/193185?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/cldellow", + "html_url": "https://github.com/cldellow", + "followers_url": "https://api.github.com/users/cldellow/followers", + "following_url": "https://api.github.com/users/cldellow/following{/other_user}", + "gists_url": "https://api.github.com/users/cldellow/gists{/gist_id}", + "starred_url": "https://api.github.com/users/cldellow/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/cldellow/subscriptions", + "organizations_url": "https://api.github.com/users/cldellow/orgs", + "repos_url": "https://api.github.com/users/cldellow/repos", + "events_url": "https://api.github.com/users/cldellow/events{/privacy}", + "received_events_url": "https://api.github.com/users/cldellow/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + 
"received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-16T05:11:01Z", + "updated_at": "2023-02-16T23:38:14Z", + "closed_at": "2023-02-16T23:38:14Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nPython code that worked in 0.6.1 segfaults in 0.7.0.\n\n### To Reproduce\n\n```\r\npython3 -mvenv venv\r\n. venv/bin/activate\r\npip install duckdb==0.7.0\r\npython3\r\n>>> import duckdb\r\n>>> c = duckdb.connect()\r\n>>> rv = c.execute(\"select * from sqlite_master where type = 'table'\")\r\n>>> rv.fetchall()\r\n[]\r\n>>> rv.description\r\nSegmentation fault (core dumped)\r\n```\r\n\r\nContrasting with 0.6.1:\r\n\r\n```\r\n>>> import duckdb\r\n>>> c = duckdb.connect()\r\n>>> rv = c.execute(\"select * from sqlite_master where type = 'table'\")\r\n>>> rv.fetchall()\r\n[]\r\n>>> rv.description\r\n[('type', 'STRING', None, None, None, None, None), ('name', 'STRING', None, None, None, None, None), ('tbl_name', 'STRING', None, None, None, None, None), ('rootpage', 'NUMBER', None, None, None, None, None), ('sql', 'STRING', None, None, None, None, None)]\r\n```\n\n### OS:\n\nUbuntu 20.04.3, x64\n\n### DuckDB Version:\n\n0.7.0\n\n### DuckDB Client:\n\nPython\n\n### Full Name:\n\nColin Dellow\n\n### Affiliation:\n\nHobbyist\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6315/reactions", + "total_count": 1, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 1, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6315/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6272", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6272/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/6272/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6272/events", + "html_url": "https://github.com/duckdb/duckdb/issues/6272", + "id": 1583917110, + "node_id": "I_kwDOCEU65s5eaKg2", + "number": 6272, + "title": "percent_rank bugs", + "user": { + "login": "rydeng", + "id": 9131195, + "node_id": "MDQ6VXNlcjkxMzExOTU=", + "avatar_url": "https://avatars.githubusercontent.com/u/9131195?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/rydeng", + "html_url": "https://github.com/rydeng", + "followers_url": "https://api.github.com/users/rydeng/followers", + "following_url": "https://api.github.com/users/rydeng/following{/other_user}", + "gists_url": "https://api.github.com/users/rydeng/gists{/gist_id}", + "starred_url": "https://api.github.com/users/rydeng/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/rydeng/subscriptions", + "organizations_url": "https://api.github.com/users/rydeng/orgs", + "repos_url": "https://api.github.com/users/rydeng/repos", + "events_url": "https://api.github.com/users/rydeng/events{/privacy}", + "received_events_url": "https://api.github.com/users/rydeng/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "open", + "locked": false, + "assignee": { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + 
"followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-14T10:40:34Z", + "updated_at": "2023-02-18T21:45:00Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\npercent_rank sometimes generate duplicate rows\r\n\r\n### To Reproduce\r\n```python\r\nimport pandas as pd\r\nimport numpy as np\r\nids = range(0, 3000)\r\ndates = pd.date_range(\"2017-01-01\", \"2020-12-31\").date\r\ndf = pd.concat([pd.DataFrame(\r\n {\r\n \"id\": [i] * len(dates),\r\n \"date\": dates,\r\n \"v1\": np.random.random(len(dates)),\r\n \"v2\": np.random.random(len(dates)),\r\n \"v3\": np.random.random(len(dates)),\r\n }) for i in ids])\r\nimport duckdb\r\nfor i in range(0, 10000):\r\n con = duckdb.connect()\r\n test = con.query(\"select date, id, v1, v2, percent_rank() over (partition by date order by v1) as rank_v1, v3 from df order by id, date;\").df()\r\n if len(test) != len(df):\r\n print(f\"{len(test)} is not {len(df)}\")\r\n print(f\"drop duplicate rows {len(test.drop_duplicates())}\")\r\n```\r\n### OS:\r\n\r\nlinux, x64\r\n\r\n### DuckDB Version:\r\n\r\n0.7.0\r\n\r\n### DuckDB Client:\r\n\r\npython\r\n\r\n### Full Name:\r\n\r\nRoy Dennis\r\n\r\n### Affiliation:\r\n\r\nbq\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6272/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6272/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6259", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6259/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/6259/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6259/events", + "html_url": "https://github.com/duckdb/duckdb/issues/6259", + "id": 1583466740, + "node_id": "I_kwDOCEU65s5eYcj0", + "number": 6259, + "title": "`json_extract` exploding when inside `map`", + "user": { + "login": "cmackenzie1", + "id": 10947617, + "node_id": "MDQ6VXNlcjEwOTQ3NjE3", + "avatar_url": "https://avatars.githubusercontent.com/u/10947617?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/cmackenzie1", + "html_url": "https://github.com/cmackenzie1", + "followers_url": "https://api.github.com/users/cmackenzie1/followers", + "following_url": "https://api.github.com/users/cmackenzie1/following{/other_user}", + "gists_url": "https://api.github.com/users/cmackenzie1/gists{/gist_id}", + "starred_url": "https://api.github.com/users/cmackenzie1/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/cmackenzie1/subscriptions", + "organizations_url": "https://api.github.com/users/cmackenzie1/orgs", + "repos_url": "https://api.github.com/users/cmackenzie1/repos", + "events_url": "https://api.github.com/users/cmackenzie1/events{/privacy}", + "received_events_url": "https://api.github.com/users/cmackenzie1/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": { + "login": "LindsayWray", + "id": 69161963, + "node_id": "MDQ6VXNlcjY5MTYxOTYz", + "avatar_url": "https://avatars.githubusercontent.com/u/69161963?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/LindsayWray", + "html_url": "https://github.com/LindsayWray", + "followers_url": "https://api.github.com/users/LindsayWray/followers", + "following_url": "https://api.github.com/users/LindsayWray/following{/other_user}", + "gists_url": "https://api.github.com/users/LindsayWray/gists{/gist_id}", + "starred_url": "https://api.github.com/users/LindsayWray/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/LindsayWray/subscriptions", + "organizations_url": "https://api.github.com/users/LindsayWray/orgs", + "repos_url": "https://api.github.com/users/LindsayWray/repos", + "events_url": "https://api.github.com/users/LindsayWray/events{/privacy}", + "received_events_url": "https://api.github.com/users/LindsayWray/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "LindsayWray", + "id": 69161963, + "node_id": "MDQ6VXNlcjY5MTYxOTYz", + "avatar_url": "https://avatars.githubusercontent.com/u/69161963?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/LindsayWray", + "html_url": "https://github.com/LindsayWray", + "followers_url": "https://api.github.com/users/LindsayWray/followers", + "following_url": 
"https://api.github.com/users/LindsayWray/following{/other_user}", + "gists_url": "https://api.github.com/users/LindsayWray/gists{/gist_id}", + "starred_url": "https://api.github.com/users/LindsayWray/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/LindsayWray/subscriptions", + "organizations_url": "https://api.github.com/users/LindsayWray/orgs", + "repos_url": "https://api.github.com/users/LindsayWray/repos", + "events_url": "https://api.github.com/users/LindsayWray/events{/privacy}", + "received_events_url": "https://api.github.com/users/LindsayWray/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-14T04:24:41Z", + "updated_at": "2023-02-17T13:33:08Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nWhen reading from JSON (file or column of `json` type), if you do a select with a map, and the map key or value is populated using `json_extract_string`, the `map` fails with `Error: Invalid Input Error: Error in MAP creation: key list has a different size from value list (1 keys, 2 values)`\r\n\r\n### To Reproduce\r\n\r\n`example.json`\r\n\r\n```json\r\n{\"name\":\"donald\", \"is_duck\": true}\r\n{\"name\":\"cole\", \"is_duck\": false, \"extra\": \"hello, cole\"}\r\n```\r\n\r\nOpen DuckDB\r\n\r\n```sql\r\nselect map(['name'], [json_extract_string(json, '$.name')]) from read_ndjson_objects('./example.json');\r\n-- Error: Invalid Input Error: Error in MAP creation: key list has a different size from value list (1 keys, 2 values)\r\n```\r\n\r\n\r\n### OS:\r\n\r\nmacOS x64\r\n\r\n### DuckDB Version:\r\n\r\n0.7.0\r\n\r\n### DuckDB Client:\r\n\r\ncli\r\n\r\n### Full Name:\r\n\r\nCole MacKenzie\r\n\r\n### Affiliation:\r\n\r\nCloudflare\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6259/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6259/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6110", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6110/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/6110/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6110/events", + "html_url": "https://github.com/duckdb/duckdb/pull/6110", + "id": 1572637437, + "node_id": "PR_kwDOCEU65s5JV85y", + "number": 6110, + "title": "[Java] BigDecimal scale > precision bug fix", + "user": { + "login": "Tishj", + "id": 17162323, + "node_id": "MDQ6VXNlcjE3MTYyMzIz", + "avatar_url": "https://avatars.githubusercontent.com/u/17162323?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tishj", + "html_url": "https://github.com/Tishj", + "followers_url": "https://api.github.com/users/Tishj/followers", + "following_url": "https://api.github.com/users/Tishj/following{/other_user}", + "gists_url": "https://api.github.com/users/Tishj/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tishj/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tishj/subscriptions", + "organizations_url": "https://api.github.com/users/Tishj/orgs", + "repos_url": "https://api.github.com/users/Tishj/repos", + "events_url": "https://api.github.com/users/Tishj/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tishj/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-06T14:10:11Z", + "updated_at": "2023-02-12T13:01:46Z", + "closed_at": "2023-02-12T13:01:32Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/6110", + "html_url": "https://github.com/duckdb/duckdb/pull/6110", + "diff_url": "https://github.com/duckdb/duckdb/pull/6110.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/6110.patch", + "merged_at": "2023-02-12T13:01:32Z" + }, + "body": "This PR fixes #6073 \r\n\r\nAs the title says, in java BigDecimal can have a scale that is larger than its precision.\r\nSince we map directly precision -> width this causes a problem internally because we assert that width >= scale.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6110/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6110/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6084", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6084/labels{/name}", + "comments_url": 
"https://api.github.com/repos/duckdb/duckdb/issues/6084/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6084/events", + "html_url": "https://github.com/duckdb/duckdb/issues/6084", + "id": 1569662471, + "node_id": "I_kwDOCEU65s5djyYH", + "number": 6084, + "title": "CSV Reader - report file name in case of date conversion error", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-03T11:04:06Z", + "updated_at": "2023-02-07T16:14:53Z", + "closed_at": "2023-02-07T16:14:52Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": 
"### Discussed in https://github.com/duckdb/duckdb/discussions/5959\r\n\r\n
\r\n\r\nOriginally posted by **gunnarmorling** January 22, 2023\r\nI'm using `read_csv()` to query a set of CSV files in a directory:\r\n\r\n```\r\nSELECT COUNT(*) FROM read_csv('my-csvs/*.csv', ...);\r\n```\r\n\r\nOne of the files has a datum which doesn't correspond to the configured format, so the query fails:\r\n\r\n```\r\nError: Invalid Input Error: Could not parse string \"18-1218-12-1991\" according to format specifier \"%d-%m-%Y\"\r\n```\r\n\r\nSuggestion: Please print out the name of the affected file. While I could grep for the problematic file, having the the name as part of the error message will streamline the experience. Thanks a lot for your consideration!
", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6084/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6084/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6068", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/6068/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/6068/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/6068/events", + "html_url": "https://github.com/duckdb/duckdb/pull/6068", + "id": 1566748968, + "node_id": "PR_kwDOCEU65s5JCqOI", + "number": 6068, + "title": "fix: DESCRIBE does not show primary key", + "user": { + "login": "gkaretka", + "id": 10887050, + "node_id": "MDQ6VXNlcjEwODg3MDUw", + "avatar_url": "https://avatars.githubusercontent.com/u/10887050?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/gkaretka", + "html_url": "https://github.com/gkaretka", + "followers_url": "https://api.github.com/users/gkaretka/followers", + "following_url": "https://api.github.com/users/gkaretka/following{/other_user}", + "gists_url": "https://api.github.com/users/gkaretka/gists{/gist_id}", + "starred_url": "https://api.github.com/users/gkaretka/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/gkaretka/subscriptions", + "organizations_url": "https://api.github.com/users/gkaretka/orgs", + "repos_url": "https://api.github.com/users/gkaretka/repos", + "events_url": "https://api.github.com/users/gkaretka/events{/privacy}", + "received_events_url": "https://api.github.com/users/gkaretka/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2023-02-01T20:07:08Z", + "updated_at": "2023-02-15T22:16:01Z", + "closed_at": "2023-02-15T22:15:45Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/6068", + "html_url": "https://github.com/duckdb/duckdb/pull/6068", + "diff_url": "https://github.com/duckdb/duckdb/pull/6068.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/6068.patch", + "merged_at": "2023-02-15T22:15:45Z" + }, + "body": "fixes #1468", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/6068/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/6068/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5832", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5832/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5832/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5832/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5832", + "id": 1521545814, + "node_id": "I_kwDOCEU65s5asPJW", + "number": 
5832, + "title": "Relational API thinks that `SUM()` is `NULL`, how to change it to 0?", + "user": { + "login": "krlmlr", + "id": 1741643, + "node_id": "MDQ6VXNlcjE3NDE2NDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1741643?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/krlmlr", + "html_url": "https://github.com/krlmlr", + "followers_url": "https://api.github.com/users/krlmlr/followers", + "following_url": "https://api.github.com/users/krlmlr/following{/other_user}", + "gists_url": "https://api.github.com/users/krlmlr/gists{/gist_id}", + "starred_url": "https://api.github.com/users/krlmlr/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/krlmlr/subscriptions", + "organizations_url": "https://api.github.com/users/krlmlr/orgs", + "repos_url": "https://api.github.com/users/krlmlr/repos", + "events_url": "https://api.github.com/users/krlmlr/events{/privacy}", + "received_events_url": "https://api.github.com/users/krlmlr/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Tmonster", + "id": 6248601, + "node_id": "MDQ6VXNlcjYyNDg2MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6248601?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tmonster", + "html_url": "https://github.com/Tmonster", + "followers_url": "https://api.github.com/users/Tmonster/followers", + "following_url": "https://api.github.com/users/Tmonster/following{/other_user}", + "gists_url": "https://api.github.com/users/Tmonster/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tmonster/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tmonster/subscriptions", + "organizations_url": "https://api.github.com/users/Tmonster/orgs", + "repos_url": "https://api.github.com/users/Tmonster/repos", + "events_url": "https://api.github.com/users/Tmonster/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tmonster/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Tmonster", + "id": 6248601, + "node_id": "MDQ6VXNlcjYyNDg2MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6248601?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tmonster", + "html_url": "https://github.com/Tmonster", + "followers_url": "https://api.github.com/users/Tmonster/followers", + "following_url": "https://api.github.com/users/Tmonster/following{/other_user}", + "gists_url": "https://api.github.com/users/Tmonster/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tmonster/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tmonster/subscriptions", + "organizations_url": "https://api.github.com/users/Tmonster/orgs", + "repos_url": "https://api.github.com/users/Tmonster/repos", + "events_url": "https://api.github.com/users/Tmonster/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tmonster/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2023-01-05T22:30:33Z", + "updated_at": "2023-01-13T05:26:57Z", + "closed_at": "2023-01-13T05:26:56Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "### What happens?\n\nFor the corner case of an empty input, `SUM()` over a column returns `NULL` . 
This is perhaps consistent with SQL, but not with the data frame behavior.\n\n### To Reproduce\n\n``` r\r\nlibrary(duckdb)\r\n#> Loading required package: DBI\r\n\r\ndf <- data.frame(a = integer())\r\n\r\ndplyr::summarise(df, sum(a))\r\n#> sum(a)\r\n#> 1 0\r\n\r\ncon <- dbConnect(duckdb())\r\n\r\nrel <- duckdb:::rel_from_df(con, df)\r\n\r\nref <- duckdb:::expr_reference(\"a\")\r\nfn <- duckdb:::expr_function(\"sum\", list(ref))\r\nexprs <- list(fn)\r\n\r\nout <- duckdb:::rel_aggregate(rel, list(), exprs)\r\nout\r\n#> DuckDB Relation: \r\n#> ---------------------\r\n#> --- Relation Tree ---\r\n#> ---------------------\r\n#> Limit 1\r\n#> Aggregate [sum(a)]\r\n#> r_dataframe_scan(0x10f686698)\r\n#> \r\n#> ---------------------\r\n#> -- Result Columns --\r\n#> ---------------------\r\n#> - sum(a) (HUGEINT)\r\n\r\nduckdb:::rapi_rel_to_df(out)\r\n#> # A tibble: 1 × 1\r\n#> `sum(a)`\r\n#> \r\n#> 1 NA\r\n```\r\n\r\nCreated on 2023-01-05 with [reprex v2.0.2](https://reprex.tidyverse.org)\n\n### OS:\n\nmacOS aarch64\n\n### DuckDB Version:\n\n2dcdc245566bb9a0fc717bc9d2c3867f911abc6d\n\n### DuckDB Client:\n\nR\n\n### Full Name:\n\nKirill Müller\n\n### Affiliation:\n\ncynkra GmbH\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5832/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5832/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5822", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5822/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5822/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5822/events", + "html_url": "https://github.com/duckdb/duckdb/pull/5822", + "id": 1517720501, + "node_id": "PR_kwDOCEU65s5GjuVv", + "number": 5822, + "title": "Export right left and full joins", + "user": { + "login": "Tmonster", + "id": 6248601, + "node_id": "MDQ6VXNlcjYyNDg2MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6248601?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tmonster", + "html_url": "https://github.com/Tmonster", + "followers_url": "https://api.github.com/users/Tmonster/followers", + "following_url": "https://api.github.com/users/Tmonster/following{/other_user}", + "gists_url": "https://api.github.com/users/Tmonster/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tmonster/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tmonster/subscriptions", + "organizations_url": "https://api.github.com/users/Tmonster/orgs", + "repos_url": "https://api.github.com/users/Tmonster/repos", + "events_url": "https://api.github.com/users/Tmonster/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tmonster/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": 
"2023-01-03T17:42:02Z", + "updated_at": "2023-01-25T15:16:49Z", + "closed_at": "2023-01-25T15:16:49Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/5822", + "html_url": "https://github.com/duckdb/duckdb/pull/5822", + "diff_url": "https://github.com/duckdb/duckdb/pull/5822.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/5822.patch", + "merged_at": "2023-01-25T15:16:49Z" + }, + "body": "Exporting right, left, and full joins to the relational API. ", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5822/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5822/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5759", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5759/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5759/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5759/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5759", + "id": 1506746322, + "node_id": "I_kwDOCEU65s5Zzx_S", + "number": 5759, + "title": "Segmentation fault (or various memory errors) on WAL application", + "user": { + "login": "paulewog", + "id": 8584721, + "node_id": "MDQ6VXNlcjg1ODQ3MjE=", + "avatar_url": "https://avatars.githubusercontent.com/u/8584721?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/paulewog", + "html_url": "https://github.com/paulewog", + "followers_url": "https://api.github.com/users/paulewog/followers", + "following_url": "https://api.github.com/users/paulewog/following{/other_user}", + "gists_url": "https://api.github.com/users/paulewog/gists{/gist_id}", + "starred_url": "https://api.github.com/users/paulewog/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/paulewog/subscriptions", + "organizations_url": "https://api.github.com/users/paulewog/orgs", + "repos_url": "https://api.github.com/users/paulewog/repos", + "events_url": "https://api.github.com/users/paulewog/events{/privacy}", + "received_events_url": "https://api.github.com/users/paulewog/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "samansmink", + "id": 2925274, + "node_id": "MDQ6VXNlcjI5MjUyNzQ=", + "avatar_url": "https://avatars.githubusercontent.com/u/2925274?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/samansmink", + "html_url": "https://github.com/samansmink", + "followers_url": "https://api.github.com/users/samansmink/followers", + "following_url": "https://api.github.com/users/samansmink/following{/other_user}", + "gists_url": "https://api.github.com/users/samansmink/gists{/gist_id}", + "starred_url": "https://api.github.com/users/samansmink/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/samansmink/subscriptions", + "organizations_url": "https://api.github.com/users/samansmink/orgs", + "repos_url": "https://api.github.com/users/samansmink/repos", + "events_url": "https://api.github.com/users/samansmink/events{/privacy}", + "received_events_url": 
"https://api.github.com/users/samansmink/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "samansmink", + "id": 2925274, + "node_id": "MDQ6VXNlcjI5MjUyNzQ=", + "avatar_url": "https://avatars.githubusercontent.com/u/2925274?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/samansmink", + "html_url": "https://github.com/samansmink", + "followers_url": "https://api.github.com/users/samansmink/followers", + "following_url": "https://api.github.com/users/samansmink/following{/other_user}", + "gists_url": "https://api.github.com/users/samansmink/gists{/gist_id}", + "starred_url": "https://api.github.com/users/samansmink/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/samansmink/subscriptions", + "organizations_url": "https://api.github.com/users/samansmink/orgs", + "repos_url": "https://api.github.com/users/samansmink/repos", + "events_url": "https://api.github.com/users/samansmink/events{/privacy}", + "received_events_url": "https://api.github.com/users/samansmink/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-12-21T18:33:22Z", + "updated_at": "2023-01-06T14:19:31Z", + "closed_at": "2023-01-06T14:19:31Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nWhen applying a WAL file from an `INSERT INTO ... SELECT FROM ... GROUP BY`, a variety of errors occur, including:\r\n* corrupted double-linked list (Linux / AMD64)\r\n* Segmentation fault (OSX)\r\n* Also the below set of logs (OSX):\r\n```\r\nduckdb(31324,0x119e43600) malloc: *** error for object 0x6: pointer being freed was not allocated\r\nduckdb(31324,0x119e43600) malloc: *** set a breakpoint in malloc_error_break to debug\r\nAbort trap: 6\r\n```\r\n\r\nIf the group-by is removed, it does not occur.\r\n\r\nIf the inserts are broken up using LIMIT/OFFSET with breaks inbetween, this also works.\r\n\r\nWill upload the CSV.\r\n\r\n### To Reproduce\r\n\r\n```\r\nCREATE TABLE temp AS SELECT column0, last(column1) FROM 'anon-data.csv.gz' GROUP BY column0;\r\n```\r\n\r\n### OS:\r\n\r\nOSX, Linux AMD64\r\n\r\n### DuckDB Version:\r\n\r\n* 0.6.1\r\n* Also tried latest completed build off master - a3ea1654263e8f844ba6fad4ebf3f535d7aa52f1\r\n\r\n### DuckDB Client:\r\n\r\nCLI\r\n\r\n### Full Name:\r\n\r\nPaul Ellsworth\r\n\r\n### Affiliation:\r\n\r\nTenable, Inc.\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5759/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5759/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5714", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5714/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5714/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5714/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5714", + "id": 1498970547, + "node_id": "I_kwDOCEU65s5ZWHmz", + "number": 5714, + "title": "Adding Lateral Join causes results to change.", + "user": { + "login": "lloydtabb", + "id": 1093458, + "node_id": "MDQ6VXNlcjEwOTM0NTg=", + "avatar_url": "https://avatars.githubusercontent.com/u/1093458?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/lloydtabb", + "html_url": "https://github.com/lloydtabb", + "followers_url": "https://api.github.com/users/lloydtabb/followers", + "following_url": "https://api.github.com/users/lloydtabb/following{/other_user}", + "gists_url": "https://api.github.com/users/lloydtabb/gists{/gist_id}", + "starred_url": "https://api.github.com/users/lloydtabb/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/lloydtabb/subscriptions", + "organizations_url": "https://api.github.com/users/lloydtabb/orgs", + "repos_url": "https://api.github.com/users/lloydtabb/repos", + "events_url": "https://api.github.com/users/lloydtabb/events{/privacy}", + "received_events_url": "https://api.github.com/users/lloydtabb/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-12-15T19:28:20Z", + "updated_at": "2022-12-17T12:20:09Z", + "closed_at": "2022-12-17T12:20:09Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nI have query with a single lateral join. 
Adding a second lateral join causes the results to change for no aparaent reason (looks like records are somehow removed when they shouldn't be).\r\n\r\n\n\n### To Reproduce\n\nDownload the data.\r\nhttps://github.com/malloydata/malloy/blob/ltabb/index-with-stages/test/data/duckdb/ga_sample.parquet\r\n\r\nRun this query\r\n\r\n```\r\n SELECT \r\n hits_0.page.\"pageTitle\" as \"pageTitle\",\r\n COUNT(DISTINCT CONCAT(ga_sessions.\"__distinct_key\", 'x', hits_0_outer.__row_id)) as \"hits_count\",\r\n -- COUNT(DISTINCT CASE WHEN product_0.\"productQuantity\">0 THEN CONCAT(ga_sessions.\"__distinct_key\", 'x', hits_0_outer.__row_id) END) as \"sold_count\"\r\n FROM (SELECT GEN_RANDOM_UUID() as __distinct_key, * FROM ga_sample.parquet as x) as ga_sessions\r\n , (SELECT UNNEST(GENERATE_SERIES(1,length(ga_sessions.\"hits\"),1)) as __row_id, UNNEST(coalesce(ga_sessions.\"hits\",[null]))) as hits_0_outer(__row_id, hits_0)\r\n -- , (SELECT UNNEST(coalesce(hits_0.\"product\",[null]))) as product_0_outer(product_0)\r\n , (SELECT UNNEST([null])) as product_0_outer(product_0)\r\n WHERE ga_sessions.totals.\"transactionRevenue\">0\r\n GROUP BY 1\r\n ORDER BY 2 desc NULLS LAST\r\n LIMIT 3\r\n```\r\nThe output is \r\n\r\n![image](https://user-images.githubusercontent.com/1093458/207949121-7034e9af-ce4c-4d04-904d-037b52d19ecc.png)\r\n\r\nChange the join of product_0 to actually have some data (none is actually referenced in the query)\r\n\r\n```\r\n SELECT \r\n hits_0.page.\"pageTitle\" as \"pageTitle\",\r\n COUNT(DISTINCT CONCAT(ga_sessions.\"__distinct_key\", 'x', hits_0_outer.__row_id)) as \"hits_count\",\r\n -- COUNT(DISTINCT CASE WHEN product_0.\"productQuantity\">0 THEN CONCAT(ga_sessions.\"__distinct_key\", 'x', hits_0_outer.__row_id) END) as \"sold_count\"\r\n FROM (SELECT GEN_RANDOM_UUID() as __distinct_key, * FROM ga_sample.parquet as x) as ga_sessions\r\n , (SELECT UNNEST(GENERATE_SERIES(1,length(ga_sessions.\"hits\"),1)) as __row_id, UNNEST(coalesce(ga_sessions.\"hits\",[null]))) as hits_0_outer(__row_id, hits_0)\r\n , (SELECT UNNEST(coalesce(hits_0.\"product\",[null]))) as product_0_outer(product_0)\r\n -- , (SELECT UNNEST([null])) as product_0_outer(product_0)\r\n WHERE ga_sessions.totals.\"transactionRevenue\">0\r\n GROUP BY 1\r\n ORDER BY 2 desc NULLS LAST\r\n LIMIT 3\r\n```\r\n\r\nResults are different (the first result is missing}\r\n\r\n![image](https://user-images.githubusercontent.com/1093458/207949502-bd1f8040-a22d-4e48-bc8e-6749bbb8352f.png)\r\n\r\n\n\n### OS:\n\nlinux\n\n### DuckDB Version:\n\nv0.6.2-dev416 0367c35c9b\n\n### DuckDB Client:\n\ncli\n\n### Full Name:\n\nlloyd tabb\n\n### Affiliation:\n\ngoogle\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5714/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5714/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5660", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5660/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5660/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5660/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5660", + "id": 1489074642, + "node_id": "I_kwDOCEU65s5YwXnS", + "number": 5660, + "title": "Internal error on GROUP BY ALL", + "user": { + "login": "bmschmidt", + "id": 1110758, + "node_id": "MDQ6VXNlcjExMTA3NTg=", + "avatar_url": "https://avatars.githubusercontent.com/u/1110758?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/bmschmidt", + "html_url": "https://github.com/bmschmidt", + "followers_url": "https://api.github.com/users/bmschmidt/followers", + "following_url": "https://api.github.com/users/bmschmidt/following{/other_user}", + "gists_url": "https://api.github.com/users/bmschmidt/gists{/gist_id}", + "starred_url": "https://api.github.com/users/bmschmidt/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/bmschmidt/subscriptions", + "organizations_url": "https://api.github.com/users/bmschmidt/orgs", + "repos_url": "https://api.github.com/users/bmschmidt/repos", + "events_url": "https://api.github.com/users/bmschmidt/events{/privacy}", + "received_events_url": "https://api.github.com/users/bmschmidt/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Tmonster", + "id": 6248601, + "node_id": "MDQ6VXNlcjYyNDg2MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6248601?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tmonster", + "html_url": "https://github.com/Tmonster", + "followers_url": "https://api.github.com/users/Tmonster/followers", + "following_url": "https://api.github.com/users/Tmonster/following{/other_user}", + "gists_url": "https://api.github.com/users/Tmonster/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tmonster/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tmonster/subscriptions", + "organizations_url": "https://api.github.com/users/Tmonster/orgs", + "repos_url": "https://api.github.com/users/Tmonster/repos", + "events_url": "https://api.github.com/users/Tmonster/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tmonster/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Tmonster", + "id": 6248601, + "node_id": "MDQ6VXNlcjYyNDg2MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6248601?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tmonster", + 
"html_url": "https://github.com/Tmonster", + "followers_url": "https://api.github.com/users/Tmonster/followers", + "following_url": "https://api.github.com/users/Tmonster/following{/other_user}", + "gists_url": "https://api.github.com/users/Tmonster/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tmonster/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tmonster/subscriptions", + "organizations_url": "https://api.github.com/users/Tmonster/orgs", + "repos_url": "https://api.github.com/users/Tmonster/repos", + "events_url": "https://api.github.com/users/Tmonster/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tmonster/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-12-10T22:48:54Z", + "updated_at": "2023-01-17T12:52:48Z", + "closed_at": "2023-01-17T12:52:48Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nThanks again for this software.\r\n\r\nI've noticed that a `GROUP BY ALL` with an UNLIST operation throws an unclear error (\"InternalException: INTERNAL Error: Failed to bind column reference \"x\" [5.0] (bindings: [7.0])\"), while manually passing the column names to group by produces the correct result.\r\n\r\nAny subsequent queries throw the following error:\r\n```\r\nInvalidInputException: Invalid Input Error: Attempting to execute an unsuccessful or closed pending query result\r\nError: FATAL Error: Failed: database has been invalidated because of a previous fatal error. The database must be restarted prior to being used again.\r\nOriginal error: \"INTERNAL Error: Failed to bind column reference \"x\" [5.0] (bindings: [7.0])\"\r\n```\n\n### To Reproduce\n\nThrows an error:\r\n\r\n```python\r\nimport duckdb\r\ncon = duckdb.connect(\":memory:\")\r\ncon.execute(\"\"\"CREATE TABLE foo AS SELECT 'a, b, c' AS \"x\", '1' AS y\"\"\")\r\ncon.execute(\"\"\"SELECT y, UNLIST(string_split(\"x\", ', ')) x, COUNT(*) FROM foo GROUP BY ALL\"\"\")\r\n```\r\n\r\nWorks nicely:\r\n\r\n```python\r\nimport duckdb\r\ncon = duckdb.connect(\":memory:\")\r\ncon.execute(\"\"\"CREATE TABLE foo AS SELECT 'a, b, c' AS \"x\", '1' AS y\"\"\")\r\ncon.execute(\"\"\"SELECT y, UNLIST(string_split(\"x\", ', ')) x, COUNT(*) FROM foo GROUP BY x, y\"\"\")\r\n```\r\n\r\n\r\n\n\n### OS:\n\nOS X, Apple M1 Pro\n\n### DuckDB Version:\n\n0.6.1\n\n### DuckDB Client:\n\nPython\n\n### Full Name:\n\nBen Schmidt\n\n### Affiliation:\n\nNomic\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5660/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5660/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5640", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5640/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5640/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5640/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5640", + "id": 1485598788, + "node_id": "I_kwDOCEU65s5YjHBE", + "number": 5640, + "title": "Segfault on linux aarch64 with kernel large pages ", + "user": { + "login": "psanford", + "id": 33375, + "node_id": "MDQ6VXNlcjMzMzc1", + "avatar_url": "https://avatars.githubusercontent.com/u/33375?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/psanford", + "html_url": "https://github.com/psanford", + "followers_url": "https://api.github.com/users/psanford/followers", + "following_url": "https://api.github.com/users/psanford/following{/other_user}", + "gists_url": "https://api.github.com/users/psanford/gists{/gist_id}", + "starred_url": "https://api.github.com/users/psanford/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/psanford/subscriptions", + "organizations_url": "https://api.github.com/users/psanford/orgs", + "repos_url": "https://api.github.com/users/psanford/repos", + "events_url": "https://api.github.com/users/psanford/events{/privacy}", + "received_events_url": "https://api.github.com/users/psanford/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "lnkuiper", + "id": 14329666, + "node_id": "MDQ6VXNlcjE0MzI5NjY2", + "avatar_url": "https://avatars.githubusercontent.com/u/14329666?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/lnkuiper", + "html_url": "https://github.com/lnkuiper", + "followers_url": "https://api.github.com/users/lnkuiper/followers", + "following_url": "https://api.github.com/users/lnkuiper/following{/other_user}", + "gists_url": "https://api.github.com/users/lnkuiper/gists{/gist_id}", + "starred_url": "https://api.github.com/users/lnkuiper/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/lnkuiper/subscriptions", + "organizations_url": "https://api.github.com/users/lnkuiper/orgs", + "repos_url": "https://api.github.com/users/lnkuiper/repos", + "events_url": "https://api.github.com/users/lnkuiper/events{/privacy}", + "received_events_url": "https://api.github.com/users/lnkuiper/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "lnkuiper", + "id": 14329666, + "node_id": "MDQ6VXNlcjE0MzI5NjY2", + "avatar_url": "https://avatars.githubusercontent.com/u/14329666?v=4", + "gravatar_id": "", + "url": 
"https://api.github.com/users/lnkuiper", + "html_url": "https://github.com/lnkuiper", + "followers_url": "https://api.github.com/users/lnkuiper/followers", + "following_url": "https://api.github.com/users/lnkuiper/following{/other_user}", + "gists_url": "https://api.github.com/users/lnkuiper/gists{/gist_id}", + "starred_url": "https://api.github.com/users/lnkuiper/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/lnkuiper/subscriptions", + "organizations_url": "https://api.github.com/users/lnkuiper/orgs", + "repos_url": "https://api.github.com/users/lnkuiper/repos", + "events_url": "https://api.github.com/users/lnkuiper/events{/privacy}", + "received_events_url": "https://api.github.com/users/lnkuiper/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-12-08T23:36:38Z", + "updated_at": "2023-01-21T12:21:05Z", + "closed_at": "2023-01-21T12:21:05Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nStarting with v0.6.0, the DuckDB cli segfaults as soon as I attempt to do many actions, including tab completion or trying to read a file from disk. The segfaults do not happen on v0.5.x. \n\n### To Reproduce\n\nThis occurs on a linux/aarch64 system with 16k pages. The specific hardware I'm testing on is a MacBook Air M2 running Asahi linux.\r\n\r\nThis seems to have been introduced when the jemalloc dependency was added. \r\n\r\nBut, you might say, jemalloc supports large pages! Indeed it does, however this patch seems to disable that behavior on linux: https://github.com/duckdb/duckdb/pull/4971/commits/3b67618ac98e49aaa33163c48eb18ba5f69880f7\r\n\r\nReverting that patch fixes the segfaults on my system.\n\n### OS:\n\nlinux/aarch64\n\n### DuckDB Version:\n\nv0.6.1\n\n### DuckDB Client:\n\ncli\n\n### Full Name:\n\nPeter Sanford\n\n### Affiliation:\n\nnone \n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5640/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5640/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5564", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5564/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5564/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5564/events", + "html_url": "https://github.com/duckdb/duckdb/pull/5564", + "id": 1470751152, + "node_id": "PR_kwDOCEU65s5EB6kj", + "number": 5564, + "title": "Add the three-argument variant of the pg_catalog.pg_get_expr function", + "user": { + "login": "jwills", + "id": 535396, + "node_id": "MDQ6VXNlcjUzNTM5Ng==", + "avatar_url": "https://avatars.githubusercontent.com/u/535396?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/jwills", + "html_url": "https://github.com/jwills", + "followers_url": "https://api.github.com/users/jwills/followers", + "following_url": "https://api.github.com/users/jwills/following{/other_user}", + "gists_url": "https://api.github.com/users/jwills/gists{/gist_id}", + "starred_url": "https://api.github.com/users/jwills/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/jwills/subscriptions", + "organizations_url": "https://api.github.com/users/jwills/orgs", + "repos_url": "https://api.github.com/users/jwills/repos", + "events_url": "https://api.github.com/users/jwills/events{/privacy}", + "received_events_url": "https://api.github.com/users/jwills/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-12-01T06:20:22Z", + "updated_at": "2022-12-03T05:09:47Z", + "closed_at": "2022-12-03T05:09:42Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/5564", + "html_url": "https://github.com/duckdb/duckdb/pull/5564", + "diff_url": "https://github.com/duckdb/duckdb/pull/5564.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/5564.patch", + "merged_at": null + }, + "body": "I bumped into this issue during my DBeaver work for my Postgres proxy server; SQLAlchemy uses the two-argument variant of `pg_get_expr`, but DBeaver uses the three-arg variant. 
Adding it here for completeness with no additional testing (the two-arg variant is identical in implementation and is already tested in `test/sql/pg_catalog/sqlalchemy.test`)", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5564/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5564/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5523", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5523/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5523/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5523/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5523", + "id": 1466719597, + "node_id": "I_kwDOCEU65s5XbF1t", + "number": 5523, + "title": "list() aggregate function segfaults when used as a window function", + "user": { + "login": "voberoi", + "id": 26339, + "node_id": "MDQ6VXNlcjI2MzM5", + "avatar_url": "https://avatars.githubusercontent.com/u/26339?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/voberoi", + "html_url": "https://github.com/voberoi", + "followers_url": "https://api.github.com/users/voberoi/followers", + "following_url": "https://api.github.com/users/voberoi/following{/other_user}", + "gists_url": "https://api.github.com/users/voberoi/gists{/gist_id}", + "starred_url": "https://api.github.com/users/voberoi/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/voberoi/subscriptions", + "organizations_url": "https://api.github.com/users/voberoi/orgs", + "repos_url": "https://api.github.com/users/voberoi/repos", + "events_url": "https://api.github.com/users/voberoi/events{/privacy}", + "received_events_url": "https://api.github.com/users/voberoi/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "taniabogatsch", + "id": 44262898, + "node_id": "MDQ6VXNlcjQ0MjYyODk4", + "avatar_url": "https://avatars.githubusercontent.com/u/44262898?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/taniabogatsch", + "html_url": "https://github.com/taniabogatsch", + "followers_url": "https://api.github.com/users/taniabogatsch/followers", + "following_url": "https://api.github.com/users/taniabogatsch/following{/other_user}", + "gists_url": "https://api.github.com/users/taniabogatsch/gists{/gist_id}", + "starred_url": "https://api.github.com/users/taniabogatsch/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/taniabogatsch/subscriptions", + "organizations_url": "https://api.github.com/users/taniabogatsch/orgs", + "repos_url": "https://api.github.com/users/taniabogatsch/repos", + "events_url": "https://api.github.com/users/taniabogatsch/events{/privacy}", + "received_events_url": "https://api.github.com/users/taniabogatsch/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "taniabogatsch", + "id": 44262898, + "node_id": "MDQ6VXNlcjQ0MjYyODk4", + "avatar_url": "https://avatars.githubusercontent.com/u/44262898?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/taniabogatsch", + "html_url": "https://github.com/taniabogatsch", 
+ "followers_url": "https://api.github.com/users/taniabogatsch/followers", + "following_url": "https://api.github.com/users/taniabogatsch/following{/other_user}", + "gists_url": "https://api.github.com/users/taniabogatsch/gists{/gist_id}", + "starred_url": "https://api.github.com/users/taniabogatsch/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/taniabogatsch/subscriptions", + "organizations_url": "https://api.github.com/users/taniabogatsch/orgs", + "repos_url": "https://api.github.com/users/taniabogatsch/repos", + "events_url": "https://api.github.com/users/taniabogatsch/events{/privacy}", + "received_events_url": "https://api.github.com/users/taniabogatsch/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-28T16:32:19Z", + "updated_at": "2022-12-05T15:28:44Z", + "closed_at": "2022-12-05T15:22:53Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nA segfault occurs when using the list() aggregate function as a window function on a Parquet file I have:\r\n\r\n```\r\n❯ duckdb \r\nv0.6.0 2213f9c946\r\nEnter \".help\" for usage hints.\r\nConnected to a transient in-memory database.\r\nUse \".open FILENAME\" to reopen on a persistent database.\r\nD install parquet;\r\nD load parquet;\r\nD SELECT user_id, list(event_name) OVER (PARTITION BY user_id ORDER BY event_timestamp ASC) FROM 'github-sample.parquet';\r\nduckdb(80879,0x16f5db000) malloc: Incorrect checksum for freed object 0x131819200: probably modified after being freed.\r\nCorrupt value: 0x6873755000000009\r\nduckdb(80879,0x16f5db000) malloc: *** set a breakpoint in malloc_error_break to debug\r\nzsh: abort duckdb\r\n```\r\n\r\nThe issue does not occur when using `list()` in a GROUP BY context. Only in a window context. I have tried other window functions on this data, and they work -- including `string_agg()` and `row_number()`.\r\n\r\nThis seems to be limited to `list()` being used as a window function.\n\n### To Reproduce\n\n`SELECT user_id, list(event_name) OVER (PARTITION BY user_id ORDER BY event_timestamp ASC) FROM 'github-sample.parquet';`\r\n\r\nThe sample data (14 MB) is available in this repository with a README that shares how to reproduce it: https://github.com/voberoi/duckdb-window-function-segfault\n\n### OS:\n\nMac OS -- M1 arm64\n\n### DuckDB Version:\n\n0.6.0 (also reproducible with 0.5.1)\n\n### DuckDB Client:\n\nDuckDB shell\n\n### Full Name:\n\nVikram Oberoi\n\n### Affiliation:\n\nNo affiliation\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5523/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5523/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5455", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5455/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5455/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5455/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5455", + "id": 1460154241, + "node_id": "I_kwDOCEU65s5XCC-B", + "number": 5455, + "title": "Memory Cost Increase Too Fast When Frequently Insert and Delete Rows (compared with 0.5.1)", + "user": { + "login": "foTok", + "id": 4091925, + "node_id": "MDQ6VXNlcjQwOTE5MjU=", + "avatar_url": "https://avatars.githubusercontent.com/u/4091925?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/foTok", + "html_url": "https://github.com/foTok", + "followers_url": "https://api.github.com/users/foTok/followers", + "following_url": "https://api.github.com/users/foTok/following{/other_user}", + "gists_url": "https://api.github.com/users/foTok/gists{/gist_id}", + "starred_url": "https://api.github.com/users/foTok/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/foTok/subscriptions", + "organizations_url": "https://api.github.com/users/foTok/orgs", + "repos_url": "https://api.github.com/users/foTok/repos", + "events_url": "https://api.github.com/users/foTok/events{/privacy}", + "received_events_url": "https://api.github.com/users/foTok/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": 
"https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-22T16:07:32Z", + "updated_at": "2022-12-20T10:12:03Z", + "closed_at": "2022-12-20T10:12:03Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nWhen there exist many insertions/deletes/updates, memory increase visibly even the number of rows does not change.\r\n\r\nIs there any inner mechanism change from 0.5.1 to 0.6.0?\n\n### To Reproduce\n\n1. create a database in file mode.\r\n2. create a table and load some data into the table.\r\n3. randomly select some data insert into the table, and then delete them (the randomly inserted rows), to keep the row number a constant.\r\n4. If insert and delete 500 rows per second, the memory will increase greater than 1G in 1 hour.\r\n\r\nIn 0.5.1, memory increasing is slow, and the increasing is even invisible if perserve_insertion_order = false.\r\nHowever, in 0.6.0, the memory increases too fast.\n\n### OS:\n\ncentos 7\n\n### DuckDB Version:\n\n0.6.0\n\n### DuckDB Client:\n\nC++\n\n### Full Name:\n\nwenfeng\n\n### Affiliation:\n\n--\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5455/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5455/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5424", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5424/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5424/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5424/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5424", + "id": 1457431705, + "node_id": "I_kwDOCEU65s5W3qSZ", + "number": 5424, + "title": "memory allocation failed when checkpoint.", + "user": { + "login": "foTok", + "id": 4091925, + "node_id": "MDQ6VXNlcjQwOTE5MjU=", + "avatar_url": "https://avatars.githubusercontent.com/u/4091925?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/foTok", + "html_url": "https://github.com/foTok", + "followers_url": "https://api.github.com/users/foTok/followers", + "following_url": "https://api.github.com/users/foTok/following{/other_user}", + "gists_url": "https://api.github.com/users/foTok/gists{/gist_id}", + "starred_url": "https://api.github.com/users/foTok/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/foTok/subscriptions", + "organizations_url": "https://api.github.com/users/foTok/orgs", + "repos_url": "https://api.github.com/users/foTok/repos", + "events_url": "https://api.github.com/users/foTok/events{/privacy}", + "received_events_url": "https://api.github.com/users/foTok/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-21T07:21:12Z", + "updated_at": "2022-12-05T13:16:23Z", + "closed_at": "2022-11-21T08:54:39Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nmemory allocation failed when checkpoint.\r\n\r\nProgress crashed when checkpoint some times.\r\nerror in progoress:\r\nmalloc.c:3723: _int_malloc: Assertion `(unsigned long) (size) >= (unsigned long) (nb)' failed.\r\n\r\nerror in monitor script:\r\nsometimes: ./duck_monitor.sh: line 49: 62734 Aborted (core dumped) nohup XXXX\r\nsometimes: ./duck_monitor.sh: line 49: 45690 Segmentation fault (core dumped) nohup XXX\r\n\r\ncall stack may be 
used.\r\n/lib64/libc.so.6(+0x6dce1)[0x7fa968f31ce1]\r\n/lib64/libc.so.6(+0x734c6)[0x7fa968f374c6]\r\n/lib64/libc.so.6(+0x75a4f)[0x7fa968f39a4f]\r\n/lib64/libc.so.6(__libc_malloc+0x50)[0x7fa968f3b090]\r\n/home/usr/software/gcc-11.2.0/lib64/libstdc++.so.6(_Znwm+0x18)[0x7fa969824fc8]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(+0x117e0c8)[0x7fa96af1e0c8]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb11FSSTStorage18StringFinalAnalyzeERNS_12AnalyzeStateE+0x11f)[0x7fa96ae8230f]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb22ColumnDataCheckpointer27DetectBestCompressionMethodERm+0x170)[0x7fa96aed7300]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb22ColumnDataCheckpointer11WriteToDiskEv+0x92)[0x7fa96aed7702]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb10ColumnData10CheckpointERNS_8RowGroupERNS_19PartialBlockManagerERNS_20ColumnCheckpointInfoE+0x142)[0x7fa96aedb002]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb18StandardColumnData10CheckpointERNS_8RowGroupERNS_19PartialBlockManagerERNS_20ColumnCheckpointInfoE+0x40)[0x7fa96aedb4c0]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb8RowGroup11WriteToDiskERNS_19PartialBlockManagerERKSt6vectorINS_15CompressionTypeESaIS4_EE+0xb7)[0x7fa96aedbf87]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb8RowGroup10CheckpointERNS_14RowGroupWriterERSt6vectorISt10unique_ptrINS_14BaseStatisticsESt14default_deleteIS5_EESaIS8_EE+0x86)[0x7fa96aedc236]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb18RowGroupCollection10CheckpointERNS_15TableDataWriterERSt6vectorISt10unique_ptrINS_14BaseStatisticsESt14default_deleteIS5_EESaIS8_EE+0x5d)[0x7fa96aedc7bd]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb9DataTable10CheckpointERNS_15TableDataWriterE+0xc7)[0x7fa96af00387]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb16CheckpointWriter10WriteTableERNS_17TableCatalogEntryE+0x8a)[0x7fa96aef271a]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb16CheckpointWriter11WriteSchemaERNS_18SchemaCatalogEntryE+0x5b5)[0x7fa96aef2d45]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb26SingleFileCheckpointWriter16CreateCheckpointEv+0x1a7)[0x7fa96aeff5a7]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb24SingleFileStorageManager16CreateCheckpointEbb+0xa0)[0x7fa96aeff830]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb18TransactionManager17CommitTransactionB5cxx11ERNS_13ClientContextEPNS_11TransactionE+0x346)[0x7fa96af16b26]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb18TransactionContext6CommitEv+0x42)[0x7fa96af16d12]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb13ClientContext16EndQueryInternalERNS_17ClientContextLockEbb+0x1b9)[0x7fa96ae37de9]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb13ClientContext15CleanupInternalERNS_17ClientContextLockEPNS_15BaseQueryResultEb+0x7f)[0x7fa96ae380df]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb13ClientContext13FetchInternalERNS_17ClientContextLockERNS_8ExecutorERNS_15BaseQueryResultE+0x4e)[0x7fa96ae3859e]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb13ClientContext19FetchResultInternalERNS_17ClientContextLockERNS_18PendingQueryResultE+0x298)[0x7fa96ae38898]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6
duckdb18PendingQueryResult15ExecuteInternalERNS_17ClientContextLockE+0x72)[0x7fa96ae39242]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb13ClientContext27ExecutePendingQueryInternalERNS_17ClientContextLockERNS_18PendingQueryResultE+0xd)[0x7fa96ae3927d]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb13ClientContext5QueryERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEb+0x228)[0x7fa96ae43738]\r\n/home/usr/workspace/DuckDBVerification/lib/libduckdb.so(_ZN6duckdb10Connection5QueryERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE+0x18)[0x7fa96ae43a78]\r\n./DuckDBVerification[0x44888b]\r\n/home/usr/software/gcc-11.2.0/lib64/libstdc++.so.6(+0xd1340)[0x7fa96984e340]\r\n/lib64/libpthread.so.0(+0x86e4)[0x7fa969b8b6e4]\r\n/lib64/libc.so.6(clone+0x6d)[0x7fa968faa2dd]\r\n\n\n### To Reproduce\n\n1. Load much data, ~500M in csv, 80 tables.\r\n2. Randomly select some rows(about 10) and reinsert them into the original table.\r\n3. Process crashed in about 30min.\n\n### OS:\n\ncentos 7\n\n### DuckDB Version:\n\n0.6.0\n\n### DuckDB Client:\n\nC++\n\n### Full Name:\n\nwenfeng\n\n### Affiliation:\n\n--\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5424/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5424/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5419", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5419/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5419/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5419/events", + "html_url": "https://github.com/duckdb/duckdb/pull/5419", + "id": 1456872549, + "node_id": "PR_kwDOCEU65s5DTQJg", + "number": 5419, + "title": "httpfs: check environment vars for AWS Credentials", + "user": { + "login": "satotake", + "id": 9415800, + "node_id": "MDQ6VXNlcjk0MTU4MDA=", + "avatar_url": "https://avatars.githubusercontent.com/u/9415800?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/satotake", + "html_url": "https://github.com/satotake", + "followers_url": "https://api.github.com/users/satotake/followers", + "following_url": "https://api.github.com/users/satotake/following{/other_user}", + "gists_url": "https://api.github.com/users/satotake/gists{/gist_id}", + "starred_url": "https://api.github.com/users/satotake/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/satotake/subscriptions", + "organizations_url": "https://api.github.com/users/satotake/orgs", + "repos_url": "https://api.github.com/users/satotake/repos", + "events_url": "https://api.github.com/users/satotake/events{/privacy}", + "received_events_url": "https://api.github.com/users/satotake/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-20T11:46:21Z", + 
"updated_at": "2022-12-03T11:27:29Z", + "closed_at": "2022-12-03T11:27:26Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/5419", + "html_url": "https://github.com/duckdb/duckdb/pull/5419", + "diff_url": "https://github.com/duckdb/duckdb/pull/5419.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/5419.patch", + "merged_at": "2022-12-03T11:27:26Z" + }, + "body": "- Read and set AWS environment variables when `httpfs` extension is loaded\r\n - AWS_DEFAULT_REGION \r\n - AWS_ACCESS_KEY_ID \r\n - AWS_SECRET_ACCESS_KEY \r\n - AWS_SESSION_TOKEN\r\n- issue #4021 (partially)", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5419/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5419/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5409", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5409/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5409/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5409/events", + "html_url": "https://github.com/duckdb/duckdb/pull/5409", + "id": 1455422389, + "node_id": "PR_kwDOCEU65s5DObwS", + "number": 5409, + "title": "Add more undefined behavior", + "user": { + "login": "akuzm", + "id": 36882414, + "node_id": "MDQ6VXNlcjM2ODgyNDE0", + "avatar_url": "https://avatars.githubusercontent.com/u/36882414?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/akuzm", + "html_url": "https://github.com/akuzm", + "followers_url": "https://api.github.com/users/akuzm/followers", + "following_url": "https://api.github.com/users/akuzm/following{/other_user}", + "gists_url": "https://api.github.com/users/akuzm/gists{/gist_id}", + "starred_url": "https://api.github.com/users/akuzm/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/akuzm/subscriptions", + "organizations_url": "https://api.github.com/users/akuzm/orgs", + "repos_url": "https://api.github.com/users/akuzm/repos", + "events_url": "https://api.github.com/users/akuzm/events{/privacy}", + "received_events_url": "https://api.github.com/users/akuzm/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-18T16:26:00Z", + "updated_at": "2022-11-21T11:40:43Z", + "closed_at": "2022-11-21T11:28:07Z", + "author_association": "NONE", + "active_lock_reason": null, + "draft": true, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/5409", + "html_url": "https://github.com/duckdb/duckdb/pull/5409", + "diff_url": "https://github.com/duckdb/duckdb/pull/5409.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/5409.patch", + "merged_at": null + }, + "body": "In release builds, replace assertions with a corresponding __builtin_unreachable() call. 
This conveys the invariants stated by the assertions to the compiler, hopefully helping it generate more efficient code.\r\n\r\nInspired by this post about undefined behavior in Rust: https://blog.sigplan.org/2021/11/18/undefined-behavior-deserves-a-better-reputation/\r\n\r\nI lazily ran some micro-benchmark, and the most significant increase I noticed was about 20% on the `simple_distinct` benchmark. Please see the attached differential flamegraph. \r\n\r\n![diff](https://user-images.githubusercontent.com/36882414/202752800-b8ad6d9b-6d15-4667-a63e-3d6b422af0d7.svg)\r\n\r\nDo you think this is a good idea? We could also change `ASSERT_RESTRICT` to use the same logic.\r\n\r\n#### Other references\r\n\r\nSome ancient article from Regehr where he tries just that change for clang, compiling it with GCC 4.8.2 and Clang 3.4: https://blog.regehr.org/archives/1096\r\n\r\nC++23 is going to standartize `[[assume (expression)]]` which is exactly what we need here. One important point is that it doesn't evaluate the expression. https://en.cppreference.com/w/cpp/language/attributes/assume\r\n\r\nThe proposal for [[assume]]: https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1774r8.pdf\r\n\r\n#### Some preliminary findings\r\n\r\ngcc-10 in CI: some serious regressions (20%) in h2oai queries, small regressions all over: https://github.com/duckdb/duckdb/actions/runs/3499903758/jobs/5861989817\r\n\r\nlocal clang-14, `-O3 -march=native`, h2oai and micro benchmarks: a couple percent speedup on most queries, and on the few queries even 10% and more.\r\n\r\nlocal gcc 11, `-O3 -march=native`, less impressive speedups than clang-14, but also no regressions.\r\n\r\nclang's `__builtin_assume` is kinda useless because it doesn't accept inline functions that are not explicitly marked as pure:\r\n```\r\n/home/akuzm/duck1/duckdb/src/include/duckdb/common/types/vector.hpp:286:3: warning: the argument to '__builtin_assume' has side effects that will be discarded [-Wassume]\r\n D_ASSERT(vector.GetVectorType() == VectorType::FLAT_VECTOR);\r\n ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\r\n/home/akuzm/duck1/duckdb/src/include/duckdb/common/assert.hpp:36:38: note: expanded from macro 'D_ASSERT'\r\n#define D_ASSERT(X) __builtin_assume(static_cast(X))\r\n ^~~~~~~~~~~~~~~~~~~~\r\n```", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5409/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5409/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5370", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5370/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5370/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5370/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5370", + "id": 1450457322, + "node_id": "I_kwDOCEU65s5WdDjq", + "number": 5370, + "title": "Inner Joins exits the NodeJs Process", + "user": { + "login": "rayishome", + "id": 20548100, + "node_id": "MDQ6VXNlcjIwNTQ4MTAw", + "avatar_url": "https://avatars.githubusercontent.com/u/20548100?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/rayishome", + "html_url": "https://github.com/rayishome", + 
"followers_url": "https://api.github.com/users/rayishome/followers", + "following_url": "https://api.github.com/users/rayishome/following{/other_user}", + "gists_url": "https://api.github.com/users/rayishome/gists{/gist_id}", + "starred_url": "https://api.github.com/users/rayishome/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/rayishome/subscriptions", + "organizations_url": "https://api.github.com/users/rayishome/orgs", + "repos_url": "https://api.github.com/users/rayishome/repos", + "events_url": "https://api.github.com/users/rayishome/events{/privacy}", + "received_events_url": "https://api.github.com/users/rayishome/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-15T22:20:06Z", + "updated_at": "2023-02-01T15:48:58Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nThis query exits the node process: Any inner join\r\n\t\r\n\tselect d.dimvalue, count(*) from Fact f inner join dimTable d on d.did = f.did group by d.dimvalue\r\n\r\n\tThis query returns no values showing no missing values, but even if there where it shouldn't be an issue:\r\n\r\n\tselect * from Fact f where f.did not in (select did from dimTable)\r\n\r\n\tThis query works, but an inner join is needed for more advanced queries:\r\n\t\r\n\tselect d.dimvalue, count(*) from Fact f left join dimTable d on d.did = f.did group by d.dimvalue\n\n### To Reproduce\n\nThe query above should reproduce the issue. It's an issue in 5.1 and 6.0. My dataset has large Varchar columns so it's possible it's running out of memory, but not sure since the processes exists without an error code. \n\n### OS:\n\nx64\n\n### DuckDB Version:\n\n5.1 and 6.0\n\n### DuckDB Client:\n\nNodeJS\n\n### Full Name:\n\nRay Malone\n\n### Affiliation:\n\nNo affiliation\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5370/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5370/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5342", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5342/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5342/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5342/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5342", + "id": 1448624227, + "node_id": "I_kwDOCEU65s5WWEBj", + "number": 5342, + "title": "Strange error with make_date/date_part on large dataset", + "user": { + "login": "dylanscott", + "id": 491393, + "node_id": "MDQ6VXNlcjQ5MTM5Mw==", + "avatar_url": "https://avatars.githubusercontent.com/u/491393?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/dylanscott", + "html_url": "https://github.com/dylanscott", + "followers_url": "https://api.github.com/users/dylanscott/followers", + "following_url": "https://api.github.com/users/dylanscott/following{/other_user}", + "gists_url": "https://api.github.com/users/dylanscott/gists{/gist_id}", + "starred_url": "https://api.github.com/users/dylanscott/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/dylanscott/subscriptions", + "organizations_url": "https://api.github.com/users/dylanscott/orgs", + "repos_url": "https://api.github.com/users/dylanscott/repos", + "events_url": "https://api.github.com/users/dylanscott/events{/privacy}", + "received_events_url": "https://api.github.com/users/dylanscott/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + }, + { + "id": 3982366033, + "node_id": "LA_kwDOCEU65s7tXhVR", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Silver%20Foundation%20Member", + "name": "Silver Foundation Member", + "color": "CFCFCF", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + 
"type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-14T19:31:59Z", + "updated_at": "2022-11-17T09:48:44Z", + "closed_at": "2022-11-17T09:48:44Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nWe've encountered a strange bug on a specific query over a large sample dataset. With a certain set of filters we are encountering an error evaluating `make_date(date_part(['year', 'month', 'day'], \"timestamtz_column\"))`, which we use as a workaround to convert `timestamptz` columns to `date`s while respecting the configured `TimeZone`.\r\n\r\nI don't have a very good sense for what specifically might be triggering the issue. We ran into it accidentally and the repro presented here is isolated from that test case but isn't particularly minimal (exported as a ~2m row parquet file). But I don't believe it is a case of bad/corrupt input data because as shown in the repro steps we can successfully execute a couple of similar queries that seemingly should hit the same case if it were down to a bad row.\n\n### To Reproduce\n\nTo make things easier I have created a [little git repo](https://github.com/dylanscott/duckdb-repro) containing the parquet file and code to repro, as well as a couple of scripts to setup a virtualenv with the required dependencies and run the repro in it. I've tested this case with DuckDB 0.5.1, the new 0.6 release, and the latest master branch for good measure. The `repro.py` case contains the following:\r\n\r\nFirst we read the contents of the parquet file with pandas and register it with DuckDB:\r\n\r\n```python\r\nimport duckdb\r\nimport pandas\r\n\r\nconn = duckdb.connect(database=':memory:')\r\n\r\nrepro_df = pandas.read_parquet('repro.parquet', engine='fastparquet')\r\nconn.register('repro', repro_df)\r\n```\r\n\r\nThen we run a couple of queries that intend to show that this is a bug and not an issue with the input data. 
First we show that we can successfully run that `timestamptz` -> `date` conversion code for all rows in the dataset:\r\n\r\n```python\r\nall_converted_df = conn.execute(\"\"\"\r\nselect make_date(date_part(['year', 'month', 'day'], \"ACTUAL_DATE\")) as \"ACTUAL_DATE\" from repro\r\n\"\"\").df()\r\n```\r\n\r\nThe 2nd query gets closer to the erroring query - moving the date conversion into the where clause and filtering by the `FLAG` column, but this one still succeeds:\r\n\r\n```python\r\nless_filtered_df = conn.execute(\"\"\"\r\nselect * from repro\r\nwhere \"FLAG\" = true\r\n AND make_date(date_part(['year', 'month', 'day'], \"ACTUAL_DATE\")) >= '2021-01-01'\r\n\"\"\").df()\r\n```\r\n\r\nFinally we add a filter on the `HASH` column, which should run on a subset of the rows from the previous query but for some reason now hits an error:\r\n\r\n```python\r\nerroring_df = conn.execute(\"\"\"\r\nselect * from repro\r\nwhere \"FLAG\" = true\r\n AND \"HASH\" = 'be763d263e00fe8a7c1e37ef441c5519'\r\n AND make_date(date_part(['year', 'month', 'day'], \"ACTUAL_DATE\")) >= '2021-01-01'\r\n\"\"\").df()\r\n```\r\n\r\n```\r\nduckdb.ConversionException: Conversion Error: Date out of range: 0-0-0\r\n```\r\n\r\nFrom grepping the DuckDB codebase it seems like that error is coming from the `make_date` call and indicates that it's getting zeros for year/month/day, even though it doesn't seem to be in the other queries. Weird!\r\n\r\nAlso this may or may not be useful but here are a few things I encountered while isolating this repro that seemed interesting:\r\n\r\n- Selecting directly from the parquet file rather than reading with Pandas and registering doesn't repro the issue.\r\n - The main difference I could see is I think `ACTUAL_DATE` ends up as a `timestamp` (without time zone) when reading directly from the parquet file whereas Pandas reads it into a `datetime64[ns, UTC]` column which should map to `timestamptz` \r\n- I get the `Date out of range: 0-0-0` error when reproing on my M1 Mac, but I get different values in the error when we initially encountered the error in our app where the Python process is running in an x86-64 Linux VM. Specifically: `Date out of range: 1514828672-1514828656-1514828656`\r\n- The only existing issue I could find that seemed like it might be related was #2860 which also concerns a weird date conversion error manifesting on large datasets.\r\n - Though the only comment said it looks like an issue with the optimizer and I tested with `pragma disable_optimizer;` and still hit the issue.\n\n### OS:\n\nmacOS, Linux\n\n### DuckDB Version:\n\n0.6, 0.5.1\n\n### DuckDB Client:\n\nPython\n\n### Full Name:\n\nDylan Scott\n\n### Affiliation:\n\nHex Technologies\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
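For readers who want to poke at the conversion expression itself without downloading the ~2m-row repro file, here is a minimal self-contained sketch (not from the linked repro repo): the table and column names are illustrative, and it assumes the Python package's bundled ICU extension for `TimeZone` support.

```python
# Minimal sketch: exercise the same make_date(date_part([...])) conversion
# on a tiny synthetic timestamptz column, so the expression can be
# sanity-checked in isolation. Names (ts_demo, actual_date) are made up.
import duckdb

conn = duckdb.connect(database=':memory:')
conn.execute("SET TimeZone='America/New_York'")  # the conversion should respect this

conn.execute("""
CREATE TABLE ts_demo AS
SELECT TIMESTAMPTZ '2021-01-01 00:30:00+00' AS actual_date
""")

rows = conn.execute("""
SELECT make_date(date_part(['year', 'month', 'day'], actual_date)) FROM ts_demo
""").fetchall()
# midnight-ish UTC on Jan 1 should fall on 2020-12-31 in New York
print(rows)
```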
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5342/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5342/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5328", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5328/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5328/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5328/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5328", + "id": 1447641666, + "node_id": "I_kwDOCEU65s5WSUJC", + "number": 5328, + "title": "String to HUGEINT cast bug", + "user": { + "login": "taniabogatsch", + "id": 44262898, + "node_id": "MDQ6VXNlcjQ0MjYyODk4", + "avatar_url": "https://avatars.githubusercontent.com/u/44262898?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/taniabogatsch", + "html_url": "https://github.com/taniabogatsch", + "followers_url": "https://api.github.com/users/taniabogatsch/followers", + "following_url": "https://api.github.com/users/taniabogatsch/following{/other_user}", + "gists_url": "https://api.github.com/users/taniabogatsch/gists{/gist_id}", + "starred_url": "https://api.github.com/users/taniabogatsch/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/taniabogatsch/subscriptions", + "organizations_url": "https://api.github.com/users/taniabogatsch/orgs", + "repos_url": "https://api.github.com/users/taniabogatsch/repos", + "events_url": "https://api.github.com/users/taniabogatsch/events{/privacy}", + "received_events_url": "https://api.github.com/users/taniabogatsch/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-14T08:51:57Z", + "updated_at": "2022-11-16T21:58:45Z", + "closed_at": null, + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "### What happens?\n\nCasting from string to hugeint behaves incorrectly.\n\n### To Reproduce\n\n```sql\r\nD select '1.8259857912588366e+37'::hugeint;\r\n┌───────────────────────────────────────────┐\r\n│ CAST('1.8259857912588366e+37' AS HUGEINT) │\r\n│ int128 │\r\n├───────────────────────────────────────────┤\r\n│ 20000000000000000000000000000000000000 │\r\n└───────────────────────────────────────────┘\r\n```\r\nBut this works:\r\n```sql\r\nD select 1.8259857912588366e+37::hugeint;\r\n┌─────────────────────────────────────────┐\r\n│ CAST(1.8259857912588366e+37 AS HUGEINT) │\r\n│ int128 │\r\n├─────────────────────────────────────────┤\r\n│ 18259857912588365870837119913054699520 │\r\n└─────────────────────────────────────────┘\r\n```\n\n### OS:\n\niOS\n\n### DuckDB Version:\n\nmaster\n\n### DuckDB Client:\n\nCLI\n\n### Full Name:\n\nTania Bogatsch\n\n### Affiliation:\n\nDuckDB Labs\n\n### Have you tried this on the
latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5328/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5328/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5290", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5290/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5290/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5290/events", + "html_url": "https://github.com/duckdb/duckdb/pull/5290", + "id": 1444566750, + "node_id": "PR_kwDOCEU65s5CqAA7", + "number": 5290, + "title": "Enum type added to the types metadata table", + "user": { + "login": "LindsayWray", + "id": 69161963, + "node_id": "MDQ6VXNlcjY5MTYxOTYz", + "avatar_url": "https://avatars.githubusercontent.com/u/69161963?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/LindsayWray", + "html_url": "https://github.com/LindsayWray", + "followers_url": "https://api.github.com/users/LindsayWray/followers", + "following_url": "https://api.github.com/users/LindsayWray/following{/other_user}", + "gists_url": "https://api.github.com/users/LindsayWray/gists{/gist_id}", + "starred_url": "https://api.github.com/users/LindsayWray/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/LindsayWray/subscriptions", + "organizations_url": "https://api.github.com/users/LindsayWray/orgs", + "repos_url": "https://api.github.com/users/LindsayWray/repos", + "events_url": "https://api.github.com/users/LindsayWray/events{/privacy}", + "received_events_url": "https://api.github.com/users/LindsayWray/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 4024407704, + "node_id": "LA_kwDOCEU65s7v35aY", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Merge%20After%20Feature%20Freeze", + "name": "Merge After Feature Freeze", + "color": "90EE90", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-11-10T21:10:25Z", + "updated_at": "2022-11-28T09:22:47Z", + "closed_at": "2022-11-15T11:49:09Z", + "author_association": "NONE", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/5290", + "html_url": "https://github.com/duckdb/duckdb/pull/5290", + "diff_url": "https://github.com/duckdb/duckdb/pull/5290.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/5290.patch", + "merged_at": "2022-11-15T11:49:09Z" + }, + "body": "This PR addresses issue #5174 and offers an alternative solution. \r\n\r\n@Mause suggested outputting an additional field in the underlying table (duckdb_types) instead, which the pg_... view consumes to provide a Postgres-compatible interface that is therefore compatible with SQLAlchemy. So I’ve added the enum type to duckdb_types. \r\n\r\n\"image\"\r\n\r\n14 Nov.
additions: \r\nThe pg_enum view now shows the metadata of enum types similar to Postgres. To get the enum data for this view, I have added an extra column to the duckdb_types table containing the possible values of each enum type.\r\nSome small changes in the format_pg_type function and pg_type view were also required to make this functionality resemble the Postgres results as much as possible.\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5290/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5290/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5124", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5124/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5124/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5124/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5124", + "id": 1428048461, + "node_id": "I_kwDOCEU65s5VHkpN", + "number": 5124, + "title": "\"ERROR: AddressSanitizer: heap-buffer-overflow\" when in on-disk mode (not in-memory) when working with arrays", + "user": { + "login": "karayv", + "id": 2749230, + "node_id": "MDQ6VXNlcjI3NDkyMzA=", + "avatar_url": "https://avatars.githubusercontent.com/u/2749230?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/karayv", + "html_url": "https://github.com/karayv", + "followers_url": "https://api.github.com/users/karayv/followers", + "following_url": "https://api.github.com/users/karayv/following{/other_user}", + "gists_url": "https://api.github.com/users/karayv/gists{/gist_id}", + "starred_url": "https://api.github.com/users/karayv/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/karayv/subscriptions", + "organizations_url": "https://api.github.com/users/karayv/orgs", + "repos_url": "https://api.github.com/users/karayv/repos", + "events_url": "https://api.github.com/users/karayv/events{/privacy}", + "received_events_url": "https://api.github.com/users/karayv/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", + "id": 3978469, + "node_id": 
"MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-10-29T03:25:30Z", + "updated_at": "2022-11-01T06:32:58Z", + "closed_at": "2022-11-01T06:32:58Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nJDBC and CLI fail when working with arrays. In the case of JDBC, JVM crashes (the crash reports are not very informative). I was able to reproduce the JDBC issue on mac and linux. To reproduce in CLI I used mac, client versions 0.5.1 and 0.4.0. With `make debug` on 0.4.0 the report looks like this:\r\n```\r\n=================================================================\r\n==49159==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x61a0000005c0 at pc 0x000126c08f20 bp 0x70000702fa50 sp 0x70000702f218\r\nREAD of size 16 at 0x61a0000005c0 thread T1\r\n #0 0x126c08f1f in __asan_memcpy+0x1af (libclang_rt.asan_osx_dynamic.dylib:x86_64h+0x48f1f)\r\n #1 0x1127fc1dc in void duckdb::ListExtractTemplate(unsigned long long, duckdb::VectorData&, duckdb::VectorData&, duckdb::Vector&, unsigned long long, duckdb::Vector&) list_extract.cpp:66\r\n #2 0x1127ecf98 in duckdb::ExecuteListExtractInternal(unsigned long long, duckdb::VectorData&, duckdb::VectorData&, duckdb::Vector&, unsigned long long, duckdb::Vector&) list_extract.cpp:118\r\n #3 0x1127eb866 in duckdb::ExecuteListExtract(duckdb::Vector&, duckdb::Vector&, duckdb::Vector&, unsigned long long) list_extract.cpp:156\r\n #4 0x11274090c in duckdb::ListExtractFunction(duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&) list_extract.cpp:184\r\n #5 0x112a5d10e in decltype(static_cast(fp)(static_cast(fp0), static_cast(fp0), static_cast(fp0))) std::__1::__invoke(void (*&)(duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&), duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&) type_traits:3918\r\n #6 0x112a5ce94 in void std::__1::__invoke_void_return_wrapper::__call(void (*&)(duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&), duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&) invoke.h:61\r\n #7 0x112a5cb6c in std::__1::__function::__alloc_func, void (duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&)>::operator()(duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&) function.h:178\r\n #8 0x112a566c0 in std::__1::__function::__func, void (duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&)>::operator()(duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&) function.h:352\r\n #9 0x115469400 in std::__1::__function::__value_func::operator()(duckdb::DataChunk&, duckdb::ExpressionState&, 
duckdb::Vector&) const function.h:505\r\n #10 0x11505b7b0 in std::__1::function::operator()(duckdb::DataChunk&, duckdb::ExpressionState&, duckdb::Vector&) const function.h:1182\r\n #11 0x11505af2a in duckdb::ExpressionExecutor::Execute(duckdb::BoundFunctionExpression const&, duckdb::ExpressionState*, duckdb::SelectionVector const*, unsigned long long, duckdb::Vector&) execute_function.cpp:44\r\n #12 0x115e6b6e2 in duckdb::ExpressionExecutor::Execute(duckdb::Expression const&, duckdb::ExpressionState*, duckdb::SelectionVector const*, unsigned long long, duckdb::Vector&) expression_executor.cpp:173\r\n #13 0x115e68b0f in duckdb::ExpressionExecutor::ExecuteExpression(unsigned long long, duckdb::Vector&) expression_executor.cpp:75\r\n #14 0x115e680b0 in duckdb::ExpressionExecutor::Execute(duckdb::DataChunk*, duckdb::DataChunk&) expression_executor.cpp:46\r\n #15 0x116e65884 in duckdb::ExpressionExecutor::Execute(duckdb::DataChunk&, duckdb::DataChunk&) expression_executor.hpp:32\r\n #16 0x115b18160 in duckdb::PhysicalProjection::Execute(duckdb::ExecutionContext&, duckdb::DataChunk&, duckdb::DataChunk&, duckdb::GlobalOperatorState&, duckdb::OperatorState&) const physical_projection.cpp:29\r\n #17 0x11680c3b7 in duckdb::PipelineExecutor::Execute(duckdb::DataChunk&, duckdb::DataChunk&, unsigned long long) pipeline_executor.cpp:292\r\n #18 0x1168093a2 in duckdb::PipelineExecutor::ExecutePushInternal(duckdb::DataChunk&, unsigned long long) pipeline_executor.cpp:103\r\n #19 0x116807f80 in duckdb::PipelineExecutor::Execute(unsigned long long) pipeline_executor.cpp:64\r\n #20 0x11680b2c0 in duckdb::PipelineExecutor::Execute() pipeline_executor.cpp:78\r\n #21 0x1168b3d06 in duckdb::PipelineTask::ExecuteTask(duckdb::TaskExecutionMode) pipeline.cpp:43\r\n #22 0x1167cf946 in duckdb::ExecutorTask::Execute(duckdb::TaskExecutionMode) executor_task.cpp:17\r\n #23 0x116812360 in duckdb::TaskScheduler::ExecuteForever(std::__1::atomic*) task_scheduler.cpp:135\r\n #24 0x1168140c8 in duckdb::ThreadExecuteTasks(duckdb::TaskScheduler*, std::__1::atomic*) task_scheduler.cpp:166\r\n #25 0x116911872 in decltype(static_cast(fp)(static_cast(fp0), static_cast*>(fp0))) std::__1::__invoke*), duckdb::TaskScheduler*, std::__1::atomic*>(void (*&&)(duckdb::TaskScheduler*, std::__1::atomic*), duckdb::TaskScheduler*&&, std::__1::atomic*&&) type_traits:3918\r\n #26 0x116911668 in void std::__1::__thread_execute >, void (*)(duckdb::TaskScheduler*, std::__1::atomic*), duckdb::TaskScheduler*, std::__1::atomic*, 2ul, 3ul>(std::__1::tuple >, void (*)(duckdb::TaskScheduler*, std::__1::atomic*), duckdb::TaskScheduler*, std::__1::atomic*>&, std::__1::__tuple_indices<2ul, 3ul>) thread:287\r\n #27 0x11690ec06 in void* std::__1::__thread_proxy >, void (*)(duckdb::TaskScheduler*, std::__1::atomic*), duckdb::TaskScheduler*, std::__1::atomic*> >(void*) thread:298\r\n #28 0x7ff8009504e0 in _pthread_start+0x7c (libsystem_pthread.dylib:x86_64+0x64e0)\r\n #29 0x7ff80094bf6a in thread_start+0xe (libsystem_pthread.dylib:x86_64+0x1f6a)\r\n\r\n0x61a0000005c0 is located 0 bytes to the right of 1344-byte region [0x61a000000080,0x61a0000005c0)\r\nallocated by thread T1 here:\r\n #0 0x126c1c2bd in wrap__Znam+0x7d (libclang_rt.asan_osx_dynamic.dylib:x86_64h+0x5c2bd)\r\n #1 0x113fc4ef4 in duckdb::VectorBuffer::VectorBuffer(unsigned long long) vector_buffer.hpp:56\r\n #2 0x113fc4510 in duckdb::VectorBuffer::VectorBuffer(unsigned long long) vector_buffer.hpp:54\r\n #3 0x113fc4347 in std::__1::__shared_ptr_emplace >::__shared_ptr_emplace(std::__1::allocator, 
unsigned long long&&) shared_ptr.h:298\r\n #4 0x113fc3630 in std::__1::__shared_ptr_emplace >::__shared_ptr_emplace(std::__1::allocator, unsigned long long&&) shared_ptr.h:292\r\n #5 0x113fc3212 in std::__1::shared_ptr std::__1::allocate_shared, unsigned long long, void>(std::__1::allocator const&, unsigned long long&&) shared_ptr.h:1106\r\n #6 0x113fc2f7a in std::__1::shared_ptr std::__1::make_shared(unsigned long long&&) shared_ptr.h:1115\r\n #7 0x113e2407f in std::__1::shared_ptr duckdb::make_buffer(unsigned long long&&) types.hpp:199\r\n #8 0x113e23f57 in duckdb::VectorBuffer::CreateStandardVector(duckdb::PhysicalType, unsigned long long) vector_buffer.cpp:12\r\n #9 0x113e24315 in duckdb::VectorBuffer::CreateStandardVector(duckdb::LogicalType const&, unsigned long long) vector_buffer.cpp:24\r\n #10 0x113e29c85 in duckdb::Vector::Initialize(bool, unsigned long long) vector.cpp:216\r\n #11 0x113e2951c in duckdb::Vector::Vector(duckdb::LogicalType, bool, bool, unsigned long long) vector.cpp:24\r\n #12 0x113e2a133 in duckdb::Vector::Vector(duckdb::LogicalType, bool, bool, unsigned long long) vector.cpp:22\r\n #13 0x113e1badd in duckdb::Vector::Vector(duckdb::LogicalType, unsigned long long) vector.cpp:28\r\n #14 0x116a81383 in std::__1::__shared_ptr_emplace >::__shared_ptr_emplace(std::__1::allocator, duckdb::LogicalType&, unsigned int&) shared_ptr.h:298\r\n #15 0x116a80588 in std::__1::__shared_ptr_emplace >::__shared_ptr_emplace(std::__1::allocator, duckdb::LogicalType&, unsigned int&) shared_ptr.h:292\r\n #16 0x116a80155 in std::__1::shared_ptr std::__1::allocate_shared, duckdb::LogicalType&, unsigned int&, void>(std::__1::allocator const&, duckdb::LogicalType&, unsigned int&) shared_ptr.h:1106\r\n #17 0x116a7fd6f in std::__1::shared_ptr std::__1::make_shared(duckdb::LogicalType&, unsigned int&) shared_ptr.h:1115\r\n #18 0x116942557 in std::__1::shared_ptr duckdb::make_buffer(duckdb::LogicalType&, unsigned int&) types.hpp:199\r\n #19 0x116940c9b in duckdb::DictionaryCompressionStorage::StringInitScan(duckdb::ColumnSegment&) dictionary_compression.cpp:435\r\n #20 0x116bbaa31 in duckdb::ColumnSegment::InitializeScan(duckdb::ColumnScanState&) column_segment.cpp:74\r\n #21 0x116bc3b98 in duckdb::ColumnData::ScanVector(duckdb::ColumnScanState&, duckdb::Vector&, unsigned long long) column_data.cpp:62\r\n #22 0x116bcce54 in duckdb::ColumnData::ScanCount(duckdb::ColumnScanState&, duckdb::Vector&, unsigned long long) column_data.cpp:153\r\n #23 0x116c6361c in duckdb::StandardColumnData::ScanCount(duckdb::ColumnScanState&, duckdb::Vector&, unsigned long long) standard_column_data.cpp:71\r\n #24 0x116c0cc71 in duckdb::ListColumnData::ScanCount(duckdb::ColumnScanState&, duckdb::Vector&, unsigned long long) list_column_data.cpp:110\r\n #25 0x116c0b367 in duckdb::ListColumnData::Scan(duckdb::Transaction&, unsigned long long, duckdb::ColumnScanState&, duckdb::Vector&) list_column_data.cpp:70\r\n #26 0x116c3917e in void duckdb::RowGroup::TemplatedScan<(duckdb::TableScanType)0>(duckdb::Transaction*, duckdb::RowGroupScanState&, duckdb::DataChunk&) row_group.cpp:345\r\n #27 0x116c37c18 in duckdb::RowGroup::Scan(duckdb::Transaction&, duckdb::RowGroupScanState&, duckdb::DataChunk&) row_group.cpp:430\r\n #28 0x116e408b0 in duckdb::DataTable::ScanBaseTable(duckdb::Transaction&, duckdb::DataChunk&, duckdb::TableScanState&) data_table.cpp:391\r\n #29 0x116e404d0 in duckdb::DataTable::Scan(duckdb::Transaction&, duckdb::DataChunk&, duckdb::TableScanState&, std::__1::vector >&) data_table.cpp:379\r\n\r\nThread T1 
created by T0 here:\r\n #0 0x126c0493c in wrap_pthread_create+0x5c (libclang_rt.asan_osx_dynamic.dylib:x86_64h+0x4493c)\r\n #1 0x11690ea00 in std::__1::__libcpp_thread_create(_opaque_pthread_t**, void* (*)(void*), void*) __threading_support:421\r\n #2 0x11690e38f in std::__1::thread::thread*), duckdb::TaskScheduler*, std::__1::atomic*, void>(void (&)(duckdb::TaskScheduler*, std::__1::atomic*), duckdb::TaskScheduler*&&, std::__1::atomic*&&) thread:314\r\n #3 0x11690de10 in std::__1::thread::thread*), duckdb::TaskScheduler*, std::__1::atomic*, void>(void (&)(duckdb::TaskScheduler*, std::__1::atomic*), duckdb::TaskScheduler*&&, std::__1::atomic*&&) thread:306\r\n #4 0x116814000 in std::__1::unique_ptr > duckdb::make_unique*), duckdb::TaskScheduler*, std::__1::atomic*>(void (&)(duckdb::TaskScheduler*, std::__1::atomic*), duckdb::TaskScheduler*&&, std::__1::atomic*&&) helper.hpp:41\r\n #5 0x11681132c in duckdb::TaskScheduler::SetThreadsInternal(int) task_scheduler.cpp:214\r\n #6 0x116813664 in duckdb::TaskScheduler::SetThreads(int) task_scheduler.cpp:180\r\n #7 0x1163db569 in duckdb::DatabaseInstance::Initialize(char const*, duckdb::DBConfig*) database.cpp:137\r\n #8 0x1163de3bd in duckdb::DuckDB::DuckDB(char const*, duckdb::DBConfig*) database.cpp:141\r\n #9 0x1163de738 in duckdb::DuckDB::DuckDB(char const*, duckdb::DBConfig*) database.cpp:140\r\n #10 0x10ff708c8 in std::__1::unique_ptr > duckdb::make_unique(char const*&, duckdb::DBConfig*&&) helper.hpp:41\r\n #11 0x10ff6fe78 in sqlite3_open_v2 sqlite3_api_wrapper.cpp:97\r\n #12 0x10fe75789 in open_db shell.c:14246\r\n #13 0x10fe90933 in do_meta_command shell.c:18351\r\n #14 0x10feab1ad in process_input shell.c:20073\r\n #15 0x10fe72f2c in main shell.c:20900\r\n #16 0x126b0d52d in start+0x1cd (dyld:x86_64+0x552d)\r\n\r\nSUMMARY: AddressSanitizer: heap-buffer-overflow (libclang_rt.asan_osx_dynamic.dylib:x86_64h+0x48f1f) in __asan_memcpy+0x1af\r\nShadow bytes around the buggy address:\r\n 0x1c3400000060: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00\r\n 0x1c3400000070: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00\r\n 0x1c3400000080: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00\r\n 0x1c3400000090: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00\r\n 0x1c34000000a0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00\r\n=>0x1c34000000b0: 00 00 00 00 00 00 00 00[fa]fa fa fa fa fa fa fa\r\n 0x1c34000000c0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa\r\n 0x1c34000000d0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa\r\n 0x1c34000000e0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa\r\n 0x1c34000000f0: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa\r\n 0x1c3400000100: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa\r\nShadow byte legend (one shadow byte represents 8 application bytes):\r\n Addressable: 00\r\n Partially addressable: 01 02 03 04 05 06 07 \r\n Heap left redzone: fa\r\n Freed heap region: fd\r\n Stack left redzone: f1\r\n Stack mid redzone: f2\r\n Stack right redzone: f3\r\n Stack after return: f5\r\n Stack use after scope: f8\r\n Global redzone: f9\r\n Global init order: f6\r\n Poisoned by user: f7\r\n Container overflow: fc\r\n Array cookie: ac\r\n Intra object redzone: bb\r\n ASan internal: fe\r\n Left alloca redzone: ca\r\n Right alloca redzone: cb\r\n==49159==ABORTING\r\n\r\n```\n\n### To Reproduce\n\n```sql\r\n$ ./duckdb\r\n.open db1\r\n\r\ncreate table t1 as\r\nselect\r\n\th1,\r\n\th2,\r\n\th3,\r\n\tLOWER(TRIM(s)) as s,\r\nfrom\r\n\tread_csv_auto('jvm_crash3.csv', ALL_VARCHAR = 1);\r\n\r\ncreate table t2 (\r\n h1 
VARCHAR,\r\n h2 VARCHAR,\r\n h3 VARCHAR,\r\n s VARCHAR[],\r\n primary key(h1)\r\n);\r\n\r\ninsert into\t\r\n\tt2 \r\nselect\r\n\th1,\r\n\th2,\r\n\th3,\r\n\tstring_to_array(s, ',') as s\r\nfrom t1;\r\ndrop table t1;\r\n\r\ncreate table t3 as select s[1] from t2;\r\n```\n\n### OS:\n\nLinux, MacOS\n\n### DuckDB Version:\n\n0.4.0, 0.4.1, 0.5.1\n\n### DuckDB Client:\n\nCLI, JDBC\n\n### Full Name:\n\nAndrii Karaivanskyi\n\n### Affiliation:\n\nIndeed\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5124/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5124/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5097", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/5097/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/5097/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/5097/events", + "html_url": "https://github.com/duckdb/duckdb/issues/5097", + "id": 1424939118, + "node_id": "I_kwDOCEU65s5U7thu", + "number": 5097, + "title": "explain can't finish when inner- and outer-joining 48 tables", + "user": { + "login": "digoal", + "id": 1920239, + "node_id": "MDQ6VXNlcjE5MjAyMzk=", + "avatar_url": "https://avatars.githubusercontent.com/u/1920239?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/digoal", + "html_url": "https://github.com/digoal", + "followers_url": "https://api.github.com/users/digoal/followers", + "following_url": "https://api.github.com/users/digoal/following{/other_user}", + "gists_url": "https://api.github.com/users/digoal/gists{/gist_id}", + "starred_url": "https://api.github.com/users/digoal/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/digoal/subscriptions", + "organizations_url": "https://api.github.com/users/digoal/orgs", + "repos_url": "https://api.github.com/users/digoal/repos", + "events_url": "https://api.github.com/users/digoal/events{/privacy}", + "received_events_url": "https://api.github.com/users/digoal/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Tmonster", + "id": 6248601, + "node_id": "MDQ6VXNlcjYyNDg2MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6248601?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tmonster", + "html_url": "https://github.com/Tmonster", + "followers_url": "https://api.github.com/users/Tmonster/followers", + "following_url": "https://api.github.com/users/Tmonster/following{/other_user}", + "gists_url": "https://api.github.com/users/Tmonster/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tmonster/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tmonster/subscriptions", + "organizations_url": "https://api.github.com/users/Tmonster/orgs", + "repos_url": "https://api.github.com/users/Tmonster/repos", + "events_url":
"https://api.github.com/users/Tmonster/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tmonster/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Tmonster", + "id": 6248601, + "node_id": "MDQ6VXNlcjYyNDg2MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6248601?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tmonster", + "html_url": "https://github.com/Tmonster", + "followers_url": "https://api.github.com/users/Tmonster/followers", + "following_url": "https://api.github.com/users/Tmonster/following{/other_user}", + "gists_url": "https://api.github.com/users/Tmonster/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tmonster/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tmonster/subscriptions", + "organizations_url": "https://api.github.com/users/Tmonster/orgs", + "repos_url": "https://api.github.com/users/Tmonster/repos", + "events_url": "https://api.github.com/users/Tmonster/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tmonster/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-10-27T02:41:23Z", + "updated_at": "2022-12-07T15:25:51Z", + "closed_at": "2022-12-07T15:25:51Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nexplain query; -- cann't runout. and CPU 100%\r\nquery is:\r\n```\r\nexplain \r\nselect * from \r\nt1 \r\ninner join t2 on (t1.c2=t2.c1) \r\nleft join t4 on (t1.c3=t4.c1) left join t5 on (t4.c2=t5.c1) left join t6 on (t5.c2=t6.c1) \r\nleft join t7 on (t6.c2=t7.c1) left join t8 on (t7.c2=t8.c1) left join t9 on (t8.c2=t9.c1) \r\nleft join t10 on (t9.c2=t10.c1) left join t11 on (t10.c2=t11.c1) left join t12 on (t11.c2=t12.c1) \r\nleft join t13 on (t12.c2=t13.c1) left join t14 on (t13.c2=t14.c1) left join t15 on (t14.c2=t15.c1) \r\nleft join t16 on (t15.c2=t16.c1) left join t17 on (t16.c2=t17.c1) left join t18 on (t17.c2=t18.c1) \r\nleft join t19 on (t18.c2=t19.c1) left join t20 on (t19.c2=t20.c1) left join t21 on (t20.c2=t21.c1) \r\nleft join t22 on (t21.c2=t22.c1) left join t23 on (t22.c2=t23.c1) left join t24 on (t23.c2=t24.c1) \r\nleft join t25 on (t24.c2=t25.c1) left join t26 on (t25.c2=t26.c1) left join t27 on (t26.c2=t27.c1) \r\nleft join t28 on (t27.c2=t28.c1) left join t29 on (t28.c2=t29.c1) left join t30 on (t29.c2=t30.c1) \r\nleft join t31 on (t30.c2=t31.c1) left join t32 on (t31.c2=t32.c1) left join t33 on (t32.c2=t33.c1) \r\nleft join t34 on (t33.c2=t34.c1) left join t35 on (t34.c2=t35.c1) left join t36 on (t35.c2=t36.c1) \r\nleft join t37 on (t36.c2=t37.c1) left join t38 on (t37.c2=t38.c1) left join t39 on (t38.c2=t39.c1) \r\nleft join t3 ttttt3 on (ttttt3.c6=t33.c5) -- 加一行 \r\nleft join t40 on (t39.c2=t40.c1) left join t41 on (t40.c2=t41.c1) left join t42 on (t41.c2=t42.c1) \r\nleft join t43 on (t42.c2=t43.c1) left join t44 on (t43.c2=t44.c1) left join t45 on (t44.c2=t45.c1) \r\nleft join t46 on (t45.c2=t46.c1) left join t47 on (t46.c2=t47.c1) left join t48 on (t47.c2=t48.c1) \r\nleft join t5 ttt5 on (t42.c1=ttt5.c3) left join t5 ttt6 on (ttt5.c4=ttt6.c5) left join t5 ttt7 on (ttt6.c6=ttt7.c7) \r\ninner join t3 on (t2.c2=t3.c1) \r\nleft join t4 tt4 on (t47.c1=tt4.c3) left join t4 tt5 on (tt4.c4=tt5.c5) \r\n; \r\n```\n\n### To Reproduce\n\n1、download the test data:\r\nhttps://github.com/digoal/blog/blob/master/202210/db.duckdb.bak.tar.bz2\r\n2、load into duckdb v0.5.1 
7c111322d\r\n3. Run \r\n```\r\nexplain \r\nselect * from \r\nt1 \r\ninner join t2 on (t1.c2=t2.c1) \r\nleft join t4 on (t1.c3=t4.c1) left join t5 on (t4.c2=t5.c1) left join t6 on (t5.c2=t6.c1) \r\nleft join t7 on (t6.c2=t7.c1) left join t8 on (t7.c2=t8.c1) left join t9 on (t8.c2=t9.c1) \r\nleft join t10 on (t9.c2=t10.c1) left join t11 on (t10.c2=t11.c1) left join t12 on (t11.c2=t12.c1) \r\nleft join t13 on (t12.c2=t13.c1) left join t14 on (t13.c2=t14.c1) left join t15 on (t14.c2=t15.c1) \r\nleft join t16 on (t15.c2=t16.c1) left join t17 on (t16.c2=t17.c1) left join t18 on (t17.c2=t18.c1) \r\nleft join t19 on (t18.c2=t19.c1) left join t20 on (t19.c2=t20.c1) left join t21 on (t20.c2=t21.c1) \r\nleft join t22 on (t21.c2=t22.c1) left join t23 on (t22.c2=t23.c1) left join t24 on (t23.c2=t24.c1) \r\nleft join t25 on (t24.c2=t25.c1) left join t26 on (t25.c2=t26.c1) left join t27 on (t26.c2=t27.c1) \r\nleft join t28 on (t27.c2=t28.c1) left join t29 on (t28.c2=t29.c1) left join t30 on (t29.c2=t30.c1) \r\nleft join t31 on (t30.c2=t31.c1) left join t32 on (t31.c2=t32.c1) left join t33 on (t32.c2=t33.c1) \r\nleft join t34 on (t33.c2=t34.c1) left join t35 on (t34.c2=t35.c1) left join t36 on (t35.c2=t36.c1) \r\nleft join t37 on (t36.c2=t37.c1) left join t38 on (t37.c2=t38.c1) left join t39 on (t38.c2=t39.c1) \r\nleft join t3 ttttt3 on (ttttt3.c6=t33.c5) -- added line \r\nleft join t40 on (t39.c2=t40.c1) left join t41 on (t40.c2=t41.c1) left join t42 on (t41.c2=t42.c1) \r\nleft join t43 on (t42.c2=t43.c1) left join t44 on (t43.c2=t44.c1) left join t45 on (t44.c2=t45.c1) \r\nleft join t46 on (t45.c2=t46.c1) left join t47 on (t46.c2=t47.c1) left join t48 on (t47.c2=t48.c1) \r\nleft join t5 ttt5 on (t42.c1=ttt5.c3) left join t5 ttt6 on (ttt5.c4=ttt6.c5) left join t5 ttt7 on (ttt6.c6=ttt7.c7) \r\ninner join t3 on (t2.c2=t3.c1) \r\nleft join t4 tt4 on (t47.c1=tt4.c3) left join t4 tt5 on (tt4.c4=tt5.c5) \r\n; \r\n``` \r\n4. Here is a perf.svg captured while running the explain query: \r\nhttps://github.com/digoal/blog/blob/master/202210/perf.svg.tar.bz2\n\n### OS:\n\nMacOS\n\n### DuckDB Version:\n\nv0.5.1 7c111322d\n\n### DuckDB Client:\n\nCLI\n\n### Full Name:\n\nDigoal Zhou\n\n### Affiliation:\n\nAlibaba\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration?
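Since the report is about planner time growing with the number of joins, a hypothetical bisection script can help narrow down the join count at which EXPLAIN starts to blow up. This is not from the issue: it uses empty synthetic tables and a simplified left-join chain, so it may well not reproduce the hang itself, but it shows the shape of such a test.

```python
# Hypothetical bisection helper: build t1..t48 as tiny synthetic tables,
# then time EXPLAIN over progressively longer join chains.
import time
import duckdb

conn = duckdb.connect(database=':memory:')
for i in range(1, 49):
    conn.execute(f"CREATE TABLE t{i} (c1 INTEGER, c2 INTEGER, c3 INTEGER)")

def time_explain(n: int) -> float:
    # chain t1 left join t2 left join ... left join tn on matching c2 = c1
    joins = "".join(
        f"left join t{i} on (t{i - 1}.c2 = t{i}.c1) " for i in range(2, n + 1)
    )
    start = time.perf_counter()
    conn.execute(f"explain select * from t1 {joins}")
    return time.perf_counter() - start

for n in (8, 16, 24, 32, 40, 48):
    print(n, "joins:", round(time_explain(n), 3), "s")
```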
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/5097/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/5097/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4981", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4981/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4981/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4981/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4981", + "id": 1407733642, + "node_id": "I_kwDOCEU65s5T6E-K", + "number": 4981, + "title": "S3 Access from Sagemaker: DuckDB never uses more than 2 threads when accessing S3", + "user": { + "login": "CerebralMastication", + "id": 126879, + "node_id": "MDQ6VXNlcjEyNjg3OQ==", + "avatar_url": "https://avatars.githubusercontent.com/u/126879?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/CerebralMastication", + "html_url": "https://github.com/CerebralMastication", + "followers_url": "https://api.github.com/users/CerebralMastication/followers", + "following_url": "https://api.github.com/users/CerebralMastication/following{/other_user}", + "gists_url": "https://api.github.com/users/CerebralMastication/gists{/gist_id}", + "starred_url": "https://api.github.com/users/CerebralMastication/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/CerebralMastication/subscriptions", + "organizations_url": "https://api.github.com/users/CerebralMastication/orgs", + "repos_url": "https://api.github.com/users/CerebralMastication/repos", + "events_url": "https://api.github.com/users/CerebralMastication/events{/privacy}", + "received_events_url": "https://api.github.com/users/CerebralMastication/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "samansmink", + "id": 2925274, + "node_id": "MDQ6VXNlcjI5MjUyNzQ=", + "avatar_url": "https://avatars.githubusercontent.com/u/2925274?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/samansmink", + "html_url": "https://github.com/samansmink", + "followers_url": "https://api.github.com/users/samansmink/followers", + "following_url": "https://api.github.com/users/samansmink/following{/other_user}", + "gists_url": "https://api.github.com/users/samansmink/gists{/gist_id}", + "starred_url": "https://api.github.com/users/samansmink/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/samansmink/subscriptions", + "organizations_url": "https://api.github.com/users/samansmink/orgs", + "repos_url": "https://api.github.com/users/samansmink/repos", + "events_url": "https://api.github.com/users/samansmink/events{/privacy}", + "received_events_url": "https://api.github.com/users/samansmink/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "samansmink", + "id": 2925274, + "node_id": "MDQ6VXNlcjI5MjUyNzQ=", + "avatar_url": "https://avatars.githubusercontent.com/u/2925274?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/samansmink", + "html_url": 
"https://github.com/samansmink", + "followers_url": "https://api.github.com/users/samansmink/followers", + "following_url": "https://api.github.com/users/samansmink/following{/other_user}", + "gists_url": "https://api.github.com/users/samansmink/gists{/gist_id}", + "starred_url": "https://api.github.com/users/samansmink/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/samansmink/subscriptions", + "organizations_url": "https://api.github.com/users/samansmink/orgs", + "repos_url": "https://api.github.com/users/samansmink/repos", + "events_url": "https://api.github.com/users/samansmink/events{/privacy}", + "received_events_url": "https://api.github.com/users/samansmink/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-10-13T12:41:43Z", + "updated_at": "2023-01-13T09:50:27Z", + "closed_at": "2023-01-13T09:50:27Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "All my testing seems to indicate that when reading from S3, DuckDB never uses more than 2 threads, regardless of number of processors available. Others have confirmed that this behavior is not expected. So escalating this discussion into an issue. \r\n\r\n\r\n### Discussed in https://github.com/duckdb/duckdb/discussions/4780\r\n\r\n
\r\n\r\nOriginally posted by **CerebralMastication** September 22, 2022\r\nHey all, I've been playing around with DuckDB on Sagemaker reading parquet files on S3. I expected it to be considerably slower than a local file system, of course. However, the performance has been surprisingly slow. Slow enough that I'm wondering if I'm doing something wrong or if possibly DuckDB isn't reading from S3 in parallel threads the way it does on a local file system. \r\n\r\nI have a bucket in S3 with ~7000 parquet files. I'm using only the first 101 of these for my testing. The script that generates the test data can be found [here](https://github.com/CerebralMastication/DataToolTesting/blob/main/simulate_imbalanced_data.ipynb). \r\n\r\nMy test of reading from S3 with duckdb is in [this notebook](https://github.com/CerebralMastication/DataToolTesting/blob/main/query_s3.ipynb). \r\n\r\nThe crux of the test is this simple summation:\r\n\r\n```\r\ndf = con.execute(f\"select sum(value) from parquet_scan({my_parquet_list[0:100]}) ;\").df()\r\n```\r\n\r\nI'm passing `parquet_scan` a list of 101 files which represent approx 165MB and 11.5m records. What's unexpected is that I ran this test using 4 different Sagemaker instance sizes with 2, 4, 8, and 16 vCPU, and they all ran the query in ~12 seconds. (I used `%%timeit` cell magic, so it ran 7 loops, and I reran each test 3 times.) I found some variation in run time, but it was not related to the number of CPUs, which I found unexpected, as I would have expected run time to decrease with an increasing number of vCPUs. \r\n\r\nI'm also a bit surprised that this read took 12 seconds even on a single thread. With my data and compute both located in `eu-west-1` I would have expected to get 100 Gbps throughput. \r\n\r\nIt feels like maybe I'm doing this wrong. Any suggestions you all have would be greatly appreciated. \r\n\r\n\r\nUpdate:\r\nI wanted to compare the DuckDB timing above with simply slurping all the parquet files into Pandas and filtering them. I realize that this is RAM constrained, but I know that `awswrangler` is single-threaded, so that will give me some notion of how fast simply reading the data from S3 on one thread is. So I ran the following:\r\n\r\n```\r\ndf = wr.s3.read_parquet(path=my_parquet_list[0:100], boto3_session=my_session)\r\ndf.value.sum()\r\n```\r\nwhich should move all those same parquet files out of S3, read them into RAM, and then filter. \r\n\r\nIt runs in 8.5 seconds on a worker with 4 vCPU and 16GiB RAM (ml.g4dn.xlarge)\r\n\r\n
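A sketch of one way to confirm the thread setting is actually being picked up before blaming S3 throughput. Not from the reporter's notebook: the bucket and object names below are placeholders, and the httpfs credential settings (`s3_access_key_id`, `s3_secret_access_key`) are omitted.

```python
# Pin the thread count explicitly, read it back, then time the same
# parquet_scan-over-a-list pattern used in the report.
import time
import duckdb

con = duckdb.connect(database=':memory:')
con.execute("INSTALL httpfs; LOAD httpfs;")
con.execute("SET s3_region='eu-west-1';")  # credentials omitted here
con.execute("SET threads TO 16;")
print(con.execute("SELECT current_setting('threads')").fetchone())

# placeholder object names; substitute the real file list
files = [f"s3://my-bucket/part-{i:05d}.parquet" for i in range(101)]
start = time.perf_counter()
df = con.execute(f"select sum(value) from parquet_scan({files})").df()
print(df)
print("elapsed:", round(time.perf_counter() - start, 2), "s")
```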
", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4981/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4981/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4878", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4878/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4878/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4878/events", + "html_url": "https://github.com/duckdb/duckdb/pull/4878", + "id": 1396416227, + "node_id": "PR_kwDOCEU65s5AJIda", + "number": 4878, + "title": "[Compression] CHIMP128 Compression Algorithm", + "user": { + "login": "Tishj", + "id": 17162323, + "node_id": "MDQ6VXNlcjE3MTYyMzIz", + "avatar_url": "https://avatars.githubusercontent.com/u/17162323?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tishj", + "html_url": "https://github.com/Tishj", + "followers_url": "https://api.github.com/users/Tishj/followers", + "following_url": "https://api.github.com/users/Tishj/following{/other_user}", + "gists_url": "https://api.github.com/users/Tishj/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tishj/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tishj/subscriptions", + "organizations_url": "https://api.github.com/users/Tishj/orgs", + "repos_url": "https://api.github.com/users/Tishj/repos", + "events_url": "https://api.github.com/users/Tishj/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tishj/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-10-04T14:51:00Z", + "updated_at": "2022-10-20T13:49:14Z", + "closed_at": "2022-10-20T07:06:38Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/4878", + "html_url": "https://github.com/duckdb/duckdb/pull/4878", + "diff_url": "https://github.com/duckdb/duckdb/pull/4878.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/4878.patch", + "merged_at": "2022-10-20T07:06:37Z" + }, + "body": "This PR adds the Chimp compression algorithm.\r\n\r\n# CHIMP(128)\r\n\r\nChimp is a compression algorithm that can be used to store floating point values (DOUBLE or FLOAT).\r\n\r\nThe algorithm was introduced recently as a competitor to Gorilla, the paper about the algorithm can be found [here](http://pages.cs.aueb.gr/~kotidis/Publications/chimp.pdf)\r\n\r\nIn short:\r\nIt uses a ring buffer to keep track of the past 128 values, to select the best suited reference value.\r\nAfter it has done that, it xors with that reference value.\r\nIt can store this information in 4 different ways.\r\nIt is serialized to disk with a 2 bit flag to indicate how it's stored, followed by the bits relevant to that storage method.\r\n\r\n## Compression time benchmarks\r\n\r\nUncompressed:\r\n```\r\nname run timing\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 1 2.398617\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 2 
2.418939\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 3 2.470288\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 4 2.418947\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 5 3.512568\r\n```\r\nChimp:\r\n```\r\nname run timing\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 1 4.904306\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 2 4.029724\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 3 4.280829\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 4 3.999000\r\nbenchmark/micro/compression/chimp/chimp_store.benchmark 5 4.062333\r\n```\r\n\r\n## Decompression time benchmarks (sequential scan)\r\n\r\nUncompressed:\r\n```\r\nname run timing\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 1 0.020655\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 2 0.021452\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 3 0.019731\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 4 0.019733\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 5 0.020495\r\n```\r\nChimp:\r\n```\r\nname run timing\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 1 0.106700\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 2 0.101608\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 3 0.105511\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 4 0.109822\r\nbenchmark/micro/compression/chimp/chimp_read.benchmark 5 0.098079\r\n```", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4878/reactions", + "total_count": 5, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 4, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4878/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4859", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4859/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4859/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4859/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4859", + "id": 1393221520, + "node_id": "I_kwDOCEU65s5TCt-Q", + "number": 4859, + "title": "Segfault on SELECT from S3 Parquet File", + "user": { + "login": "ngould", + "id": 3130631, + "node_id": "MDQ6VXNlcjMxMzA2MzE=", + "avatar_url": "https://avatars.githubusercontent.com/u/3130631?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/ngould", + "html_url": "https://github.com/ngould", + "followers_url": "https://api.github.com/users/ngould/followers", + "following_url": "https://api.github.com/users/ngould/following{/other_user}", + "gists_url": "https://api.github.com/users/ngould/gists{/gist_id}", + "starred_url": "https://api.github.com/users/ngould/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/ngould/subscriptions", + "organizations_url": "https://api.github.com/users/ngould/orgs", + "repos_url": "https://api.github.com/users/ngould/repos", + "events_url": "https://api.github.com/users/ngould/events{/privacy}", + "received_events_url": "https://api.github.com/users/ngould/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name":
"bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + }, + { + "id": 2202756058, + "node_id": "MDU6TGFiZWwyMjAyNzU2MDU4", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Parquet", + "name": "Parquet", + "color": "e5ca72", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-10-01T04:04:55Z", + "updated_at": "2022-10-20T17:40:15Z", + "closed_at": "2022-10-18T21:32:09Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nCode terminates on a segmentation fault when attempting to query a parquet file stored in AWS S3.\n\n### To Reproduce\n\nExample Python code:\r\n\r\n```\r\ncon = duckdb.connect(database=':memory:')\r\n\r\ncon.execute('''\r\nINSTALL httpfs;\r\nLOAD httpfs;\r\nSET s3_region='us-east-1';\r\nSET s3_access_key_id='';\r\nSET s3_secret_access_key='';\r\n''')\r\n\r\ncon.execute(\"SELECT * FROM read_parquet('s3://my-bucket/my-filename.snappy.parquet')\")\r\nprint(con.fetchall())\r\n```\n\n### OS:\n\nmacOS Big Sur (11.6)\n\n### DuckDB Version:\n\n0.5.1\n\n### DuckDB Client:\n\nPython and CLI\n\n### Full Name:\n\nNathan Gould\n\n### Affiliation:\n\nEndeavor Labs\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4859/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4859/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4853", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4853/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4853/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4853/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4853", + "id": 1392550165, + "node_id": "I_kwDOCEU65s5TAKEV", + "number": 4853, + "title": " Unsupported compression codec \"GZIP\". Supported options are [...], gzip or [...]", + "user": { + "login": "darked89", + "id": 367479, + "node_id": "MDQ6VXNlcjM2NzQ3OQ==", + "avatar_url": "https://avatars.githubusercontent.com/u/367479?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/darked89", + "html_url": "https://github.com/darked89", + "followers_url": "https://api.github.com/users/darked89/followers", + "following_url": "https://api.github.com/users/darked89/following{/other_user}", + "gists_url": "https://api.github.com/users/darked89/gists{/gist_id}", + "starred_url": "https://api.github.com/users/darked89/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/darked89/subscriptions", + "organizations_url": "https://api.github.com/users/darked89/orgs", + "repos_url": "https://api.github.com/users/darked89/repos", + "events_url": "https://api.github.com/users/darked89/events{/privacy}", + "received_events_url": "https://api.github.com/users/darked89/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-09-30T14:42:47Z", + "updated_at": "2022-10-03T16:45:24Z", + "closed_at": "2022-10-03T16:45:24Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nHello,\r\n\r\nI bumped into a rather unexpected problem:\r\n```\r\nCREATE TABLE express AS SELECT * FROM read_parquet('express_data_float32.gzip6.parquet') LIMIT 5;\r\nError: Invalid Error: Unsupported compression codec \"GZIP\". Supported options are uncompressed, gzip or snappy\r\n```\r\n\r\nversion: \r\nsystem: Linux ```5.14.0-4mx-amd64 #1 SMP Debian 5.14.16-1~mx21+1 (2021-11-05) x86_64 GNU/Linux```\r\n\r\nThe parquet was created using Python polars:\r\n\r\n```\r\ndf_trans.write_parquet(\"express_data_float32.gzip6.parquet\", compression=\"gzip\", compression_level=6)\r\n```\r\n\r\nand I can read it back in polars without problems:\r\n```\r\nimport polars as pl\r\ndf = pl.read_parquet(\"express_data_float32.gzip6.parquet\")\r\ndf\r\nOut[3]: \r\nshape: (48803, 1981)\r\n```\r\n\r\n**EDIT**\r\nAlso works in R with arrow:\r\n```\r\nlibrary(arrow)\r\ndf <- read_parquet(\"express_data_float32.gzip6.parquet\")\r\n```\r\n\r\n\r\nI am sorry but I have not compiled the main branch from source to exclude that it has been fixed already. There is a chance that Polars is doing something unortodox. 
\r\n\r\nMany thanks for your help\r\n\r\nDarek Kedra\r\n\r\n\r\n### To Reproduce\r\n\r\n```\r\nCREATE TABLE express AS SELECT * FROM read_parquet('express_data_float32.gzip6.parquet') LIMIT 5;\r\nError: Invalid Error: Unsupported compression codec \"GZIP\". Supported options are uncompressed, gzip or snappy\r\n```\r\n\r\n### OS:\r\n\r\nLinux \r\n\r\n### DuckDB Version:\r\n\r\nv0.5.1 7c111322d\r\n\r\n### DuckDB Client:\r\n\r\nCLI\r\n\r\n### Full Name:\r\n\r\nDarek Kedra\r\n\r\n### Affiliation:\r\n\r\nU.Salamanca\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4853/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4853/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4826", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4826/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4826/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4826/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4826", + "id": 1389339334, + "node_id": "I_kwDOCEU65s5Sz6LG", + "number": 4826, + "title": "[capi] How to register arrow data and query on it?", + "user": { + "login": "wangfenjin", + "id": 4160646, + "node_id": "MDQ6VXNlcjQxNjA2NDY=", + "avatar_url": "https://avatars.githubusercontent.com/u/4160646?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/wangfenjin", + "html_url": "https://github.com/wangfenjin", + "followers_url": "https://api.github.com/users/wangfenjin/followers", + "following_url": "https://api.github.com/users/wangfenjin/following{/other_user}", + "gists_url": "https://api.github.com/users/wangfenjin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/wangfenjin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/wangfenjin/subscriptions", + "organizations_url": "https://api.github.com/users/wangfenjin/orgs", + "repos_url": "https://api.github.com/users/wangfenjin/repos", + "events_url": "https://api.github.com/users/wangfenjin/events{/privacy}", + "received_events_url": "https://api.github.com/users/wangfenjin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-09-28T13:13:20Z", + "updated_at": "2022-10-15T05:45:10Z", + "closed_at": null, + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "Hi, seems we don't have API to register arrow data as table, similar to the [python example](https://github.com/duckdb/duckdb/blob/0d13deb759dd993ce037d37d79cc419c725553cd/tools/pythonpkg/tests/fast/arrow/test_dataset.py#L73)\r\n\r\n```python\r\n conn.register(\"pqdata\",pqdata)\r\n```\r\n\r\nMay I know do we have any plan on that? 
thanks.\r\n\r\nrelated to https://github.com/wangfenjin/duckdb-rs/issues/79\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4826/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4826/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4756", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4756/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4756/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4756/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4756", + "id": 1378120289, + "node_id": "I_kwDOCEU65s5SJHJh", + "number": 4756, + "title": "[Fuzzer] C/C++ API missing backslash interpretation", + "user": { + "login": "PedroTadim", + "id": 8604310, + "node_id": "MDQ6VXNlcjg2MDQzMTA=", + "avatar_url": "https://avatars.githubusercontent.com/u/8604310?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/PedroTadim", + "html_url": "https://github.com/PedroTadim", + "followers_url": "https://api.github.com/users/PedroTadim/followers", + "following_url": "https://api.github.com/users/PedroTadim/following{/other_user}", + "gists_url": "https://api.github.com/users/PedroTadim/gists{/gist_id}", + "starred_url": "https://api.github.com/users/PedroTadim/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/PedroTadim/subscriptions", + "organizations_url": "https://api.github.com/users/PedroTadim/orgs", + "repos_url": "https://api.github.com/users/PedroTadim/repos", + "events_url": "https://api.github.com/users/PedroTadim/events{/privacy}", + "received_events_url": "https://api.github.com/users/PedroTadim/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-09-19T15:14:16Z", + "updated_at": "2022-10-13T08:04:22Z", + "closed_at": "2022-10-13T08:04:22Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nIn the C++ API run the following:\r\n\r\n```\r\nDuckDB db(nullptr);\r\nConnection con(db);\r\ncon.EnableQueryVerification();\r\ncon.SendQuery(\"SELECT BLOB '\\x27';\");\r\n```\r\nIt will give the error: Parser Error: unterminated quoted string at or near \"'''::BLOB\". \r\n\r\nIf I do this on the shell or without query verification, the statement is successful. I think there's something in the C/C++ API.\r\n\r\n### To Reproduce\r\n\r\nRun the statements above.\r\n\r\n### OS:\r\n\r\nLinux\r\n\r\n### DuckDB Version:\r\n\r\nlatest from sources\r\n\r\n### DuckDB Client:\r\n\r\nC/C++ API\r\n\r\n### Full Name:\r\n\r\nPedro Ferreira\r\n\r\n### Affiliation:\r\n\r\nHuawei\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4756/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4756/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4606", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4606/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4606/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4606/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4606", + "id": 1361548518, + "node_id": "I_kwDOCEU65s5RJ5Tm", + "number": 4606, + "title": "Error: Invalid Error: Unsupported compression codec \"ZSTD\". Supported options are uncompressed, gzip or snappy", + "user": { + "login": "vbmithr", + "id": 797581, + "node_id": "MDQ6VXNlcjc5NzU4MQ==", + "avatar_url": "https://avatars.githubusercontent.com/u/797581?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/vbmithr", + "html_url": "https://github.com/vbmithr", + "followers_url": "https://api.github.com/users/vbmithr/followers", + "following_url": "https://api.github.com/users/vbmithr/following{/other_user}", + "gists_url": "https://api.github.com/users/vbmithr/gists{/gist_id}", + "starred_url": "https://api.github.com/users/vbmithr/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/vbmithr/subscriptions", + "organizations_url": "https://api.github.com/users/vbmithr/orgs", + "repos_url": "https://api.github.com/users/vbmithr/repos", + "events_url": "https://api.github.com/users/vbmithr/events{/privacy}", + "received_events_url": "https://api.github.com/users/vbmithr/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-09-05T07:56:00Z", + "updated_at": "2022-11-23T10:10:03Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nCan't read a ZSTD compressed parquet file.\n\n### To Reproduce\n\n```D select * from read_parquet('/home/vb/code/dm/FTX/2022-04-29_08-26-33.146732/2022-04-29_08-26-33.146732/BTC_USD.trade.parquet') limit 10;\r\nError: Invalid Error: Unsupported compression codec \"ZSTD\". Supported options are uncompressed, gzip or snappy\r\n```\r\n[BTC_USD.trade.parquet.gz](https://github.com/duckdb/duckdb/files/9487821/BTC_USD.trade.parquet.gz)\r\n\n\n### OS:\n\nLinux\n\n### DuckDB Version:\n\nmaster\n\n### DuckDB Client:\n\nCLI\n\n### Full Name:\n\nVincent Bernardoff\n\n### Affiliation:\n\nDeepmarker\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4606/reactions", + "total_count": 2, + "+1": 2, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4606/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4529", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4529/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4529/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4529/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4529", + "id": 1354713940, + "node_id": "I_kwDOCEU65s5Qv0tU", + "number": 4529, + "title": "JDBC 0.4.0 fails with `GLIBC_2.23' not found` on Centos 7", + "user": { + "login": "karayv", + "id": 2749230, + "node_id": "MDQ6VXNlcjI3NDkyMzA=", + "avatar_url": "https://avatars.githubusercontent.com/u/2749230?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/karayv", + "html_url": "https://github.com/karayv", + "followers_url": "https://api.github.com/users/karayv/followers", + "following_url": "https://api.github.com/users/karayv/following{/other_user}", + "gists_url": "https://api.github.com/users/karayv/gists{/gist_id}", + "starred_url": "https://api.github.com/users/karayv/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/karayv/subscriptions", + "organizations_url": "https://api.github.com/users/karayv/orgs", + "repos_url": "https://api.github.com/users/karayv/repos", + "events_url": "https://api.github.com/users/karayv/events{/privacy}", + "received_events_url": "https://api.github.com/users/karayv/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-29T18:29:52Z", + "updated_at": "2022-10-28T01:29:31Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nI'm getting an error while trying to run integration tests with DuckDB:\r\n```\r\n08:56:01 \u001B[31m Cause: java.lang.UnsatisfiedLinkError: /tmp/libduckdb_java251203987240672711.so: /lib64/libm.so.6: version `GLIBC_2.23' not found (required by /tmp/libduckdb_java251203987240672711.so)\u001B[0m\r\n08:56:01 \u001B[31m at java.lang.ClassLoader$NativeLibrary.load(Native Method)\u001B[0m\r\n08:56:01 \u001B[31m at java.lang.ClassLoader.loadLibrary0(ClassLoader.java:1950)\u001B[0m\r\n08:56:01 \u001B[31m at java.lang.ClassLoader.loadLibrary(ClassLoader.java:1832)\u001B[0m\r\n08:56:01 \u001B[31m at java.lang.Runtime.load0(Runtime.java:811)\u001B[0m\r\n08:56:01 \u001B[31m at java.lang.System.load(System.java:1088)\u001B[0m\r\n08:56:01 \u001B[31m at org.duckdb.DuckDBNative.(DuckDBNative.java:44)\u001B[0m\r\n08:56:01 \u001B[31m at org.duckdb.DuckDBDatabase.(DuckDBDatabase.java:22)\u001B[0m\r\n08:56:01 \u001B[31m at org.duckdb.DuckDBDriver.connect(DuckDBDriver.java:35)\u001B[0m\r\n08:56:01 \u001B[31m at java.sql.DriverManager.getConnection(DriverManager.java:664)\u001B[0m\r\n08:56:01 \u001B[31m at java.sql.DriverManager.getConnection(DriverManager.java:270)\u001B[0m\r\n```\r\n\r\nCentos 7 is very 
popular, in fact I'm struggling to find a replacement image. Centos 7 has glibc version 2.17. Please suggest a workaround or fix. \n\n### To Reproduce\n\nImport `org.duckdb:duckdb_jdbc:0.4.0` to a scala project. \r\nUsing the scala code create in-memory DB as described in the DuckDB documentation.\r\nCreate a table. Perform insert, select and delete. \r\n\r\nI'm sure this can be reproduced using java as well.\n\n### OS:\n\ncentos-release-7-9.2009.1.el7.centos.x86_64\n\n### DuckDB Version:\n\n0.4.0\n\n### DuckDB Client:\n\norg.duckdb:duckdb_jdbc:0.4.0\n\n### Full Name:\n\nAndrii Karaivanskyi\n\n### Affiliation:\n\nIndeed\n\n### Have you tried this on the latest `master` branch?\n\n- [ ] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4529/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4529/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4526", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4526/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4526/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4526/events", + "html_url": "https://github.com/duckdb/duckdb/pull/4526", + "id": 1354245717, + "node_id": "PR_kwDOCEU65s49970T", + "number": 4526, + "title": "[C-API] Decimal casting to other type fixes", + "user": { + "login": "Tishj", + "id": 17162323, + "node_id": "MDQ6VXNlcjE3MTYyMzIz", + "avatar_url": "https://avatars.githubusercontent.com/u/17162323?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Tishj", + "html_url": "https://github.com/Tishj", + "followers_url": "https://api.github.com/users/Tishj/followers", + "following_url": "https://api.github.com/users/Tishj/following{/other_user}", + "gists_url": "https://api.github.com/users/Tishj/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Tishj/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Tishj/subscriptions", + "organizations_url": "https://api.github.com/users/Tishj/orgs", + "repos_url": "https://api.github.com/users/Tishj/repos", + "events_url": "https://api.github.com/users/Tishj/events{/privacy}", + "received_events_url": "https://api.github.com/users/Tishj/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-29T12:55:09Z", + "updated_at": "2022-11-03T10:21:54Z", + "closed_at": "2022-11-03T10:21:51Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/4526", + "html_url": "https://github.com/duckdb/duckdb/pull/4526", + "diff_url": "https://github.com/duckdb/duckdb/pull/4526.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/4526.patch", + "merged_at": "2022-11-03T10:21:51Z" + }, + "body": "This PR fixes #4506 \r\n\r\nIn the C-API Decimals are not being treated as the correct 
internal type, and the `width` and `scale` of the decimal is not taken into account at all.\r\n\r\nBridges/adapters are built+tested for the following cast operations:\r\n\r\n## Cast from DECIMAL\r\n\r\n- [x] DECIMAL -> DOUBLE\r\n- [x] DECIMAL -> FLOAT\r\n- [x] DECIMAL -> VARCHAR\r\n- [x] DECIMAL -> HUGEINT\r\n- [x] DECIMAL -> BIGINT\r\n- [x] DECIMAL -> UBIGINT\r\n- [x] DECIMAL -> INTEGER\r\n- [x] DECIMAL -> UINTEGER\r\n- [x] DECIMAL -> SMALLINT\r\n- [x] DECIMAL -> USMALLINT\r\n- [x] DECIMAL -> TINYINT\r\n- [x] DECIMAL -> UTINYINT\r\n\r\n## Cast to DECIMAL\r\n\r\n- [x] DOUBLE -> DECIMAL\r\n- [x] FLOAT -> DECIMAL\r\n- [x] VARCHAR -> DECIMAL\r\n- [x] HUGEINT -> DECIMAL\r\n- [x] BIGINT -> DECIMAL\r\n- [x] UBIGINT -> DECIMAL\r\n- [x] INTEGER -> DECIMAL\r\n- [x] UINTEGER -> DECIMAL\r\n- [x] SMALLINT -> DECIMAL\r\n- [x] USMALLINT -> DECIMAL\r\n- [x] TINYINT -> DECIMAL\r\n- [x] UTINYINT -> DECIMAL\r\n\r\n## Still missing\r\n\r\nDECIMAL -> DECIMAL", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4526/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4526/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4406", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4406/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4406/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4406/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4406", + "id": 1339403328, + "node_id": "I_kwDOCEU65s5P1axA", + "number": 4406, + "title": "DuckDB crash: uncaught `TransactionException: Cannot create index with outstanding updates`", + "user": { + "login": "aarashy", + "id": 12239115, + "node_id": "MDQ6VXNlcjEyMjM5MTE1", + "avatar_url": "https://avatars.githubusercontent.com/u/12239115?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/aarashy", + "html_url": "https://github.com/aarashy", + "followers_url": "https://api.github.com/users/aarashy/followers", + "following_url": "https://api.github.com/users/aarashy/following{/other_user}", + "gists_url": "https://api.github.com/users/aarashy/gists{/gist_id}", + "starred_url": "https://api.github.com/users/aarashy/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/aarashy/subscriptions", + "organizations_url": "https://api.github.com/users/aarashy/orgs", + "repos_url": "https://api.github.com/users/aarashy/repos", + "events_url": "https://api.github.com/users/aarashy/events{/privacy}", + "received_events_url": "https://api.github.com/users/aarashy/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-15T19:36:54Z", + "updated_at": "2022-08-17T05:47:50Z", + "closed_at": "2022-08-17T05:47:50Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nReopening the following issue because I found a consistent repro on the latest DuckDB version: https://github.com/duckdb/duckdb/issues/3666\r\n\r\nI found a series of SQL statements which consistently causes DuckDB files to go into a 
corrupted state. Committing the transaction crashes the process. Any attempt to open a DB file after running these commands also crashes the process.\r\n\r\n### To Reproduce\r\n\r\n```\r\nD create table test (id bigint primary key, c1 text);\r\nD insert into test (id, c1) values (1, 'foo');\r\nD insert into test (id, c1) values (2, 'bar');\r\nD begin transaction;\r\nD delete from test where id = 1;\r\nD update test set c1='baz' where id=2;\r\nD commit;\r\nlibc++abi: terminating with uncaught exception of type duckdb::TransactionException: TransactionContext Error: Cannot create index with outstanding updates\r\nAbort trap: 6\r\nlogout\r\n```\r\n\r\n### OS:\r\n\r\niOS / docker\r\n\r\n### DuckDB Version:\r\n\r\n0.4.0\r\n\r\n### DuckDB Client:\r\n\r\nRust + CLI\r\n\r\n### Full Name:\r\n\r\nAarash Heydari\r\n\r\n### Affiliation:\r\n\r\nDataland\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4406/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4406/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4366", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4366/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4366/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4366/events", + "html_url": "https://github.com/duckdb/duckdb/pull/4366", + "id": 1336934474, + "node_id": "PR_kwDOCEU65s49E8iN", + "number": 4366, + "title": "FSST compression", + "user": { + "login": "samansmink", + "id": 2925274, + "node_id": "MDQ6VXNlcjI5MjUyNzQ=", + "avatar_url": "https://avatars.githubusercontent.com/u/2925274?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/samansmink", + "html_url": "https://github.com/samansmink", + "followers_url": "https://api.github.com/users/samansmink/followers", + "following_url": "https://api.github.com/users/samansmink/following{/other_user}", + "gists_url": "https://api.github.com/users/samansmink/gists{/gist_id}", + "starred_url": "https://api.github.com/users/samansmink/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/samansmink/subscriptions", + "organizations_url": "https://api.github.com/users/samansmink/orgs", + "repos_url": "https://api.github.com/users/samansmink/repos", + "events_url": "https://api.github.com/users/samansmink/events{/privacy}", + "received_events_url": "https://api.github.com/users/samansmink/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-12T08:53:43Z", + "updated_at": "2022-10-03T13:35:04Z", + "closed_at": "2022-10-03T13:35:00Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/4366", + "html_url": 
"https://github.com/duckdb/duckdb/pull/4366", + "diff_url": "https://github.com/duckdb/duckdb/pull/4366.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/4366.patch", + "merged_at": "2022-10-03T13:35:00Z" + }, + "body": "## PR\r\nThis PR adds a new compression method to duckdb, called FSST. In a nutshell, FSST is similar to dictionary compression, except instead of storing entire strings in a dictionary, a lookup table is used to store common substrings. For more details, check out the original paper and the source code in the [repo](https://github.com/cwida/fsst). FSST provides performance similar or better than LZ4, with the added benefit of fine-grained access to the compressed data.\r\n\r\n## Base Implementation\r\nFSST is implemented with a combination of delta encoding and bitpacking for compressing the dictionary offsets. For the compression analyze step, we randomly sample 25% of the vectors that of the row group and fully compress it to determine the compressed size. Compression reuses the FSST encoder that is generated during the analysis step to compress all the strings. During a scan, we cache the dictionary offset of the last decoded row to speed up the delta decoding in sequential scans. Note that similar to dictionary compression, a minimum compression ratio of 1.2 is required for FSST to be selected by the checkpointer to prevent unnecessary overhead for poorly compressible data. \r\n\r\n## Late decompression\r\nThis PR also includes a new vector type `VectorType::FSST_VECTOR` that allows for late decompression of FSST strings. Late decompression can improve performance as some of the data may be filtered out and does not need to be decompressed at all. Additionally, it opens the door to compressed execution, where operators are implemented to directly operate on the the compressed data without needing to decompress at all. Note that currently, emitting fsst vectors is disabled, but can be enabled with `SET enable_fsst_vectors=true`. The reason for this is that it currently has a higher overhead and we're not really using the benefits of it yet.\r\n\r\n## SIMD\r\nCurrently the simd implementation of FSST that uses AVX512 intrinsics is disabled, to experiment with this, there's a flag in `third_party/fsst/CMakeLists.txt` that can be set to enable it, note that this is currently untested in duckdb. \r\n\r\n## Next steps\r\nOptimize memory usage of analysis step. Currently when a string column is analyzed in by the ColumnDataCheckpointer, the strings are stored separately by both dictionary compression and FSST. It would be nice to be able to share the string data during analysis.\r\n\r\nExperiment with compressed execution. For example, a constant filter on an FSST encrypted column could be applied by encrypting the constant with the same symbol table instead of decrypting the column. This has two benefits: the comparison itself is sped up by operating on smaller strings, and also less data needs to be decrypted overall.\r\n\r\nSwitch to a single symbol table per row group. Currently the FSST symbol table is stored once per compressed segment, as this is easier to implement. This does come at an overhead of a few percent, so we could switch to storing it once per row group. 
This is probably also useful for implementing compressed execution as that will require determining which symbol table is used.\r\n \r\n## Results\r\nAll benchmarks run on m5.xlarge.\r\n\r\n### Compression\r\n**TPCH SF1**\r\nThis benchmark shows the total database size on disk with different combinations of string compression functions enabled. Note that in this benchmark we only change the string compression functions, all fixed size datatypes remain compressed with the default compression schemes (bitpacking/rle).\r\n\r\ncompression | storage size\r\n-- | --\r\nno string compression | 761M\r\ndictionary compression | 510M\r\nfsst and dictionary | 251M\r\n\r\nAs expected, fsst adds a big improvement to the tpch storage size. This is expected as with fsst, we can compress columns such as l_comment and c_name very well. For example we compress l_comment with about 3x compression ratio, which matches the results reported in the FSST paper closely.\r\n\r\n### Microbenchmarks\r\n In this benchmark we compare fsst both with and without late decompression. A big advantage of using FSST is compression and decompressed speed, however, FSST does add some overhead. Especially compared to dictionary compression, which is often faster than a normal scan in duckdb.\r\n\r\nThe regular read/store benchmarks aim to have a \"realistic\" compression ratios based on the compression ratios found in the fsst paper. The `_worst_case` benchmarks have uncompressible string data. The `late_decompression` benchmark contains a filter with a selectivity of 10% on a different column, demonstrating the effect of late decompression.\r\n\r\nbenchmark | baseline | dict | fsst | fsst_late_decomp | dict_diff | fsst_diff | fsst_late_decomp_diff\r\n -- | -- | -- | -- | -- | -- | -- | --\r\nbenchmark/micro/compression/fsst/fsst_late_decompression.benchmark | 0.63 | 0.31 | 0.73 | 0.70 | -50% | 15% | 10%\r\nbenchmark/micro/compression/fsst/fsst_read.benchmark | 0.88 | 0.51 | 0.96 | 1.22 | -42% | 9% | 38%\r\nbenchmark/micro/compression/fsst/fsst_read_worst_case.benchmark | 0.42 | 0.43 | 0.79 | 0.98 | 2% | 88% | 133%\r\nbenchmark/micro/compression/fsst/fsst_store.benchmark | 0.60 | 0.76 | 0.69 | 0.67 | 26% | 15% | 12%\r\nbenchmark/micro/compression/fsst/fsst_store_worst_case.benchmark | 1.11 | 1.77 | 1.30 | 1.19 | 58% | 16% | 7%\r\nbenchmark/micro/compression/store_tpch_sf1.benchmark | 25.53 | 26.50 | 27.12 | 27.12 | 4% | 6% | 6%\r\n\r\n\r\nBased on these benchmarks, we see that fsst decompression does come at some performance overhead, especially at low compression ratios. We could consider setting the minimum_compression ratio a bit higher based on these numbers.\r\n\r\nNext up, a benchmark that measures how long writing and checkpointing takes for tpch sf1:\r\n\r\nbenchmark | no string compression | only dict | dict and fsst | dict_diff | dict_fsst_diff\r\n -- | -- | -- | -- | -- | --\r\nbenchmark/micro/compression/store_tpch_sf1.benchmark | 25.53 | 26.50 | 27.12 | 4% | 6%\r\n\r\n### TPCH SF1\r\nNext, we run tpch on a persistent db to see how the overhead from fsst translates into more realistic queries. All queries where no significant difference was measured have been discarded. 
These overheads seem pretty reasonable for the achieved compression.\r\n\r\nbenchmark | baseline_without_fsst | fsst | fsst_late_decomp | fsst_diff | fsst_late_decomp_diff\r\n -- | -- | -- | -- | -- | --\r\nq10 | 0.15 | 0.17 | 0.18 | 8% | 17%\r\nq13 | 0.13 | 0.17 | 0.22 | 29% | 66%\r\nq17 | 0.20 | 0.22 | 0.22 | 9% | 7%\r\nq22 | 0.05 | 0.08 | 0.08 | 50% | 57%", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4366/reactions", + "total_count": 12, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 11, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4366/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4320", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4320/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4320/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4320/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4320", + "id": 1331472036, + "node_id": "I_kwDOCEU65s5PXKak", + "number": 4320, + "title": "Support UNION BY NAME", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002067, + "node_id": "MDU6TGFiZWw5NzYwMDIwNjc=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/good%20first%20issue", + "name": "good first issue", + "color": "7057ff", + "default": true, + "description": "Good for newcomers" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "lokax", + "id": 57343445, + "node_id": "MDQ6VXNlcjU3MzQzNDQ1", + "avatar_url": "https://avatars.githubusercontent.com/u/57343445?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/lokax", + "html_url": "https://github.com/lokax", + "followers_url": "https://api.github.com/users/lokax/followers", + "following_url": "https://api.github.com/users/lokax/following{/other_user}", + "gists_url": "https://api.github.com/users/lokax/gists{/gist_id}", + "starred_url": "https://api.github.com/users/lokax/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/lokax/subscriptions", + "organizations_url": "https://api.github.com/users/lokax/orgs", + "repos_url": "https://api.github.com/users/lokax/repos", + "events_url": "https://api.github.com/users/lokax/events{/privacy}", + "received_events_url": "https://api.github.com/users/lokax/received_events", + "type": "User", + 
"site_admin": false + }, + "assignees": [ + { + "login": "lokax", + "id": 57343445, + "node_id": "MDQ6VXNlcjU3MzQzNDQ1", + "avatar_url": "https://avatars.githubusercontent.com/u/57343445?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/lokax", + "html_url": "https://github.com/lokax", + "followers_url": "https://api.github.com/users/lokax/followers", + "following_url": "https://api.github.com/users/lokax/following{/other_user}", + "gists_url": "https://api.github.com/users/lokax/gists{/gist_id}", + "starred_url": "https://api.github.com/users/lokax/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/lokax/subscriptions", + "organizations_url": "https://api.github.com/users/lokax/orgs", + "repos_url": "https://api.github.com/users/lokax/repos", + "events_url": "https://api.github.com/users/lokax/events{/privacy}", + "received_events_url": "https://api.github.com/users/lokax/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-08T07:48:40Z", + "updated_at": "2022-09-02T09:19:52Z", + "closed_at": "2022-09-02T09:19:51Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "See #4308\r\n\r\nWe should support the `UNION BY NAME` operator to union together tables with re-ordered and potentially new columns. This operator is useful for unifying data from an evolving schema, where new columns have been added over time.\r\n\r\nExample usage:\r\n\r\n```sql\r\ncreate table integers(i integer, j integer);\r\ncreate table strings(j integer, k varchar);\r\ninsert into integers values (1, 10);\r\ninsert into strings values (20, 'hello world');\r\nSELECT i, j FROM integers UNION BY NAME SELECT j, k FROM strings;\r\n┌──────┬────┬─────────────┐\r\n│ i │ j │ k │\r\n├──────┼────┼─────────────┤\r\n│ 1 │ 10 │ NULL │\r\n│ NULL │ 20 │ hello world │\r\n└──────┴────┴─────────────┘\r\n\r\n-- equivalent to \r\nSELECT i, j, NULL AS k FROM integers UNION ALL SELECT NULL as i, j, k FROM strings;\r\n```\r\n\r\nInternally this should be a simple rewrite in `bind_setop_node.cpp` that pushes a projection to re-order columns, together with inserting constant `NULLs` were required with the appropriate names.\r\n\r\nSee also [here](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.unionByName.html) for how Spark supports this operation.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4320/reactions", + "total_count": 2, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 2, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4320/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4314", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4314/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4314/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4314/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4314", + "id": 1331090544, + "node_id": "I_kwDOCEU65s5PVtRw", + "number": 4314, + "title": "Python client: parameters in group by result in an error", + "user": { + "login": "roveo", + "id": 5427958, + "node_id": "MDQ6VXNlcjU0Mjc5NTg=", + "avatar_url": 
"https://avatars.githubusercontent.com/u/5427958?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/roveo", + "html_url": "https://github.com/roveo", + "followers_url": "https://api.github.com/users/roveo/followers", + "following_url": "https://api.github.com/users/roveo/following{/other_user}", + "gists_url": "https://api.github.com/users/roveo/gists{/gist_id}", + "starred_url": "https://api.github.com/users/roveo/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/roveo/subscriptions", + "organizations_url": "https://api.github.com/users/roveo/orgs", + "repos_url": "https://api.github.com/users/roveo/repos", + "events_url": "https://api.github.com/users/roveo/events{/privacy}", + "received_events_url": "https://api.github.com/users/roveo/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-07T18:35:59Z", + "updated_at": "2022-08-22T09:10:05Z", + "closed_at": "2022-08-22T09:10:00Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nWhen using `?`-parameters in e.g. `date_part` part argument, group by query fails\r\n\r\n### To Reproduce\r\n\r\n```py\r\nimport duckdb\r\n\r\nconn = duckdb.connect(\":memory:\")\r\nconn.execute(\"create table test (dt date); insert into test values ('2022-01-01');\");\r\nconn.execute(\"select date_part(?, dt), count(*) from test group by date_part(?, dt)\", ('year', 'year')).fetchall()\r\n```\r\n\r\n```\r\nInvalidInputException: Invalid Input Error: Attempting to execute an unsuccessful or closed pending query result\r\nError: Binder Error: column \"dt\" must appear in the GROUP BY clause or be used in an aggregate function\r\n```\r\n\r\n### OS:\r\n\r\nMacOS\r\n\r\n### DuckDB Version:\r\n\r\n0.4.1.dev1300\r\n\r\n### DuckDB Client:\r\n\r\nPython\r\n\r\n### Full Name:\r\n\r\nMikhail Akimov\r\n\r\n### Affiliation:\r\n\r\nhttps://github.com/discover-labs\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4314/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4314/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4302", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4302/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4302/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4302/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4302", + "id": 1330008779, + "node_id": "I_kwDOCEU65s5PRlLL", + "number": 4302, + "title": "Support Tagged UNION Type", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-05T14:25:11Z", + "updated_at": "2022-08-17T14:20:18Z", + "closed_at": null, + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "We want to add support for tagged union types to DuckDB. The tagged union would allow multiple types to co-exist within the same column, similar to the way a C-style union would work. Example syntax might be:\r\n\r\n```sql\r\nCREATE TABLE tbl(u UNION(i INTEGER, v VARCHAR));\r\nINSERT INTO tbl VALUES (1);\r\nINSERT INTO tbl VALUES ('hello');\r\nSELECT * FROM tbl;\r\n```\r\n\r\nInternally union vectors should be represented as two vectors:\r\n\r\n* A `tag` vector, which is a `uint8_t` (or `uint16_t`) vector\r\n* A `data` vector, that holds the primary data of the elements union'd together in a C-like manner (i.e. each row in the data array has the maximum length of the child union elements, so in the union above the size of each row will be 16-bytes, the size of `string_t`).\r\n\r\nThere should be operations that access the individual union elements similar to a struct. The entries that have a different tag should return `NULL` instead. 
We should also be able to query the tag of the union, e.g.:\r\n\r\n```sql\r\nSELECT * FROM tbl;\r\n-- 1, hello\r\nSELECT u.i FROM tbl;\r\n-- 1, NULL\r\nSELECT u.v FROM tbl;\r\n-- NULL, v\r\nSELECT u.tag FROM tbl;\r\n-- 0, 1\r\n```\r\n\r\nUnions of nested types are more challenging. In this case, we would want to store child vectors for each of the nested types. For example:\r\n\r\n```sql\r\nCREATE TABLE tbl(u UNION(l INTEGER[], s STRUCT(x INT, y INT)));\r\nINSERT INTO tbl VALUES ([1, 2, 3, 4]);\r\nINSERT INTO tbl VALUES ({'x': 42, 'y': 84});\r\n-- union vector has three child vectors, one for the list and two for the struct\r\n```\r\n\r\nWe can choose to either (1) leave empty spots for the child vectors for which the tag does not apply, or (2) [likely more efficient] only add elements to the union vectors for which the tag applies. Option two saves on storage, particularly if only a single tag is used, but complicates retrieval of individual tagged elements somewhat.\r\n\r\nOptimizations should be applied that optimize unions for which only a single tag is used, both in storage and in vectors themselves. Ideally the performance of a union that only uses one of its types should be the same as using that type directly.\r\n\r\nCC @Maxxen", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4302/reactions", + "total_count": 1, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 1, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4302/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4295", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4295/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4295/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4295/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4295", + "id": 1329150391, + "node_id": "I_kwDOCEU65s5POTm3", + "number": 4295, + "title": "When reading parquet, filter using IN () on two values much slower than equals on single value", + "user": { + "login": "mauropagano", + "id": 9085907, + "node_id": "MDQ6VXNlcjkwODU5MDc=", + "avatar_url": "https://avatars.githubusercontent.com/u/9085907?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/mauropagano", + "html_url": "https://github.com/mauropagano", + "followers_url": "https://api.github.com/users/mauropagano/followers", + "following_url": "https://api.github.com/users/mauropagano/following{/other_user}", + "gists_url": "https://api.github.com/users/mauropagano/gists{/gist_id}", + "starred_url": "https://api.github.com/users/mauropagano/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/mauropagano/subscriptions", + "organizations_url": "https://api.github.com/users/mauropagano/orgs", + "repos_url": "https://api.github.com/users/mauropagano/repos", + "events_url": "https://api.github.com/users/mauropagano/events{/privacy}", + "received_events_url": "https://api.github.com/users/mauropagano/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-08-04T20:37:06Z", + "updated_at": "2022-11-19T09:16:34Z", + "closed_at": null, + 
"author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "### What happens?\r\n\r\nWhen reading a parquet dataset (no specific partitioning, so no pruning) via `parquet_scan()` , applying a filter via `column in (value1, value2)` is much slower than a single value via `column = value`. \r\nSignificantly slower than running the query twice (once per distinct value)\r\n\r\nThere is also some inconsistent behavior between `duckdb.query()` and `conn = duckdb.connect; conn.execute()` where the former goes parallel when using an `IN` clause while the latter does not. Adding `order by` to the latter enables parallelism back.\r\n\r\nRunning parallel or doing something like `create temp table as ..` provides good performance back, at the cost of a serious increase in machine resource though (e.g. the testcase runs in 1s for a single value on a single core, 1.5s with 2 values using 24 cores, would take ~2 sec to run the query twice on a single core)\r\n\r\nTo be clear, I'm raising this for the much slower perf of `=` vs `IN ()`, not `duckdb.query` vs `conn.execute` \r\n\r\n### To Reproduce\r\n\r\nThis testcase produces ~1.5GB of data, with larger directories the effect is magnified approx linearly\r\n\r\n```\r\nimport numpy as np\r\nimport pandas as pd\r\nimport duckdb\r\n\r\nconn = duckdb.connect()\r\na = np.random.randint(0, 100, (100_000_000, 10))\r\ndf = pd.DataFrame(a, columns=[f\"c{i}\" for i in range(1,11)])\r\n# manually partitioning, example could be high cardinality partitioning column that has multiple values in the same slice\r\n# (or a dataset that is consistently appended to)\r\nfor i in range(10, 101, 10):\r\n df.query(f\"c1 >= {i-10} and c1 <{i}\").to_parquet(f\"/tmp/pq/{i}.parquet\")\r\n\r\n# reading a single value the performance is about the same and does need / use parallism\r\n%timeit -n3 -r2 duckdb.query(\"select * from parquet_scan('/tmp/pq/*.parquet') where c1 = 2\").df()\r\n%timeit -n3 -r2 conn.execute(\"select * from parquet_scan('/tmp/pq/*.parquet') where c1 = 2\").df()\r\n\r\n# with in() the performance is drastically different, even when running fast (duckdb.query) it takes a lot more resources\r\n%timeit -n3 -r2 duckdb.query(\"select * from parquet_scan('/tmp/pq/*.parquet') where c1 in (2, 25)\").df()\r\n%timeit -n3 -r2 conn.execute(\"select * from parquet_scan('/tmp/pq/*.parquet') where c1 in (2, 25)\").df()\r\n\r\n# adding order by allows parallelism in conn.execute() and it's faster but takes the whole machine\r\n%timeit -n3 -r2 conn.execute(\"select * from parquet_scan('/tmp/pq/*.parquet') where c1 in (2, 25) order by c1\").df()\r\n\r\n# the mismatch does NOT reproduce creating a temp table, but it does go parallel and ends up using the whole machine\r\n%timeit -n1 -r1 duckdb.query(\"create temp table t_duckdb as select * from parquet_scan('/tmp/pq/*.parquet') where c1 in (2, 25)\")\r\n%timeit -n1 -r1 conn.execute(\"create temp table t_conn as select * from parquet_scan('/tmp/pq/*.parquet') where c1 in (2, 25)\")\r\n```\r\n\r\n### OS:\r\n\r\nLinux\r\n\r\n### DuckDB Version:\r\n\r\n0.4.1-dev1175\r\n\r\n### DuckDB Client:\r\n\r\nPython\r\n\r\n### Full Name:\r\n\r\nMauro Pagano\r\n\r\n### Affiliation:\r\n\r\nIndependent consultant\r\n\r\n### Have you tried this on the latest `master` branch?\r\n\r\n- [X] I agree\r\n\r\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\r\n\r\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4295/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4295/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4200", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4200/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4200/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4200/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4200", + "id": 1317074429, + "node_id": "I_kwDOCEU65s5OgPX9", + "number": 4200, + "title": "CSV Auto Import Incorrect Behavior", + "user": { + "login": "chrisfw", + "id": 6135741, + "node_id": "MDQ6VXNlcjYxMzU3NDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6135741?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/chrisfw", + "html_url": "https://github.com/chrisfw", + "followers_url": "https://api.github.com/users/chrisfw/followers", + "following_url": "https://api.github.com/users/chrisfw/following{/other_user}", + "gists_url": "https://api.github.com/users/chrisfw/gists{/gist_id}", + "starred_url": "https://api.github.com/users/chrisfw/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/chrisfw/subscriptions", + "organizations_url": "https://api.github.com/users/chrisfw/orgs", + "repos_url": "https://api.github.com/users/chrisfw/repos", + "events_url": "https://api.github.com/users/chrisfw/events{/privacy}", + "received_events_url": "https://api.github.com/users/chrisfw/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-07-25T16:20:49Z", + "updated_at": "2022-08-22T11:29:29Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "### What happens?\n\nThe CSV auto import functionality does not seem to work correctly when there is a record that has embedded quotes and the record position is at some kind of internally defined boundary.\r\n\r\n[mytest_fails.csv](https://github.com/duckdb/duckdb/files/9182839/mytest_fails.csv)\r\n[mytest_succeeds_but_incorrect_columns.csv](https://github.com/duckdb/duckdb/files/9182840/mytest_succeeds_but_incorrect_columns.csv)\r\n[mytest_succeeds_correctly.csv](https://github.com/duckdb/duckdb/files/9182947/mytest_succeeds_correctly.csv)\r\n\r\n\r\n\n\n### To Reproduce\n\n**### Use the uploaded test CSV files and repeat the steps below.**\r\n\r\nIn a CSV file (**mytest_fails.csv**) with a header record and duplicated rows for 1023 records and a record at position 1025 that has embedded quotes in column content, a parse error occurs: \r\n```\r\nD create or replace table customer as select c.* from read_csv_auto('E:\\data\\tmw\\net_new_staging\\groupA\\mytest_fails.csv',header=True,all_varchar=True,filename=True) c;\r\nError: Invalid Input Error: Error in file \"E:\\data\\tmw\\net_new_staging\\groupA\\mytest_fails.csv\" on line 1025: quote should be followed by end of value, end of row or another quote. 
(DELIMITER=',' (auto detected), QUOTE='\"' (auto detected), ESCAPE='' (auto detected), HEADER=1, SAMPLE_SIZE=10240, IGNORE_ERRORS=0, ALL_VARCHAR=1)\r\n```\r\nHowever, after removing one of the duplicated rows in another copy of the CSV file (**my_test_succeeds_but_incorrect_columns.csv**), no parse error occurs. Upon further investigation though, the actual column header row is not used for the column determination and instead the content of the problematic last row with embedded quotes in a column is treated as the column header definition.\r\n```\r\nD create or replace table customer as select c.* from read_csv_auto('E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_but_incorrect_columns.csv',header=True,all_varchar=True,filename=True) c;\r\nD show customer;\r\n┌────────────────────────────┬─────────────┬──────┬─────┬─────────┬───────┐\r\n│ column_name │ column_type │ null │ key │ default │ extra │\r\n├────────────────────────────┼─────────────┼──────┼─────┼─────────┼───────┤\r\n│ \"1009092\" │ VARCHAR │ YES │ │ │ │\r\n│ \"ALFRED 'JAMES' PICKERING\" │ VARCHAR │ YES │ │ │ │\r\n│ \"ALFRED \"JAMES\"\" │ VARCHAR │ YES │ │ │ │\r\n│ \"\" │ VARCHAR │ YES │ │ │ │\r\n│ \"PICKERING\" │ VARCHAR │ YES │ │ │ │\r\n│ \"\"_1 │ VARCHAR │ YES │ │ │ │\r\n│ \" \" │ VARCHAR │ YES │ │ │ │\r\n│ \"\"_2 │ VARCHAR │ YES │ │ │ │\r\n│ 0 │ VARCHAR │ YES │ │ │ │\r\n│ 0_1 │ VARCHAR │ YES │ │ │ │\r\n│ \"\"_3 │ VARCHAR │ YES │ │ │ │\r\n│ \"\"_4 │ VARCHAR │ YES │ │ │ │\r\n│ column12 │ VARCHAR │ YES │ │ │ │\r\n│ column13 │ VARCHAR │ YES │ │ │ │\r\n│ column14 │ VARCHAR │ YES │ │ │ │\r\n│ \"\"_5 │ VARCHAR │ YES │ │ │ │\r\n│ column16 │ VARCHAR │ YES │ │ │ │\r\n│ \"\"_6 │ VARCHAR │ YES │ │ │ │\r\n│ column18 │ VARCHAR │ YES │ │ │ │\r\n│ \"\"_7 │ VARCHAR │ YES │ │ │ │\r\n│ \"2021-03-05\" │ VARCHAR │ YES │ │ │ │\r\n│ filename │ VARCHAR │ YES │ │ │ │\r\n└────────────────────────────┴─────────────┴──────┴─────┴─────────┴───────┘\r\n```\r\nIn a third test CSV file (**mytest_succeeds_correctly.csv**), which is just a copy of the **mytest_succeeds_but_incorrect_columns.csv** file with the embedded quotes removed on the problematic record, the CSV parsing works correctly.\r\n```\r\nD create or replace table customer as select c.* from read_csv_auto('E:\\data\\tmw\\net_new_staging\\groupA\\**mytest_succeeds_correctly.csv**',header=True,all_varchar=True,filename=True) c;\r\nD show customer;\r\n┌──────────────────┬─────────────┬──────┬─────┬─────────┬───────┐\r\n│ column_name │ column_type │ null │ key │ default │ extra │\r\n├──────────────────┼─────────────┼──────┼─────┼─────────┼───────┤\r\n│ CUSTOMERID │ VARCHAR │ YES │ │ │ │\r\n│ FULLNAME │ VARCHAR │ YES │ │ │ │\r\n│ FIRSTNAME │ VARCHAR │ YES │ │ │ │\r\n│ MIDDLENAME │ VARCHAR │ YES │ │ │ │\r\n│ LASTNAME │ VARCHAR │ YES │ │ │ │\r\n│ NAMETITLE │ VARCHAR │ YES │ │ │ │\r\n│ NAMESUFFIX │ VARCHAR │ YES │ │ │ │\r\n│ DBA │ VARCHAR │ YES │ │ │ │\r\n│ CUSTTYPE │ VARCHAR │ YES │ │ │ │\r\n│ ACTIVECODE │ VARCHAR │ YES │ │ │ │\r\n│ MOTHERMAIDENNAME │ VARCHAR │ YES │ │ │ │\r\n│ EMPLOYERNAME │ VARCHAR │ YES │ │ │ │\r\n│ EMPLOYERPHONE │ VARCHAR │ YES │ │ │ │\r\n│ EMPLOYERPHONEEXT │ VARCHAR │ YES │ │ │ │\r\n│ OTHERIDTYPE1 │ VARCHAR │ YES │ │ │ │\r\n│ OTHERIDVALUE1 │ VARCHAR │ YES │ │ │ │\r\n│ OTHERIDTYPE2 │ VARCHAR │ YES │ │ │ │\r\n│ OTHERIDVALUE2 │ VARCHAR │ YES │ │ │ │\r\n│ OTHERIDTYPE3 │ VARCHAR │ YES │ │ │ │\r\n│ OTHERIDVALUE3 │ VARCHAR │ YES │ │ │ │\r\n│ UPDATEDATE │ VARCHAR │ YES │ │ │ │\r\n│ filename │ VARCHAR │ YES │ │ │ 
│\r\n└──────────────────┴─────────────┴──────┴─────┴─────────┴───────┘\r\nD select count(*) from customer;\r\n┌──────────────┐\r\n│ count_star() │\r\n├──────────────┤\r\n│ 1024 │\r\n└──────────────┘\r\n\r\n│ CUSTOMERID │ FULLNAME │ FIRSTNAME │ MIDDLENAME │ LASTNAME │ NAMETITLE │ NAMESUFFIX │ DBA │ CUSTTYPE │ ACTIVECODE │ MOTHERMAIDENNAME │ EMPLOYERNAME │ EMPLOYERPHONE │ EMPLOYERPHONEEXT │ OTHERIDTYPE1 │ OTHERIDVALUE1 │ OTHERIDTYPE2 │ OTHERIDVALUE2 │ OTHERIDTYPE3 │ OTHERIDVALUE3 │ UPDATEDATE │ filename │\r\n├────────────┼───────────┼───────────┼────────────┼──────────┼───────────┼────────────┼─────┼──────────┼────────────┼──────────────────┼──────────────┼───────────────┼──────────────────┼──────────────┼───────────────┼──────────────┼───────────────┼──────────────┼───────────────┼────────────┼──────────────────────────────────────────────────────────────────┤\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n│ 1234567 │ JOHN, DOE │ JOHN │ │ DOE │ │ │ │ 0 │ 0 │ │ │ │ │ │ │ │ │ │ │ 2021-03-05 │ E:\\data\\tmw\\net_new_staging\\groupA\\mytest_succeeds_correctly.csv │\r\n└────────────┴───────────┴───────────┴────────────┴──────────┴───────────┴────────────┴─────┴──────────┴────────────┴──────────────────┴──────────────┴───────────────┴──────────────────┴──────────────┴───────────────┴──────────────┴───────────────┴──────────────┴───────────────┴────────────┴────────────────────────────────────────────────────\r\n\r\n```\n\n### OS:\n\nWindows\n\n### DuckDB Version:\n\nv0.4.0\n\n### DuckDB Client:\n\nCLI\n\n### Full Name:\n\nChris Whelan\n\n### Affiliation:\n\nSystems and Software\n\n### Have you tried this on the latest `master` branch?\n\n- [X] I agree\n\n### Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?\n\n- [X] I agree", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4200/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4200/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4170", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4170/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4170/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4170/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4170", + "id": 1310442876, + "node_id": "I_kwDOCEU65s5OG8V8", + "number": 4170, + "title": "read_csv_auto IO error when reading a large (> 2 GB) file", + "user": { + "login": "vinhdizzo", + "id": 114053, + "node_id": "MDQ6VXNlcjExNDA1Mw==", + "avatar_url": "https://avatars.githubusercontent.com/u/114053?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/vinhdizzo", + "html_url": "https://github.com/vinhdizzo", + "followers_url": "https://api.github.com/users/vinhdizzo/followers", + "following_url": "https://api.github.com/users/vinhdizzo/following{/other_user}", + "gists_url": "https://api.github.com/users/vinhdizzo/gists{/gist_id}", + "starred_url": "https://api.github.com/users/vinhdizzo/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/vinhdizzo/subscriptions", + "organizations_url": "https://api.github.com/users/vinhdizzo/orgs", + "repos_url": "https://api.github.com/users/vinhdizzo/repos", + "events_url": "https://api.github.com/users/vinhdizzo/events{/privacy}", + "received_events_url": "https://api.github.com/users/vinhdizzo/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-07-20T05:05:40Z", + "updated_at": "2022-07-24T09:56:02Z", + "closed_at": "2022-07-24T07:11:07Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\nThe `read_csv_auto` file appears to error out (IO error) for large (>2 GB) files.\r\n\r\n#### To Reproduce\r\nCreate a large 2.2 GB tab-delimited file on command line:\r\n```sh\r\necho -e \"a\\tb\\tc\\td\\te\\tf\\tg\\th\\ti\\tj\\tk\\tl\\tm\\tn\\to\\tp\\tq\\tr\\ts\\tt\\tu\\tv\\tw\\tx\\ty\" > tmp.txt\r\necho -e \"60080000037503411\\t37503411\\t\\N\\t\\N\\t\\N\\t\\N\\t\\N\\tunsubmitted\\t2021-01-17 23:25:27.129\\t2021-01-17 23:25:27.129\\t\\N\\t\\N\\t\\N\\t\\N\\t\\N\\t\\N\\t\\N\\t\\N\\t60080000000955068\\tregular_submission\\tnot_graded_anonymously\\t\\N\\t\\N\\t\\N\\t-541917868910584810\\tnot_graded\\t\\N\" | perl -ne 'print $_ x 10000000' >> tmp.txt\r\n```\r\nThis should return a `tmp.txt` file with 10,000,001 rows, including the header.\r\n\r\nLaunch duckdb.exe on Windows, and run the following:\r\n```\r\nSELECT count(*) FROM read_csv_auto('tmp.txt', delim='\\t', header=True)\r\n ;\r\n```\r\n\r\nI get the following error message:\r\n```\r\nv0.4.0 da9ee490d\r\nEnter \".help\" for usage hints.\r\nConnected to a transient in-memory database.\r\nUse \".open FILENAME\" to reopen on a persistent database.\r\nD\r\nD SELECT count(*) FROM 
read_csv_auto('tmp.txt', delim='\\t', header=True)\r\n> ;\r\nError: IO Error: No files found that match the pattern \"tmp.txt\"\r\n```\r\n\r\nSame thing happens with `duckdb` in R.\r\n\r\n#### Environment (please complete the following information):\r\n - OS: Windows 10\r\n - DuckDB Version: v0.4.0 da9ee490d\r\n - DuckDB Client: Windows command line, and using the R duckdb client.\r\n\r\n#### Identity Disclosure:\r\n - Full Name: Vinh Nguyen\r\n - Affiliation: Irvine Valley College\r\n\r\n#### Before Submitting\r\n\r\n- [ ] **Have you tried this on the latest `master` branch?**\r\n* **Python**: `pip install duckdb --upgrade --pre`\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n* **Other Platforms**: You can find binaries [here](https://github.com/duckdb/duckdb/releases/tag/master-builds) or compile from source.\r\n\r\nI have not tried this on the `master` branch as the Windows binaries are not available.\r\n\r\n- [X] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?**\r\n\r\nYes", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4170/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4170/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4116", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/4116/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/4116/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/4116/events", + "html_url": "https://github.com/duckdb/duckdb/issues/4116", + "id": 1303517352, + "node_id": "I_kwDOCEU65s5Nshio", + "number": 4116, + "title": "Can't write raw data from R", + "user": { + "login": "wurli", + "id": 17475731, + "node_id": "MDQ6VXNlcjE3NDc1NzMx", + "avatar_url": "https://avatars.githubusercontent.com/u/17475731?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/wurli", + "html_url": "https://github.com/wurli", + "followers_url": "https://api.github.com/users/wurli/followers", + "following_url": "https://api.github.com/users/wurli/following{/other_user}", + "gists_url": "https://api.github.com/users/wurli/gists{/gist_id}", + "starred_url": "https://api.github.com/users/wurli/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/wurli/subscriptions", + "organizations_url": "https://api.github.com/users/wurli/orgs", + "repos_url": "https://api.github.com/users/wurli/repos", + "events_url": "https://api.github.com/users/wurli/events{/privacy}", + "received_events_url": "https://api.github.com/users/wurli/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 1269662541, + "node_id": "MDU6TGFiZWwxMjY5NjYyNTQx", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/R", + "name": "R", + "color": "6ddb99", + "default": false, + "description": "R integration" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "krlmlr", + "id": 1741643, + "node_id": "MDQ6VXNlcjE3NDE2NDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1741643?v=4", + "gravatar_id": "", + 
"url": "https://api.github.com/users/krlmlr", + "html_url": "https://github.com/krlmlr", + "followers_url": "https://api.github.com/users/krlmlr/followers", + "following_url": "https://api.github.com/users/krlmlr/following{/other_user}", + "gists_url": "https://api.github.com/users/krlmlr/gists{/gist_id}", + "starred_url": "https://api.github.com/users/krlmlr/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/krlmlr/subscriptions", + "organizations_url": "https://api.github.com/users/krlmlr/orgs", + "repos_url": "https://api.github.com/users/krlmlr/repos", + "events_url": "https://api.github.com/users/krlmlr/events{/privacy}", + "received_events_url": "https://api.github.com/users/krlmlr/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "krlmlr", + "id": 1741643, + "node_id": "MDQ6VXNlcjE3NDE2NDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1741643?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/krlmlr", + "html_url": "https://github.com/krlmlr", + "followers_url": "https://api.github.com/users/krlmlr/followers", + "following_url": "https://api.github.com/users/krlmlr/following{/other_user}", + "gists_url": "https://api.github.com/users/krlmlr/gists{/gist_id}", + "starred_url": "https://api.github.com/users/krlmlr/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/krlmlr/subscriptions", + "organizations_url": "https://api.github.com/users/krlmlr/orgs", + "repos_url": "https://api.github.com/users/krlmlr/repos", + "events_url": "https://api.github.com/users/krlmlr/events{/privacy}", + "received_events_url": "https://api.github.com/users/krlmlr/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-07-13T14:24:15Z", + "updated_at": "2022-08-22T08:48:35Z", + "closed_at": "2022-08-22T08:48:35Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\nSerialising R objects to `` type, then attempting to write to a `duckdb` connection results in the following errors:\r\n```\r\nError: rapi_execute: Unsupported column type for scan\r\nError: rapi_register_df: Failed to register data frame: std::exception\r\n```\r\n\r\n#### To Reproduce\r\nHere is some minimal R code to reproduce the issue:\r\n```\r\ncon <- DBI::dbConnect(duckdb::duckdb())\r\n# This would work:\r\n# con <- DBI::dbConnect(RSQLite::SQLite())\r\n\r\nDBI::dbCreateTable(\r\n conn = con,\r\n name = \"raw_test\",\r\n fields = list(file = \"blob\")\r\n)\r\n\r\nDBI::dbAppendTable(\r\n conn = con,\r\n name = \"raw_test\",\r\n value = data.frame(file = I(list(as.raw(1:3)))),\r\n field.types = list(file = \"blob\")\r\n)\r\n```\r\n\r\n#### Environment (please complete the following information):\r\n - OS: Windows 10\r\n - DuckDB Version: R package 0.4.0\r\n - DuckDB Client: R\r\n\r\nR session info:\r\n```\r\nR version 4.1.3 (2022-03-10)\r\nPlatform: x86_64-w64-mingw32/x64 (64-bit)\r\nRunning under: Windows 10 x64 (build 19044)\r\n\r\nMatrix products: default\r\n\r\nlocale:\r\n[1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252 \r\n[3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C \r\n[5] LC_TIME=English_United Kingdom.1252 \r\n\r\nattached base packages:\r\n[1] stats graphics grDevices utils datasets methods base \r\n\r\nother attached packages:\r\n[1] databasePackage_0.0.0.9000 testthat_3.1.4 \r\n\r\nloaded via a namespace (and not attached):\r\n [1] Rcpp_1.0.8 
compiler_4.1.3 prettyunits_1.1.1 remotes_2.4.2 tools_4.1.3 \r\n [6] odbc_1.3.3 bit_4.0.4 digest_0.6.29 pkgbuild_1.3.1 pkgload_1.3.0 \r\n[11] memoise_2.0.1 evaluate_0.15 lifecycle_1.0.1 pkgconfig_2.0.3 rlang_1.0.3 \r\n[16] cli_3.3.0 DBI_1.1.3 rstudioapi_0.13 yaml_2.3.5 xfun_0.30 \r\n[21] fastmap_1.1.0 duckdb_0.4.0 withr_2.5.0 knitr_1.39 hms_1.1.1 \r\n[26] vctrs_0.4.1 desc_1.4.1 fs_1.5.2 devtools_2.4.3 bit64_4.0.5 \r\n[31] rprojroot_2.0.2 glue_1.6.2 R6_2.5.1 processx_3.5.2 rmarkdown_2.14 \r\n[36] sessioninfo_1.2.2 blob_1.2.2 callr_3.7.0 purrr_0.3.4 magrittr_2.0.3 \r\n[41] ps_1.6.0 ellipsis_0.3.2 htmltools_0.5.2 usethis_2.1.6 rsconnect_0.8.26 \r\n[46] cachem_1.0.6 crayon_1.5.1 brio_1.1.3 \r\n```\r\n\r\n#### Identity Disclosure:\r\n - Full Name: Jacob Scott\r\n - Affiliation: UK Government", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/4116/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/4116/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3977", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3977/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3977/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3977/events", + "html_url": "https://github.com/duckdb/duckdb/pull/3977", + "id": 1285887009, + "node_id": "PR_kwDOCEU65s46a67a", + "number": 3977, + "title": "Adding alias part 2", + "user": { + "login": "handstuyennn", + "id": 49063786, + "node_id": "MDQ6VXNlcjQ5MDYzNzg2", + "avatar_url": "https://avatars.githubusercontent.com/u/49063786?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/handstuyennn", + "html_url": "https://github.com/handstuyennn", + "followers_url": "https://api.github.com/users/handstuyennn/followers", + "following_url": "https://api.github.com/users/handstuyennn/following{/other_user}", + "gists_url": "https://api.github.com/users/handstuyennn/gists{/gist_id}", + "starred_url": "https://api.github.com/users/handstuyennn/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/handstuyennn/subscriptions", + "organizations_url": "https://api.github.com/users/handstuyennn/orgs", + "repos_url": "https://api.github.com/users/handstuyennn/repos", + "events_url": "https://api.github.com/users/handstuyennn/events{/privacy}", + "received_events_url": "https://api.github.com/users/handstuyennn/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-06-27T14:23:18Z", + "updated_at": "2022-07-14T08:07:48Z", + "closed_at": "2022-07-14T08:07:43Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/3977", + "html_url": "https://github.com/duckdb/duckdb/pull/3977", + "diff_url": "https://github.com/duckdb/duckdb/pull/3977.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/3977.patch", + "merged_at": "2022-07-14T08:07:43Z" + }, + "body": "Hi @Mytherin \r\n\r\nRelate to [this 
thread](https://github.com/duckdb/duckdb/issues/3827).\r\n\r\nI finished part 2 of alias type: Provide a way of binding the type in functions, so functions can be created that take the type as input or output the type.\r\n\r\nPlease review and give me your feedback\r\n\r\nThanks", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3977/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3977/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3875", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3875/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3875/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3875/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3875", + "id": 1273001636, + "node_id": "I_kwDOCEU65s5L4Hak", + "number": 3875, + "title": "R: RETURNING SQL clause not supported", + "user": { + "login": "krlmlr", + "id": 1741643, + "node_id": "MDQ6VXNlcjE3NDE2NDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1741643?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/krlmlr", + "html_url": "https://github.com/krlmlr", + "followers_url": "https://api.github.com/users/krlmlr/followers", + "following_url": "https://api.github.com/users/krlmlr/following{/other_user}", + "gists_url": "https://api.github.com/users/krlmlr/gists{/gist_id}", + "starred_url": "https://api.github.com/users/krlmlr/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/krlmlr/subscriptions", + "organizations_url": "https://api.github.com/users/krlmlr/orgs", + "repos_url": "https://api.github.com/users/krlmlr/repos", + "events_url": "https://api.github.com/users/krlmlr/events{/privacy}", + "received_events_url": "https://api.github.com/users/krlmlr/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 1269662541, + "node_id": "MDU6TGFiZWwxMjY5NjYyNTQx", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/R", + "name": "R", + "color": "6ddb99", + "default": false, + "description": "R integration" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-06-16T03:28:54Z", + "updated_at": "2022-06-18T21:01:43Z", + "closed_at": "2022-06-18T09:20:45Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "#### What happens?\r\n\r\n`INSERT INTO ... 
RETURNING ...` doesn't provide a result set, but it's available from the CLI.\r\n\r\n#### To Reproduce\r\n\r\n``` r\r\ncon <- DBI::dbConnect(duckdb::duckdb())\r\n\r\nDBI::dbExecute(con, \"CREATE TEMP TABLE x (a int, PRIMARY KEY (a))\")\r\n#> [1] 0\r\nDBI::dbGetQuery(con, \"INSERT INTO x VALUES (1) RETURNING (a)\")\r\n#> Warning in dbFetch(rs, n = n, ...): Should not call dbFetch() on results that do\r\n#> not come from SELECT\r\n#> data frame with 0 columns and 0 rows\r\n```\r\n\r\nCreated on 2022-06-16 by the [reprex package](https://reprex.tidyverse.org) (v2.0.1)\r\n\r\n\r\n```\r\n➜ duckdb git:(d4c437597) ✗ echo \"CREATE TEMP TABLE x (a int, PRIMARY KEY (a));\r\nINSERT INTO x VALUES (1) RETURNING (a)\r\n\" | build/debug/duckdb\r\n┌───┐\r\n│ a │\r\n├───┤\r\n│ 1 │\r\n└───┘\r\n```\r\n\r\n\r\n#### Environment (please complete the following information):\r\n\r\n - OS: macOS\r\n - DuckDB Version: d4c437597\r\n - DuckDB Client: R\r\n\r\n#### Before Submitting\r\n\r\n- [x] **Have you tried this on the latest `master` branch?**\r\n- [x] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?**\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3875/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3875/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3681", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3681/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3681/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3681/events", + "html_url": "https://github.com/duckdb/duckdb/pull/3681", + "id": 1242455535, + "node_id": "PR_kwDOCEU65s44J8ps", + "number": 3681, + "title": "R: Fail CI/CD on NOTEs, check examples on UBSAN, log valgrind output", + "user": { + "login": "krlmlr", + "id": 1741643, + "node_id": "MDQ6VXNlcjE3NDE2NDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1741643?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/krlmlr", + "html_url": "https://github.com/krlmlr", + "followers_url": "https://api.github.com/users/krlmlr/followers", + "following_url": "https://api.github.com/users/krlmlr/following{/other_user}", + "gists_url": "https://api.github.com/users/krlmlr/gists{/gist_id}", + "starred_url": "https://api.github.com/users/krlmlr/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/krlmlr/subscriptions", + "organizations_url": "https://api.github.com/users/krlmlr/orgs", + "repos_url": "https://api.github.com/users/krlmlr/repos", + "events_url": "https://api.github.com/users/krlmlr/events{/privacy}", + "received_events_url": "https://api.github.com/users/krlmlr/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-05-20T00:27:09Z", + "updated_at": "2022-06-12T20:13:40Z", + "closed_at": "2022-06-12T20:13:34Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": 
"https://api.github.com/repos/duckdb/duckdb/pulls/3681", + "html_url": "https://github.com/duckdb/duckdb/pull/3681", + "diff_url": "https://github.com/duckdb/duckdb/pull/3681.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/3681.patch", + "merged_at": "2022-06-12T20:13:34Z" + }, + "body": "to avoid discussions on CRAN. To finish, need to revise a7d95a32e. Please advise.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3681/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3681/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3667", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3667/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3667/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3667/events", + "html_url": "https://github.com/duckdb/duckdb/pull/3667", + "id": 1239943608, + "node_id": "PR_kwDOCEU65s44BpZE", + "number": 3667, + "title": "Handling dataframes with repeated names in columns outside the bind. Now when registering df for scan.", + "user": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-05-18T13:10:08Z", + "updated_at": "2022-05-20T14:39:59Z", + "closed_at": "2022-05-20T09:47:21Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/3667", + "html_url": "https://github.com/duckdb/duckdb/pull/3667", + "diff_url": "https://github.com/duckdb/duckdb/pull/3667.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/3667.patch", + "merged_at": "2022-05-20T09:47:21Z" + }, + "body": "FIx: #2365\r\nFix: #3669\r\n\r\n@TNonet , you were right thanks for reopening the issue and pointing out a suitable solution.\r\nIf possible, double-check the tests are testing what they should be testing now :-).", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3667/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3667/timeline", 
+ "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3618", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3618/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3618/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3618/events", + "html_url": "https://github.com/duckdb/duckdb/pull/3618", + "id": 1232152492, + "node_id": "PR_kwDOCEU65s43oLRD", + "number": 3618, + "title": "Struct Types for Node.js UDFs", + "user": { + "login": "ankoh", + "id": 3986510, + "node_id": "MDQ6VXNlcjM5ODY1MTA=", + "avatar_url": "https://avatars.githubusercontent.com/u/3986510?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/ankoh", + "html_url": "https://github.com/ankoh", + "followers_url": "https://api.github.com/users/ankoh/followers", + "following_url": "https://api.github.com/users/ankoh/following{/other_user}", + "gists_url": "https://api.github.com/users/ankoh/gists{/gist_id}", + "starred_url": "https://api.github.com/users/ankoh/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/ankoh/subscriptions", + "organizations_url": "https://api.github.com/users/ankoh/orgs", + "repos_url": "https://api.github.com/users/ankoh/repos", + "events_url": "https://api.github.com/users/ankoh/events{/privacy}", + "received_events_url": "https://api.github.com/users/ankoh/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 1153696619, + "node_id": "MDU6TGFiZWwxMTUzNjk2NjE5", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Type", + "name": "Type", + "color": "f9d0c4", + "default": false, + "description": "Deals with the type system" + }, + { + "id": 3551237865, + "node_id": "LA_kwDOCEU65s7Tq5bp", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/feature", + "name": "feature", + "color": "3FD3C4", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-05-11T07:42:16Z", + "updated_at": "2022-06-28T09:09:41Z", + "closed_at": "2022-06-28T09:09:38Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/3618", + "html_url": "https://github.com/duckdb/duckdb/pull/3618", + "diff_url": "https://github.com/duckdb/duckdb/pull/3618.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/3618.patch", + "merged_at": "2022-06-28T09:09:38Z" + }, + "body": "In this PR:\r\n* Node UDFs now support struct types.\r\n* Node UDFs now support arbitrary argument counts.\r\n* Resolve node api headers with CMake and add Node.js to global CMakeLists behind BUILD_NODE flag.\r\n* Fixed clang-tidy warnings\r\n* Reduce gyp build warnings, we can probably get this to zero later.\r\n\r\nWe pass the UDF arguments very similar to the Wasm UDFs by constructing native arrays.\r\nhttps://github.com/ankoh/duckdb/blob/master/tools/nodejs/src/data_chunk.cpp\r\n\r\nThis PR is a draft because we need more udf tests and need to map more primitive types to native arrays.\r\n(Atm, it's only integer, doubles, varchars and structs).\r\n\r\nFurther, we should use the data chunk mapping for the query results themselves in the future.\r\nWe would just stream the data chunks chunk-by-chunk with native arrays 
which should be faster than the current approach of materializing individual Napi row objects in C++.\r\nIn order for this to not break existing node.js users, we would need to glue Arrow-js style proxy objects on top of the native arrays.\r\nThe user would then still see the object which is effectively just a fat pointer into the column arrays.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3618/reactions", + "total_count": 2, + "+1": 2, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3618/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3510", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3510/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3510/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3510/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3510", + "id": 1215668622, + "node_id": "I_kwDOCEU65s5IdaGO", + "number": 3510, + "title": "Serialization Error", + "user": { + "login": "buchu", + "id": 1553893, + "node_id": "MDQ6VXNlcjE1NTM4OTM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1553893?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/buchu", + "html_url": "https://github.com/buchu", + "followers_url": "https://api.github.com/users/buchu/followers", + "following_url": "https://api.github.com/users/buchu/following{/other_user}", + "gists_url": "https://api.github.com/users/buchu/gists{/gist_id}", + "starred_url": "https://api.github.com/users/buchu/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/buchu/subscriptions", + "organizations_url": "https://api.github.com/users/buchu/orgs", + "repos_url": "https://api.github.com/users/buchu/repos", + "events_url": "https://api.github.com/users/buchu/events{/privacy}", + "received_events_url": "https://api.github.com/users/buchu/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-04-26T09:28:04Z", + "updated_at": "2022-05-08T00:35:19Z", + "closed_at": "2022-04-26T13:05:41Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\n\r\nBinary : \r\nduckdb data.db \r\nError: unable to open database \"data.db\": Serialization Error: Attempting to read a required field, but field is missing\r\n\r\nPython :\r\nRuntimeError Traceback (most recent call last)\r\n----> 1 conn_xymon = duckdb.connect(\"data.db\", read_only=True)\r\nRuntimeError: Serialization Error: Attempting to read a required field, but field is missing\r\n\r\n#### To Reproduce\r\nError on all my databases files\r\n\r\n#### Environment (please complete the following information):\r\n - OS: MacOS 12.3.1\r\n - DuckDB Version: v0.3.3 fe9ba8003\r\n - DuckDB Client: Binary and Python (same error)\r\n\r\n#### Before Submitting\r\n\r\n- [x] **Have you tried this on the latest `master` branch?**\r\n* **Python**: `pip install duckdb --upgrade --pre`\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n* **Other Platforms**: You can 
find binaries [here](https://github.com/duckdb/duckdb/releases/tag/master-builds) or compile from source.\r\n\r\n- [ ] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?**\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3510/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3510/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3497", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3497/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3497/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3497/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3497", + "id": 1214176509, + "node_id": "I_kwDOCEU65s5IXtz9", + "number": 3497, + "title": "Casting to integer truncates instead of rounding", + "user": { + "login": "szarnyasg", + "id": 1402801, + "node_id": "MDQ6VXNlcjE0MDI4MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/1402801?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/szarnyasg", + "html_url": "https://github.com/szarnyasg", + "followers_url": "https://api.github.com/users/szarnyasg/followers", + "following_url": "https://api.github.com/users/szarnyasg/following{/other_user}", + "gists_url": "https://api.github.com/users/szarnyasg/gists{/gist_id}", + "starred_url": "https://api.github.com/users/szarnyasg/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/szarnyasg/subscriptions", + "organizations_url": "https://api.github.com/users/szarnyasg/orgs", + "repos_url": "https://api.github.com/users/szarnyasg/repos", + "events_url": "https://api.github.com/users/szarnyasg/events{/privacy}", + "received_events_url": "https://api.github.com/users/szarnyasg/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + 
{ + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-04-25T08:46:03Z", + "updated_at": "2022-06-01T22:33:05Z", + "closed_at": "2022-06-01T22:33:05Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "#### What happens?\r\n\r\nCasting a float value to an integer truncates the value, whereas the same in Postgres rounds it.\r\n\r\n#### To Reproduce\r\n\r\nIn DuckDB:\r\n```\r\nv0.3.3-dev1399 7c5ba6c0e\r\nEnter \".help\" for usage hints.\r\nConnected to a transient in-memory database.\r\nUse \".open FILENAME\" to reopen on a persistent database.\r\nD select cast(0.55 as integer) as x;\r\n┌───┐\r\n│ x │\r\n├───┤\r\n│ 0 │\r\n└───┘\r\n```\r\n\r\nIn Postgres:\r\n```console\r\npsql (14.2 (Debian 14.2-1.pgdg110+1))\r\nType \"help\" for help.\r\n\r\nldbcsnb=# select cast(0.55 as integer) as x;\r\n x \r\n---\r\n 1\r\n(1 row)\r\n```\r\n\r\n#### Environment (please complete the following information):\r\n - OS: Fedora 35\r\n - DuckDB Version: 0.3.2, 7c5ba6c0e\r\n - DuckDB Client: CLI\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3497/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3497/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3466", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3466/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3466/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3466/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3466", + "id": 1210070142, + "node_id": "I_kwDOCEU65s5IIDR-", + "number": 3466, + "title": "Join condition when expression is true fails to return records", + "user": { + "login": "chrisknoll", + "id": 6818777, + "node_id": "MDQ6VXNlcjY4MTg3Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/6818777?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/chrisknoll", + "html_url": "https://github.com/chrisknoll", + "followers_url": "https://api.github.com/users/chrisknoll/followers", + "following_url": "https://api.github.com/users/chrisknoll/following{/other_user}", + "gists_url": "https://api.github.com/users/chrisknoll/gists{/gist_id}", + "starred_url": 
"https://api.github.com/users/chrisknoll/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/chrisknoll/subscriptions", + "organizations_url": "https://api.github.com/users/chrisknoll/orgs", + "repos_url": "https://api.github.com/users/chrisknoll/repos", + "events_url": "https://api.github.com/users/chrisknoll/events{/privacy}", + "received_events_url": "https://api.github.com/users/chrisknoll/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-04-20T18:37:42Z", + "updated_at": "2022-04-22T11:25:51Z", + "closed_at": "2022-04-22T11:25:51Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\nWhen performing a join between 2 tables using date columns, the join fails to return rows. Excluding the join condition using the date columns works.\r\n\r\n#### To Reproduce\r\n\r\nThis is the query without using the date columns in the join. 
I've appended the join condition as a column in the select to confirm that the date comparison results in `true`:\r\n\r\n```\r\nselect cohort_definition_id, subject_id, cohort_start_date, cohort_end_date, op1.observation_period_start_date, op1.observation_period_end_date,\r\n tc1.cohort_start_date >= op1.observation_period_start_date as gt_test,\r\n tc1.cohort_start_date <= op1.observation_period_end_date as lt_test\r\nfrom main.cohort tc1\r\ninner join main.observation_period op1 on tc1.subject_id = op1.person_id\r\n--\tand tc1.cohort_start_date >= op1.observation_period_start_date\r\n--\tand tc1.cohort_start_date <= op1.observation_period_end_date\r\nwhere cohort_definition_id in (100);\t\r\n```\r\n\r\ncohort_definition_id | subject_id | cohort_start_date | cohort_end_date | observation_period_start_date | observation_period_end_date | gt_test | lt_test\r\n---------------------+------------+-------------------+-----------------+-------------------------------+-----------------------------+---------+--------\r\n 100 | 1 | 2002-12-25 | 2002-12-25 | 1963-12-31 | 2010-01-01 | true | true \r\n 100 | 1 | 2007-03-01 | 2007-03-01 | 1963-12-31 | 2010-01-01 | true | true \r\n 100 | 2 | 2003-03-01 | 2003-03-01 | 1963-12-31 | 2010-01-01 | true | true \r\n 100 | 2 | 2005-03-01 | 2005-03-01 | 1963-12-31 | 2010-01-01 | true | true \r\n\r\n\r\nHowever, when uncommenting the two `and` clauses from the join, no records are returned:\r\n\r\n```\r\nselect cohort_definition_id, subject_id, cohort_start_date, cohort_end_date, op1.observation_period_start_date, op1.observation_period_end_date,\r\n tc1.cohort_start_date >= op1.observation_period_start_date as gt_test,\r\n tc1.cohort_start_date <= op1.observation_period_end_date as lt_test\r\nfrom main.cohort tc1\r\ninner join main.observation_period op1 on tc1.subject_id = op1.person_id\r\n\tand tc1.cohort_start_date >= op1.observation_period_start_date\r\n\tand tc1.cohort_start_date <= op1.observation_period_end_date\r\nwhere cohort_definition_id in (100);\t\r\n```\r\n\r\nBut the expression columns (gt test and lt test) both show TRUE as sthe result, so the problem is that the TRUE isn't leading to the successful join.\r\n\r\nFor clarity, this is the table definitions from information_schema (note the column types are properly defined as DATE):\r\ntable_name | column_name | ordinal_position | data_type\r\n-------------------+-------------------------------+------------------+----------\r\nCOHORT | cohort_definition_id | 1 | INTEGER \r\nCOHORT | subject_id | 2 | INTEGER \r\nCOHORT | cohort_start_date | 3 | DATE \r\nCOHORT | cohort_end_date | 4 | DATE \r\nOBSERVATION_PERIOD | observation_period_id | 1 | INTEGER \r\nOBSERVATION_PERIOD | person_id | 2 | INTEGER \r\nOBSERVATION_PERIOD | observation_period_start_date | 3 | DATE \r\nOBSERVATION_PERIOD | observation_period_end_date | 4 | DATE \r\nOBSERVATION_PERIOD | period_type_concept_id | 5 | INTEGER \r\n\r\n#### Environment (please complete the following information):\r\n - OS: [Windows]\r\n - DuckDB Version: [0.3.2-2]\r\n - DuckDB Client: [JDBC]\r\n\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3466/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3466/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3461", + 
"repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3461/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3461/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3461/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3461", + "id": 1208843921, + "node_id": "I_kwDOCEU65s5IDX6R", + "number": 3461, + "title": "pip install duckdb requires Microsoft Visual C++ 14.0", + "user": { + "login": "alexboks", + "id": 57604785, + "node_id": "MDQ6VXNlcjU3NjA0Nzg1", + "avatar_url": "https://avatars.githubusercontent.com/u/57604785?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/alexboks", + "html_url": "https://github.com/alexboks", + "followers_url": "https://api.github.com/users/alexboks/followers", + "following_url": "https://api.github.com/users/alexboks/following{/other_user}", + "gists_url": "https://api.github.com/users/alexboks/gists{/gist_id}", + "starred_url": "https://api.github.com/users/alexboks/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/alexboks/subscriptions", + "organizations_url": "https://api.github.com/users/alexboks/orgs", + "repos_url": "https://api.github.com/users/alexboks/repos", + "events_url": "https://api.github.com/users/alexboks/events{/privacy}", + "received_events_url": "https://api.github.com/users/alexboks/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-04-19T20:46:01Z", + "updated_at": "2022-11-22T10:40:11Z", + "closed_at": "2022-04-26T08:57:23Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\npip install of duckdb v0.3.3 fails on Windows with following error:\r\n\r\n error: Microsoft Visual C++ 14.0 or greater is required. Get it with \"Microsoft C++ Build Tools\": https://visualstudio.microsoft.com/visual-cpp-build-tools/\r\n [end of output]\r\n\r\npip installing latest master branch works fine ( `pip install duckdb --upgrade --pre`)\r\n\r\n#### To Reproduce\r\npip install duckdb==0.3.3\r\n\r\n#### Environment (please complete the following information):\r\n - OS: Windows\r\n - DuckDB Version: 0.3.3\r\n - DuckDB Client: Python\r\n\r\n#### Before Submitting\r\n\r\n- [x] **Have you tried this on the latest `master` branch?**\r\n* **Python**: `pip install duckdb --upgrade --pre`\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n* **Other Platforms**: You can find binaries [here](https://github.com/duckdb/duckdb/releases/tag/master-builds) or compile from source.\r\n\r\n- [x] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?**\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3461/reactions", + "total_count": 1, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3461/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3153", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3153/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3153/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3153/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3153", + "id": 1150659175, + "node_id": "I_kwDOCEU65s5Elapn", + "number": 3153, + "title": "C++ failed to read multiple parquet files \"TProtocolException: Invalid data\" with NaT values", + "user": { + "login": "hli500", + "id": 92927536, + "node_id": "U_kgDOBYn2MA", + "avatar_url": "https://avatars.githubusercontent.com/u/92927536?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hli500", + "html_url": "https://github.com/hli500", + "followers_url": "https://api.github.com/users/hli500/followers", + "following_url": "https://api.github.com/users/hli500/following{/other_user}", + "gists_url": "https://api.github.com/users/hli500/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hli500/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hli500/subscriptions", + "organizations_url": "https://api.github.com/users/hli500/orgs", + "repos_url": "https://api.github.com/users/hli500/repos", + "events_url": "https://api.github.com/users/hli500/events{/privacy}", + "received_events_url": "https://api.github.com/users/hli500/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-02-25T16:52:41Z", + "updated_at": "2022-03-16T16:28:54Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\n\r\nWe have the following parquet files:\r\n```\r\n-rw-rw-rw- 1 op general 1.4M Feb 22 07:56 test_20220103_20220110.parquet\r\n-rw-rw-rw- 1 op general 1.3M Feb 22 07:56 test_20220110_20220117.parquet\r\n-rw-rw-rw- 1 op general 1.2M Feb 22 07:57 test_20220117_20220124.parquet\r\n-rw-rw-rw- 1 op general 1.3M Feb 22 07:57 test_20220124_20220131.parquet\r\n-rw-rw-rw- 1 op general 1.1M Feb 22 07:57 test_20220131_20220207.parquet\r\n-rw-rw-rw- 1 op general 1.1M Feb 22 07:57 test_20220207_20220214.parquet\r\n-rw-rw-rw- 1 op general 1.2M Feb 22 07:57 test_20220214_20220221.parquet\r\n-rw-rw-rw- 1 op general 314K Feb 22 07:57 test_20220221_20220228.parquet\r\n```\r\nMetaData for the file:\r\n```\r\n############ file meta data ############\r\ncreated_by: parquet-cpp version 1.5.1-SNAPSHOT\r\nnum_columns: 1145\r\nnum_rows: 5823\r\nnum_row_groups: 1\r\nformat_version: 1.0\r\nserialized_size: 261856\r\n```\r\n\r\nTried changed one of the existing unittest:\r\n\r\n### individual file works:\r\n\r\n```\r\nquery II\r\nSELECT * FROM parquet_scan('test_20220103_20220110.parquet') where value='A';\r\n```\r\nworks fine\r\n\r\n### glob for multiple files failed with ERROR 
\"TProtocolException: Invalid data\":\r\nWe've tried select directly from parqute_scan or with create view, both gives back the same error\r\n```\r\nquery II\r\nSELECT * FROM parquet_scan('test_*.parquet') where value='A';\r\n\r\nor \r\nCREATE view parquet_file AS SELECT * from parquet_scan('test_*.parquet')\r\nSELECT * FROM parquet_file \r\n```\r\nGives back:\r\n\r\n```\r\nSQL Query\r\nSELECT * FROM parquet_scan('test_*.parquet');\r\n================================================================================\r\nActual result:\r\nTProtocolException: Invalid data\r\n```\r\n\r\n### works with create table\r\n```\r\nCREATE table parquet_file AS SELECT * from parquet_scan('test_*.parquet')\r\nSELECT * FROM parquet_file \r\n```\r\n\r\n### Tested with PythonAPI\r\nWe've also tested the same query with python api, it works fine there. So the parquet files should be ok.\r\n\r\nAnother thing we found is the performance for \"CREATING TABLE ...\"\r\n```\r\nCREATE table parquet_file AS SELECT * from parquet_scan('test_*.parquet')\r\n```\r\n^ The above query takes about 3 seconds running with python, but took about 26s with C++.\r\n\r\n#### To Reproduce\r\nManaged to reproduce with the following parquet files generated by:\r\n```\r\nimport numpy as np\r\nimport pyarrow as pa\r\nimport pyarrow.parquet as pq\r\nimport pandas as pd\r\nfrom datetime import datetime\r\n\r\ndf = pd.DataFrame({\"order\": [0, 1, 2], \"time\": [datetime.now(), pd.NaT, pd.NaT],\"name\": [\"A\", \"B\", \"C\"]})\r\ndf2 = pd.DataFrame({\"order\": [3, 4, 5], \"time\": [datetime.now(), pd.NaT, datetime.now()],\"name\": [\"D\", \"E\", \"F\"]})\r\n\r\ntable = pa.Table.from_pandas(df)\r\ntable2 = pa.Table.from_pandas(df2)\r\n\r\npq.write_table(table, 't1.parquet')\r\npq.write_table(table2, 't2.parquet')\r\n```\r\n\r\ntest/parquet/test_parquet_reader.test:\r\n```\r\nstatement ok\r\nCREATE view parquet_file AS SELECT * from parquet_scan('/test/t*.parquet')\r\n```\r\n\r\n./build/test/unittest test/parquet/test_parquet2.test\r\n```\r\nQuery unexpectedly failed (test/parquet/test_parquet.test:10)\r\n================================================================================\r\nSQL Query\r\nSELECT * from parquet_file\r\n================================================================================\r\nActual result:\r\nTProtocolException: Invalid data\r\n```\r\n\r\n#### Environment (please complete the following information):\r\n - OS: [linux]\r\n - DuckDB Version: [latest master]\r\n - DuckDB Client: [c++]\r\n\r\n#### Before Submitting\r\n\r\n- [x] **Have you tried this on the latest `master` branch?**\r\n* **Python**: `pip install duckdb --upgrade --pre`\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n* **Other Platforms**: You can find binaries [here](https://github.com/duckdb/duckdb/releases/tag/master-builds) or compile from source.\r\n\r\n- [x] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?**\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3153/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3153/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3144", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3144/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3144/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3144/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3144", + "id": 1148820070, + "node_id": "I_kwDOCEU65s5EeZpm", + "number": 3144, + "title": "Memory Leak in DuckDB Python API", + "user": { + "login": "alanhdu", + "id": 1914111, + "node_id": "MDQ6VXNlcjE5MTQxMTE=", + "avatar_url": "https://avatars.githubusercontent.com/u/1914111?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/alanhdu", + "html_url": "https://github.com/alanhdu", + "followers_url": "https://api.github.com/users/alanhdu/followers", + "following_url": "https://api.github.com/users/alanhdu/following{/other_user}", + "gists_url": "https://api.github.com/users/alanhdu/gists{/gist_id}", + "starred_url": "https://api.github.com/users/alanhdu/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/alanhdu/subscriptions", + "organizations_url": "https://api.github.com/users/alanhdu/orgs", + "repos_url": "https://api.github.com/users/alanhdu/repos", + "events_url": "https://api.github.com/users/alanhdu/events{/privacy}", + "received_events_url": "https://api.github.com/users/alanhdu/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": 
"https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-02-24T03:32:24Z", + "updated_at": "2022-06-30T16:54:11Z", + "closed_at": "2022-05-31T16:12:37Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "#### What happens?\r\n\r\nI might be wrong about this, but I think the DuckDB Python API presented is fundamentally memory leaky. In particular, I think that almost every function that calls into\r\nhttps://github.com/duckdb/duckdb/blob/851e8cfd39727ab6ac13a5354cc55846e35bb337/tools/pythonpkg/src/pyconnection.cpp#L281-L290\r\nwill end up with a memory leak: the Python object is registered with the connection (using a random name), but that name Is attached to the `DuckDBPyRelation`, which has no way of unregistering the object when its destructed. Even worse, the *only* way a programmer can get access to the (randomly generated) name is `DuckDBPyRelation::GetAlias`, which can be overwritten via `DuckDBPyRelation::SetAlias`!\r\n\r\nThis is a little annoying, because one super convenient function that DuckDB presents is the `duckdb.query_df` function to run SQL queries on Pandas DataFrames, which under-the-hood runs into this problem.\r\n\r\nI've can put up a tentative PR to fix the `query_df` use-case, but think the underlying API here is prone to misuse. The \"simplest\" thing to do here would be to ask the user to pass in the `name` to this function and make it the caller's responsibility to unregister the Python object when they're done. Another approach would be to somehow tie the lifetime of the registered Python object to the resulting `DuckDBPyRelation`, although that seems harder and requires modifying the API (maybe add a `unique_ptr` to `PyRelation` so it automatically gets cleaned-up when the relation gets destructed?).\r\n\r\n#### To Reproduce\r\n\r\nIf you run this small script:\r\n\r\n```python\r\nimport pandas as pd\r\nimport duckdb\r\nimport numpy as np\r\n\r\nfrom tqdm import tqdm\r\n\r\nfor __ in tqdm(range(10_000_000)):\r\n df = pd.DataFrame({\"x\": np.random.rand(1_000_000)})\r\n duckdb.query_df(df, \"df\", \"SELECT * FROM df\")\r\n```\r\n\r\nYou can see that memory use rises without bound, indicating that the memory for `df` is being leaked.\r\n\r\n#### Environment (please complete the following information):\r\n - OS: Linux\r\n - DuckDB Version: 0.32\r\n - DuckDB Client: Python\r\n\r\n#### Before Submitting\r\n\r\n- [x] **Have you tried this on the latest `master` branch?**\r\n* **Python**: `pip install duckdb --upgrade --pre`\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n* **Other Platforms**: You can find binaries [here](https://github.com/duckdb/duckdb/releases/tag/master-builds) or compile from source.\r\n\r\n- [x] **Have you tried the steps to reproduce? 
Do they include all relevant data and configuration? Does the issue you report still appear there?**\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3144/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3144/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3135", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3135/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3135/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3135/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3135", + "id": 1147576838, + "node_id": "I_kwDOCEU65s5EZqIG", + "number": 3135, + "title": "Docs: missing docs for EXCLUDE, REPLACE", + "user": { + "login": "zenazn", + "id": 12960, + "node_id": "MDQ6VXNlcjEyOTYw", + "avatar_url": "https://avatars.githubusercontent.com/u/12960?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/zenazn", + "html_url": "https://github.com/zenazn", + "followers_url": "https://api.github.com/users/zenazn/followers", + "following_url": "https://api.github.com/users/zenazn/following{/other_user}", + "gists_url": "https://api.github.com/users/zenazn/gists{/gist_id}", + "starred_url": "https://api.github.com/users/zenazn/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/zenazn/subscriptions", + "organizations_url": "https://api.github.com/users/zenazn/orgs", + "repos_url": "https://api.github.com/users/zenazn/repos", + "events_url": "https://api.github.com/users/zenazn/events{/privacy}", + "received_events_url": "https://api.github.com/users/zenazn/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 3790959786, + "node_id": "LA_kwDOCEU65s7h9XSq", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Foundation%20Member", + "name": "Foundation Member", + "color": "FBCA04", + "default": false, + "description": "" + } + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-02-23T03:00:03Z", + "updated_at": "2022-07-11T15:23:44Z", + "closed_at": null, + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "I was delighted to find #2199, which solved a problem I had! Unfortunately I only found out about it when a coworker told me what to google—this syntax is completely missing from the [documentation for SELECT](https://duckdb.org/docs/sql/statements/select), which I'd been dutifully relying on until that point.\r\n\r\nAny chance of getting the docs updated to include these features? 
(I'm a bit curious what else might be missing!)", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3135/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3135/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3104", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3104/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3104/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3104/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3104", + "id": 1142699354, + "node_id": "I_kwDOCEU65s5EHDVa", + "number": 3104, + "title": "JDBC:executeQuery method of preparedStatement more slower than statement", + "user": { + "login": "Cinzq4615", + "id": 19431346, + "node_id": "MDQ6VXNlcjE5NDMxMzQ2", + "avatar_url": "https://avatars.githubusercontent.com/u/19431346?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Cinzq4615", + "html_url": "https://github.com/Cinzq4615", + "followers_url": "https://api.github.com/users/Cinzq4615/followers", + "following_url": "https://api.github.com/users/Cinzq4615/following{/other_user}", + "gists_url": "https://api.github.com/users/Cinzq4615/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Cinzq4615/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Cinzq4615/subscriptions", + "organizations_url": "https://api.github.com/users/Cinzq4615/orgs", + "repos_url": "https://api.github.com/users/Cinzq4615/repos", + "events_url": "https://api.github.com/users/Cinzq4615/events{/privacy}", + "received_events_url": "https://api.github.com/users/Cinzq4615/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-02-18T09:18:36Z", + "updated_at": "2022-08-28T08:42:58Z", + "closed_at": "2022-08-28T08:42:58Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\nexecuteQuery method of preparedStatement more slower than statement when 5 million rows.\r\n\r\nstatement cost:7ms\r\npreparedStatement cost:580ms\r\n\r\n#### To Reproduce\r\n```\r\n@Test\r\n void createTableAndAppenderData() {\r\n try (DuckDBConnection connection = (DuckDBConnection) DriverManager.getConnection(\"jdbc:duckdb:test\")) {\r\n String sql = \"CREATE SCHEMA IF NOT EXISTS mydb\";\r\n Statement stat = connection.createStatement();\r\n stat.execute(sql);\r\n sql = \"CREATE TABLE IF NOT EXISTS mydb.test(id INTEGER PRIMARY KEY, name VARCHAR)\";\r\n stat.execute(sql);\r\n\r\n DuckDBAppender appender = connection.createAppender(\"mydb\", \"test\");\r\n for (int i = 1001; i < 5000000; i++) {\r\n appender.beginRow();\r\n appender.append(i);\r\n appender.append(\"name-\" + i);\r\n appender.endRow();\r\n }\r\n appender.flush();\r\n\r\n } catch (SQLException throwables) {\r\n throwables.printStackTrace();\r\n }\r\n }\r\n\r\n @Test\r\n void statementTest() {\r\n try (DuckDBConnection connection = (DuckDBConnection) DriverManager.getConnection(\"jdbc:duckdb:test\")) {\r\n final String SQL_TEMPLATE = \"select * from mydb.test where id = 
%d\";\r\n long startTime = System.currentTimeMillis();\r\n Statement stat = connection.createStatement();\r\n for (int i = 0; i < 10; i++) {\r\n ResultSet rs = stat.executeQuery(String.format(SQL_TEMPLATE, i));\r\n while (rs.next()) {\r\n System.out.println(rs.getString(2));\r\n }\r\n }\r\n long endTime = System.currentTimeMillis();\r\n System.out.println(\"statement cost:\" + (endTime - startTime) + \"ms\");\r\n stat.close();\r\n\r\n } catch (SQLException throwables) {\r\n throwables.printStackTrace();\r\n }\r\n }\r\n\r\n @Test\r\n void preparedStatementTest() {\r\n\r\n try (DuckDBConnection connection = (DuckDBConnection) DriverManager.getConnection(\"jdbc:duckdb:test\")) {\r\n String sql = \"select * from mydb.test where id = ?\";\r\n long startTime = System.currentTimeMillis();\r\n PreparedStatement stat = connection.prepareStatement(sql);\r\n for (int i = 0; i < 10; i++) {\r\n stat.setInt(1, i);\r\n ResultSet rs = stat.executeQuery();\r\n while (rs.next()) {\r\n System.out.println(rs.getString(2));\r\n }\r\n }\r\n long endTime = System.currentTimeMillis();\r\n System.out.println(\"preparedStatement cost:\" + (endTime - startTime) + \"ms\");\r\n stat.close();\r\n } catch (SQLException throwables) {\r\n throwables.printStackTrace();\r\n }\r\n }\r\n```\r\n\r\n\r\n#### Environment (please complete the following information):\r\n - OS: windos\r\n - DuckDB Version: 0.3.1\r\n - DuckDB Client:jdbc-03.1\r\n\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3104/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3104/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3084", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3084/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3084/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3084/events", + "html_url": "https://github.com/duckdb/duckdb/pull/3084", + "id": 1136990120, + "node_id": "PR_kwDOCEU65s4yxQNt", + "number": 3084, + "title": "Hmb1 select macro feature mr", + "user": { + "login": "hmb1", + "id": 6428656, + "node_id": "MDQ6VXNlcjY0Mjg2NTY=", + "avatar_url": "https://avatars.githubusercontent.com/u/6428656?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hmb1", + "html_url": "https://github.com/hmb1", + "followers_url": "https://api.github.com/users/hmb1/followers", + "following_url": "https://api.github.com/users/hmb1/following{/other_user}", + "gists_url": "https://api.github.com/users/hmb1/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hmb1/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hmb1/subscriptions", + "organizations_url": "https://api.github.com/users/hmb1/orgs", + "repos_url": "https://api.github.com/users/hmb1/repos", + "events_url": "https://api.github.com/users/hmb1/events{/privacy}", + "received_events_url": "https://api.github.com/users/hmb1/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-02-14T09:06:55Z", + "updated_at": "2022-03-24T09:06:12Z", + 
"closed_at": "2022-03-24T09:06:02Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/3084", + "html_url": "https://github.com/duckdb/duckdb/pull/3084", + "diff_url": "https://github.com/duckdb/duckdb/pull/3084.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/3084.patch", + "merged_at": "2022-03-24T09:06:01Z" + }, + "body": "Table Macro code\r\ncode cleaned up", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3084/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3084/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3015", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/3015/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/3015/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/3015/events", + "html_url": "https://github.com/duckdb/duckdb/issues/3015", + "id": 1120238731, + "node_id": "I_kwDOCEU65s5CxXyL", + "number": 3015, + "title": "Wrong result on using limit after order by on null data", + "user": { + "login": "Jedi18", + "id": 15855268, + "node_id": "MDQ6VXNlcjE1ODU1MjY4", + "avatar_url": "https://avatars.githubusercontent.com/u/15855268?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Jedi18", + "html_url": "https://github.com/Jedi18", + "followers_url": "https://api.github.com/users/Jedi18/followers", + "following_url": "https://api.github.com/users/Jedi18/following{/other_user}", + "gists_url": "https://api.github.com/users/Jedi18/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Jedi18/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Jedi18/subscriptions", + "organizations_url": "https://api.github.com/users/Jedi18/orgs", + "repos_url": "https://api.github.com/users/Jedi18/repos", + "events_url": "https://api.github.com/users/Jedi18/events{/privacy}", + "received_events_url": "https://api.github.com/users/Jedi18/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": 
"https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-02-01T05:03:58Z", + "updated_at": "2022-02-07T09:35:40Z", + "closed_at": "2022-02-07T09:35:29Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\nSeveral row values are missing (when compared with the same query on other SQL engines) when using limit in the below given query. I'm not a 100% sure whether this is a bug or this is the intended behaviour. Note that this happens on data with null values.\r\n\r\n#### To Reproduce\r\n```\r\n select o_orderkey, o_clerk, o_orderstatus, o_totalprice from orders\r\n order by o_orderkey NULLS FIRST,\r\n o_clerk NULLS FIRST, o_orderstatus NULLS FIRST,\r\n o_totalprice DESC NULLS LAST limit 540\r\n```\r\nThis gives the wrong result. To get the correct result, I had to change the query to the one given below\r\n```\r\n WITH result as (\r\n select o_orderkey, o_clerk, o_orderstatus, o_totalprice from orders\r\n order by o_orderkey NULLS FIRST,\r\n o_clerk NULLS FIRST, o_orderstatus NULLS FIRST,\r\n o_totalprice DESC NULLS LAST\r\n )\r\n SELECT * from result limit 540\r\n```\r\n\r\n#### Environment (please complete the following information):\r\n - OS: Ubuntu 20.04\r\n - DuckDB Version: 0.3.1\r\n - DuckDB Client: Python\r\n\r\n#### Before Submitting\r\n\r\n- [ ] **Have you tried this on the latest `master` branch?**\r\n* **Python**: `pip install duckdb --upgrade --pre`\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n* **Other Platforms**: You can find binaries [here](https://github.com/duckdb/duckdb/releases/tag/master-builds) or compile from source.\r\n\r\n- [ ] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? 
Does the issue you report still appear there?**\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/3015/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/3015/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2972", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2972/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2972/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2972/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2972", + "id": 1110629365, + "node_id": "I_kwDOCEU65s5CMtv1", + "number": 2972, + "title": "Integer parameters sometimes become varchar when using python dbapi", + "user": { + "login": "machow", + "id": 2574498, + "node_id": "MDQ6VXNlcjI1NzQ0OTg=", + "avatar_url": "https://avatars.githubusercontent.com/u/2574498?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/machow", + "html_url": "https://github.com/machow", + "followers_url": "https://api.github.com/users/machow/followers", + "following_url": "https://api.github.com/users/machow/following{/other_user}", + "gists_url": "https://api.github.com/users/machow/gists{/gist_id}", + "starred_url": "https://api.github.com/users/machow/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/machow/subscriptions", + "organizations_url": "https://api.github.com/users/machow/orgs", + "repos_url": "https://api.github.com/users/machow/repos", + "events_url": "https://api.github.com/users/machow/events{/privacy}", + "received_events_url": "https://api.github.com/users/machow/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 1258094368, + "node_id": "MDU6TGFiZWwxMjU4MDk0MzY4", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Python", + "name": "Python", + "color": "f8fc8a", + "default": false, + "description": "" + }, + { + "id": 1269662541, + "node_id": "MDU6TGFiZWwxMjY5NjYyNTQx", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/R", + "name": "R", + "color": "6ddb99", + "default": false, + "description": "R integration" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", 
+ "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2022-01-21T15:54:11Z", + "updated_at": "2022-04-28T16:37:16Z", + "closed_at": "2022-04-28T16:37:16Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\n\r\nCurrently, python users interfacing with postgresql via dbapi drivers like `psycopg2` can parameterize queries like...\r\n\r\n```\r\n-- %s is the parameter\r\nSELECT %s\r\n```\r\n\r\nand if they pass an int like `1` as a parameter, it is equivalent to...\r\n\r\n```\r\n-- note 1's pg_typeof here would be integer, rather than varchar\r\nSELECT 1\r\n```\r\n\r\nThe implementation of this behavior is discussed in the [DBAPI pep 249 here](https://www.python.org/dev/peps/pep-0249/#type-objects-and-constructors).\r\n\r\nQuick Notes: \r\n\r\n* The style used to define parameters may vary across dbapi implementations, but tools like sqlalchemy abstract away the need for users to worry about this.\r\n* I have a backend implementation of duckdb working on a tool called siuba (https://github.com/machow/siuba/pull/380), which essentially translates pandas methods to duckdb, but there are some edge cases where an int could go in, but a string (or object dtype) could come out..!\r\n\r\n#### To Reproduce\r\n\r\n```python\r\nimport psycopg2\r\nimport duckdb\r\n\r\n# setup postgres\r\npg_con = psycopg2.connect(database=\"postgres\", user=\"postgres\", password=\"\", port=5432, host=\"localhost\")\r\npg_curs = pg_con.cursor()\r\n\r\n# setup duckdb\r\nduck_con = duckdb.connect(\":memory:\")\r\n\r\n\r\n# queries ----\r\n\r\n# integer\r\npg_curs.execute(\"SELECT pg_typeof(%s)\", (1,))\r\npg_curs.fetchone()\r\n\r\n# varchar\r\nduck_con.execute(\"SELECT pg_typeof(?)\", (1,)).fetchone()\r\n```\r\n\r\nHere is the docker compose I used to setup postgres:\r\n\r\n
\r\n\r\n```\r\nversion: '3.1'\r\n\r\nservices:\r\n\r\n db_mysql:\r\n image: mysql\r\n restart: always\r\n environment:\r\n MYSQL_ROOT_PASSWORD: \"\"\r\n MYSQL_ALLOW_EMPTY_PASSWORD: 1\r\n MYSQL_DATABASE: \"public\"\r\n ports:\r\n - 3306:3306\r\n # by default, mysql rounds to 4 decimals, but tests require more precision\r\n command: --div-precision-increment=30\r\n\r\n db:\r\n image: postgres\r\n restart: always\r\n environment:\r\n POSTGRES_PASSWORD: \"\"\r\n POSTGRES_HOST_AUTH_METHOD: \"trust\"\r\n ports:\r\n - 5432:5432\r\n```\r\n\r\n
\r\n\r\n#### Environment (please complete the following information):\r\n - OS: OSX 11.3.1\r\n - DuckDB Version: 0.3.2\r\n - DuckDB Client: Python\r\n\r\n#### Before Submitting\r\n\r\n- [x] **Have you tried this on the latest `master` branch?**\r\n* **Python**: `pip install duckdb --upgrade --pre`\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n* **Other Platforms**: You can find binaries [here](https://github.com/duckdb/duckdb/releases/tag/master-builds) or compile from source.\r\n\r\n- [x] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?**\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2972/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2972/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2955", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2955/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2955/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2955/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2955", + "id": 1108670876, + "node_id": "I_kwDOCEU65s5CFPmc", + "number": 2955, + "title": "Parquet reader does not implement TableFilterType::CONJUNCTION_OR correctly", + "user": { + "login": "majetideepak", + "id": 5091159, + "node_id": "MDQ6VXNlcjUwOTExNTk=", + "avatar_url": "https://avatars.githubusercontent.com/u/5091159?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/majetideepak", + "html_url": "https://github.com/majetideepak", + "followers_url": "https://api.github.com/users/majetideepak/followers", + "following_url": "https://api.github.com/users/majetideepak/following{/other_user}", + "gists_url": "https://api.github.com/users/majetideepak/gists{/gist_id}", + "starred_url": "https://api.github.com/users/majetideepak/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/majetideepak/subscriptions", + "organizations_url": "https://api.github.com/users/majetideepak/orgs", + "repos_url": "https://api.github.com/users/majetideepak/repos", + "events_url": "https://api.github.com/users/majetideepak/events{/privacy}", + "received_events_url": "https://api.github.com/users/majetideepak/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002058, + "node_id": "MDU6TGFiZWw5NzYwMDIwNTg=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/bug", + "name": "bug", + "color": "d73a4a", + "default": true, + "description": "Something isn't working" + }, + { + "id": 2202756058, + "node_id": "MDU6TGFiZWwyMjAyNzU2MDU4", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Parquet", + "name": "Parquet", + "color": "e5ca72", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-01-19T23:43:37Z", + "updated_at": "2022-03-30T12:51:14Z", + "closed_at": "2022-03-30T12:51:14Z", + "author_association": "NONE", + "active_lock_reason": null, + 
"body": "As reported here https://github.com/facebookincubator/velox/issues/895\r\nParquet reader seems to have a bug in its `TableFilterType::CONJUNCTION_OR` implementation\r\nFile: https://github.com/facebookincubator/velox/blob/main/velox/dwio/parquet/tests/examples/nation.parquet\r\nReproducer:\r\n```\r\nint main(int argc, char** argv) {\r\n DuckDB db(nullptr);\r\n Connection con(db);\r\n auto result = con.Query(\"PRAGMA enable_profiling; SELECT * FROM 'nation.parquet' WHERE name='CANADA' OR NAME = 'UNITED KINGDOM'\");\r\n result->Print();\r\n}\r\n```\r\n```\r\n┌───────────────────────────┐\r\n│ PROJECTION │\r\n│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │\r\n│ nationkey │\r\n│ name │\r\n│ regionkey │\r\n│ comment │\r\n│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │\r\n│ 2 │\r\n│ (0.00s) │\r\n└─────────────┬─────────────┘ \r\n┌─────────────┴─────────────┐\r\n│ FILTER │\r\n│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │\r\n│(name=CANADA OR name=UNITED│\r\n│ KINGDOM) │\r\n│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │\r\n│ 2 │\r\n│ (0.00s) │\r\n└─────────────┬─────────────┘ \r\n┌─────────────┴─────────────┐\r\n│ PARQUET_SCAN │\r\n│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │\r\n│ name │\r\n│ nationkey │\r\n│ regionkey │\r\n│ comment │\r\n│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │\r\n│ Filters: name=CANADA OR │\r\n│ name=UNITED KINGDOM │\r\n│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │\r\n│ 25 │\r\n│ (0.00s) │\r\n└───────────────────────────┘\r\n ```", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2955/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2955/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2922", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2922/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2922/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2922/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2922", + "id": 1101976153, + "node_id": "I_kwDOCEU65s5BrtJZ", + "number": 2922, + "title": "Feature request: Flatten for nested arrays", + "user": { + "login": "pgabry", + "id": 85938293, + "node_id": "MDQ6VXNlcjg1OTM4Mjkz", + "avatar_url": "https://avatars.githubusercontent.com/u/85938293?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pgabry", + "html_url": "https://github.com/pgabry", + "followers_url": "https://api.github.com/users/pgabry/followers", + "following_url": "https://api.github.com/users/pgabry/following{/other_user}", + "gists_url": "https://api.github.com/users/pgabry/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pgabry/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pgabry/subscriptions", + "organizations_url": "https://api.github.com/users/pgabry/orgs", + "repos_url": "https://api.github.com/users/pgabry/repos", + "events_url": "https://api.github.com/users/pgabry/events{/privacy}", + "received_events_url": "https://api.github.com/users/pgabry/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002067, + "node_id": "MDU6TGFiZWw5NzYwMDIwNjc=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/good%20first%20issue", + "name": "good first issue", + "color": "7057ff", + "default": true, + "description": "Good for newcomers" + }, + { + 
"id": 3551237865, + "node_id": "LA_kwDOCEU65s7Tq5bp", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/feature", + "name": "feature", + "color": "3FD3C4", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-01-13T16:01:27Z", + "updated_at": "2022-10-11T08:42:17Z", + "closed_at": "2022-10-11T08:42:17Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "In Big Query there is a function array_concat_agg that aggregates array fields by concatenating the arrays. In Snowflake there is a flatten function that can unnest nested arrays into single array. I am looking for similar functionality in duckdb. \r\n\r\n`select flatten([[1, 2], [2, 3], [4, 5]]` would return `[1, 2, 2, 3, 4, 5]`\r\nI would also need a distinct option:\r\n`select flatten(DISTINCT [[1, 2], [2, 3], [4, 5]]` would return `[1, 2, 3, 4, 5]`\r\n\r\nAlternatively an equivalent to array_concat_agg from the Big Query would be sufficient.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2922/reactions", + "total_count": 15, + "+1": 15, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2922/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2879", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2879/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2879/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2879/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2879", + "id": 1096273246, + "node_id": "I_kwDOCEU65s5BV81e", + "number": 2879, + "title": "Install R package duckdb from GitHub fails", + "user": { + "login": "HanOostdijk", + "id": 13506389, + "node_id": "MDQ6VXNlcjEzNTA2Mzg5", + "avatar_url": "https://avatars.githubusercontent.com/u/13506389?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/HanOostdijk", + "html_url": "https://github.com/HanOostdijk", + "followers_url": "https://api.github.com/users/HanOostdijk/followers", + "following_url": "https://api.github.com/users/HanOostdijk/following{/other_user}", + "gists_url": "https://api.github.com/users/HanOostdijk/gists{/gist_id}", + "starred_url": "https://api.github.com/users/HanOostdijk/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/HanOostdijk/subscriptions", + "organizations_url": "https://api.github.com/users/HanOostdijk/orgs", + "repos_url": "https://api.github.com/users/HanOostdijk/repos", + "events_url": "https://api.github.com/users/HanOostdijk/events{/privacy}", + "received_events_url": "https://api.github.com/users/HanOostdijk/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 1269662541, + "node_id": "MDU6TGFiZWwxMjY5NjYyNTQx", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/R", + "name": "R", + "color": "6ddb99", + "default": false, + "description": "R integration" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2022-01-07T12:25:41Z", + "updated_at": "2022-08-01T13:22:57Z", + "closed_at": 
"2022-02-16T19:23:26Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "#### What happens?\r\nInstall R package duckdb from GitHub fails: no DLL was created (details in attachment)\r\n\r\n#### To Reproduce\r\ninstall.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)\r\n\r\n#### Environment (please complete the following information):\r\n sessionInfo()\r\nR version 4.1.1 (2021-08-10)\r\nPlatform: x86_64-w64-mingw32/x64 (64-bit)\r\nRunning under: Windows 10 x64 (build 19043)\r\n\r\n#### Before Submitting\r\n\r\n- [ X] **Have you tried this on the latest `master` branch?**\r\n* **R**: `install.packages(\"https://github.com/duckdb/duckdb/releases/download/master-builds/duckdb_r_src.tar.gz\", repos = NULL)`\r\n\r\n\r\n- [ X] **Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?**\r\n\r\nToday I tried to install the latest version. The previous versie 0.3.2-dev1 from 2021-11-16 installed from GitHub without problems.\r\nCompiler/linker messages in attachment:\r\n[install_R_package_duckdb_2022-01-07.txt](https://github.com/duckdb/duckdb/files/7828755/install_R_package_duckdb_2022-01-07.txt)\r\n\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2879/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2879/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2548", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2548/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2548/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2548/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2548", + "id": 1045880692, + "node_id": "I_kwDOCEU65s4-Vt90", + "number": 2548, + "title": "SORTED Constraint", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-11-05T13:57:32Z", + "updated_at": "2022-03-15T20:03:37Z", + "closed_at": null, + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "We had the 
idea of allowing a `SORTED` constraint to be specified on a column, e.g.:\r\n\r\n```sql\r\nCREATE TABLE sensor_data(\r\n ts TIMESTAMP,\r\n measurement INTEGER,\r\n ...,\r\n SORTED(ts)\r\n);\r\n```\r\n\r\nThe sorted constraint enforces that the data is inserted in sorted order w.r.t. any columns that have a sorted constraint on them (i.e. inserting data in unsorted order throws an error), updating the columns is not allowed.\r\n\r\nThe sorted constraint can then be propagated through the query plan as part of the statistics propagation, and can be used to optimize e.g. window functions and merge joins (to avoid unnecessarily sorting already sorted data), and potentially even to optimize aggregates (in case of aggregates on a subset of the sorted column, such as e.g. grouping by `YEAR(ts), MONTH(ts)`).\r\n\r\nWe could also track \"accidental\" sortedness during insertion and propagate the same sortedness if columns happen to be sorted, but having a constraint to enforce this behavior and prevent surprises seems like a good idea. ", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2548/reactions", + "total_count": 11, + "+1": 11, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2548/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2539", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2539/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2539/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2539/events", + "html_url": "https://github.com/duckdb/duckdb/pull/2539", + "id": 1044736532, + "node_id": "PR_kwDOCEU65s4uF4k2", + "number": 2539, + "title": "Restructuring CI Workflow", + "user": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-11-04T12:59:29Z", + "updated_at": "2021-11-09T16:09:58Z", + "closed_at": "2021-11-06T08:38:58Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/2539", + "html_url": "https://github.com/duckdb/duckdb/pull/2539", + "diff_url": "https://github.com/duckdb/duckdb/pull/2539.diff", + "patch_url": 
"https://github.com/duckdb/duckdb/pull/2539.patch", + "merged_at": "2021-11-06T08:38:58Z" + }, + "body": "This PR has t2o main changes on how we run our CI workflow.\r\n\r\n1) **Instead of having one workflow, I broke it into multiple workflows**, the main advantage here is that if a workflow breaks for something unrelated to code changes, we can re-run the broken workflow instead of the whole workflow. \r\nSince we now have multiple workflows, each of these workflows has a job dependency ( one job that must be executed before running the remaining jobs). In general, I've tried to use the fastest/most reliable job of that workflow, as a sanity check before running the remainder.\r\n\r\n2) The Python jobs were frequently running out of time since they were executing many python versions inside the same job, I've added a new matrix parameter on those to alleviate this issue.\r\n\r\nI'm happy to make any changes to this suggested workflow structure. \r\n\r\nThis is the current suggested structure: (Main job in bold)\r\n1. Main: Basically Main Engine Linux Jobs + Pot-Pourri jobs (e.g., format checker) **(linux-debug)**\r\n2. Linux Release: Jobs that run on the Linux Release Version **(linux-release-64)**\r\n3. NodeJS: NodeJS jobs **(linux-nodejs)**\r\n4. OSX : Main Engine OSX jobs **(xcode-debug)**\r\n5. Python : Python Wheel of all OSs jobs **(linux-python3-9)**\r\n6. R : R jobs **(rstats-linux)**\r\n7. Windows : Main Engine Windows jobs **(win-release-64)**", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2539/reactions", + "total_count": 2, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 1, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2539/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2379", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2379/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2379/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2379/events", + "html_url": "https://github.com/duckdb/duckdb/pull/2379", + "id": 1019592413, + "node_id": "PR_kwDOCEU65s4s3vdG", + "number": 2379, + "title": "Enable building for Python 3.10", + "user": { + "login": "Mause", + "id": 1405026, + "node_id": "MDQ6VXNlcjE0MDUwMjY=", + "avatar_url": "https://avatars.githubusercontent.com/u/1405026?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mause", + "html_url": "https://github.com/Mause", + "followers_url": "https://api.github.com/users/Mause/followers", + "following_url": "https://api.github.com/users/Mause/following{/other_user}", + "gists_url": "https://api.github.com/users/Mause/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mause/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mause/subscriptions", + "organizations_url": "https://api.github.com/users/Mause/orgs", + "repos_url": "https://api.github.com/users/Mause/repos", + "events_url": "https://api.github.com/users/Mause/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mause/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": 
"2021-10-07T04:11:19Z", + "updated_at": "2021-11-29T06:32:37Z", + "closed_at": "2021-11-29T06:14:44Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/2379", + "html_url": "https://github.com/duckdb/duckdb/pull/2379", + "diff_url": "https://github.com/duckdb/duckdb/pull/2379.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/2379.patch", + "merged_at": null + }, + "body": null, + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2379/reactions", + "total_count": 1, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2379/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2281", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2281/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2281/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2281/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2281", + "id": 996518696, + "node_id": "I_kwDOCEU65s47Zaso", + "number": 2281, + "title": "Group-By on col with null value causes seg fault", + "user": { + "login": "scottee", + "id": 1014942, + "node_id": "MDQ6VXNlcjEwMTQ5NDI=", + "avatar_url": "https://avatars.githubusercontent.com/u/1014942?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/scottee", + "html_url": "https://github.com/scottee", + "followers_url": "https://api.github.com/users/scottee/followers", + "following_url": "https://api.github.com/users/scottee/following{/other_user}", + "gists_url": "https://api.github.com/users/scottee/gists{/gist_id}", + "starred_url": "https://api.github.com/users/scottee/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/scottee/subscriptions", + "organizations_url": "https://api.github.com/users/scottee/orgs", + "repos_url": "https://api.github.com/users/scottee/repos", + "events_url": "https://api.github.com/users/scottee/events{/privacy}", + "received_events_url": "https://api.github.com/users/scottee/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-09-14T23:30:25Z", + "updated_at": "2021-09-22T20:56:49Z", + "closed_at": "2021-09-16T14:02:14Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "A group-by query on a pandas df which contains empty values causes a seg fault. This causes us to not be able to use duckdb. I'm using version 0.2.9.\r\n\r\nThe csv is:\r\n\r\n what,is_control,is_test\r\n ,0,0\r\n foo,1,0\r\n\r\nThe query is:\r\n\r\n select what, count(*) from c group by what\r\n\r\nThe python code is below. 
I don't know where in this the seg fault occurs:\r\n\r\n conn.register(\"c\", df)\r\n conn.execute(sql)\r\n df_result = conn.fetchdf()\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2281/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2281/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2252", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2252/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2252/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2252/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2252", + "id": 989095440, + "node_id": "MDU6SXNzdWU5ODkwOTU0NDA=", + "number": 2252, + "title": "Sorting ignoring Null Value Order for Floats", + "user": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-09-06T11:57:37Z", + "updated_at": "2021-09-10T15:46:31Z", + "closed_at": "2021-09-10T07:29:00Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "body": "**What does happen?**\r\nThe unoptimized and optimized results don't match in the sort order.\r\n\r\n**To Reproduce**\r\nThe following test case should result in the issue:\r\n\r\n```sql\r\nstatement ok\r\nPRAGMA enable_verification\r\n\r\nstatement ok\r\ncreate table tbl_1 (a float, b float)\r\n\r\nstatement ok\r\ninsert into tbl_1 values (1,NULL),(2,3),(NULL,NULL)\r\n\r\nstatement ok\r\ncreate table tbl_2 (b float)\r\n\r\nstatement ok\r\ninsert into tbl_2 values (1),(2),(NULL)\r\n\r\nquery II\r\nselect a,tbl_2.b from tbl_1 inner join tbl_2 on (a IS DISTINCT FROM tbl_2.b) order by a,tbl_2.b\r\n----\r\nNULL\t1\r\nNULL\t2\r\n1\tNULL\r\n1\t2\r\n2\tNULL\r\n2\t1 \r\n```", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2252/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2252/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2127", + "repository_url": 
"https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2127/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2127/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2127/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2127", + "id": 966244970, + "node_id": "MDU6SXNzdWU5NjYyNDQ5NzA=", + "number": 2127, + "title": "duckdb CLI: parquet_scan reading from '/dev/stdin'", + "user": { + "login": "mskyttner", + "id": 1715840, + "node_id": "MDQ6VXNlcjE3MTU4NDA=", + "avatar_url": "https://avatars.githubusercontent.com/u/1715840?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/mskyttner", + "html_url": "https://github.com/mskyttner", + "followers_url": "https://api.github.com/users/mskyttner/followers", + "following_url": "https://api.github.com/users/mskyttner/following{/other_user}", + "gists_url": "https://api.github.com/users/mskyttner/gists{/gist_id}", + "starred_url": "https://api.github.com/users/mskyttner/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/mskyttner/subscriptions", + "organizations_url": "https://api.github.com/users/mskyttner/orgs", + "repos_url": "https://api.github.com/users/mskyttner/repos", + "events_url": "https://api.github.com/users/mskyttner/events{/privacy}", + "received_events_url": "https://api.github.com/users/mskyttner/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-08-11T08:39:47Z", + "updated_at": "2021-08-11T12:55:23Z", + "closed_at": "2021-08-11T12:55:23Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "**What does happen?**\r\n\r\nIs parquet_scan able to read from the '/dev/stdin' file? \r\n\r\nThis seems to work:\r\n\r\n`duckdb :memory: \"select * from parquet_scan('bottomT_annuals_0p100.parquet') limit 10;\"`\r\n\r\nBut this doesn't:\r\n\r\n`cat bottomT_annuals_0p100.parquet | duckdb :memory: \"select * from parquet_scan('/dev/stdin');\"`\r\n\r\n`Error: Invalid Input Error: File '/dev/stdin' too small to be a Parquet file`\r\n\r\n**What should happen?**\r\n\r\nThe parquet_scan function should preferably read the parquet file sent to /dev/stdin, similarly to how it reads parquet files from S3 locations.\r\n\r\n**To Reproduce**\r\nSteps to reproduce the behavior. Bonus points if those are only SQL queries.\r\n\r\nFirst make sure you have a locally stored .parquet file (for example generated by duckdb), named for example mydata.parquet.\r\n\r\n1. 
`cat mydata.parquet | duckdb :memory: \"select * from parquet_scan('/dev/stdin');\"`\r\n\r\n**Environment (please complete the following information):**\r\n - OS: Linux\r\n - DuckDB Version 0.2.8", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2127/reactions", + "total_count": 1, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2127/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2117", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2117/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2117/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2117/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2117", + "id": 963554544, + "node_id": "MDU6SXNzdWU5NjM1NTQ1NDQ=", + "number": 2117, + "title": "Feature request - ORDER BY for list() and string_agg() aggregate functions", + "user": { + "login": "akdor1154", + "id": 6732831, + "node_id": "MDQ6VXNlcjY3MzI4MzE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6732831?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/akdor1154", + "html_url": "https://github.com/akdor1154", + "followers_url": "https://api.github.com/users/akdor1154/followers", + "following_url": "https://api.github.com/users/akdor1154/following{/other_user}", + "gists_url": "https://api.github.com/users/akdor1154/gists{/gist_id}", + "starred_url": "https://api.github.com/users/akdor1154/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/akdor1154/subscriptions", + "organizations_url": "https://api.github.com/users/akdor1154/orgs", + "repos_url": "https://api.github.com/users/akdor1154/repos", + "events_url": "https://api.github.com/users/akdor1154/events{/privacy}", + "received_events_url": "https://api.github.com/users/akdor1154/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-08-09T00:42:19Z", + "updated_at": "2021-10-08T06:19:36Z", + "closed_at": "2021-09-22T12:09:11Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "Hey all,\r\nAssuming I haven't misread doco, I don't think there is a way to specify internal ordering of of the list() and string_agg() aggregate functions.\r\n\r\nExample:\r\n```sql\r\nwith t as (\r\n values ('a',1), ('c',3), ('b',2)\r\n)\r\nselect list(col0 order by col1) from t\r\n```\r\n`RuntimeError: Parser Error: ORDER BY is not implemented for aggregates`\r\n\r\nIn this example I nicked the syntax from Postgres as I thought maybe you'd have implemented it anyway and not told anybody :)", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2117/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2117/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2054", + "repository_url": 
"https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2054/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2054/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2054/events", + "html_url": "https://github.com/duckdb/duckdb/pull/2054", + "id": 953857888, + "node_id": "MDExOlB1bGxSZXF1ZXN0Njk3ODM5MDQy", + "number": 2054, + "title": "Filter/Projection Pushdown to Arrow Scans", + "user": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-07-27T13:00:16Z", + "updated_at": "2021-07-28T19:54:12Z", + "closed_at": "2021-07-28T19:54:12Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/2054", + "html_url": "https://github.com/duckdb/duckdb/pull/2054", + "diff_url": "https://github.com/duckdb/duckdb/pull/2054.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/2054.patch", + "merged_at": "2021-07-28T19:54:12Z" + }, + "body": "This PR implements Filter and Projection Pushdown directly to arrow scans (For both Python/R APIs).\r\n\r\nIn addition, this PR also implements a fetch arrow function in R (similar to the one that exists in Python) to transform DuckDB query results into Arrow Table \r\n```R\r\n arrow_table <-duckdb::duckdb_fetch_arrow(dbSendQuery(con, \"SELECT * FROM test\", arrow=TRUE))) \r\n```\r\n\r\ncc @jonkeane @nealrichardson", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2054/reactions", + "total_count": 1, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2054/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2033", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/2033/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/2033/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/2033/events", + "html_url": "https://github.com/duckdb/duckdb/issues/2033", + "id": 950813232, + "node_id": "MDU6SXNzdWU5NTA4MTMyMzI=", + "number": 2033, + "title": "parquet_scan is not available in CLI OSX binaries for versions 0.26 and 0.27", + "user": { 
+ "login": "rberenguel", + "id": 2410938, + "node_id": "MDQ6VXNlcjI0MTA5Mzg=", + "avatar_url": "https://avatars.githubusercontent.com/u/2410938?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/rberenguel", + "html_url": "https://github.com/rberenguel", + "followers_url": "https://api.github.com/users/rberenguel/followers", + "following_url": "https://api.github.com/users/rberenguel/following{/other_user}", + "gists_url": "https://api.github.com/users/rberenguel/gists{/gist_id}", + "starred_url": "https://api.github.com/users/rberenguel/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/rberenguel/subscriptions", + "organizations_url": "https://api.github.com/users/rberenguel/orgs", + "repos_url": "https://api.github.com/users/rberenguel/repos", + "events_url": "https://api.github.com/users/rberenguel/events{/privacy}", + "received_events_url": "https://api.github.com/users/rberenguel/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-07-22T15:52:53Z", + "updated_at": "2022-04-30T11:16:04Z", + "closed_at": "2022-04-30T11:16:04Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "**What does happen?**\r\nCan't select, create view or copy from Parquet file, with error \r\n\r\n```\r\nError: Catalog Error: Table Function with name parquet_scan does not exist!\r\nDid you mean \"arrow_scan\"?\r\nLINE 1: CREATE VIEW events AS SELECT * FROM parquet_scan('ev.parquet');\r\n```\r\n\r\n**What should happen?**\r\nAccording to the documentation ([here](https://duckdb.org/docs/data/parquet)) I would expect a view to be created over this Parquet file (or copy, or insert, depending on command, none work obviously). This works correctly in at least 0.25 and 0.28 (master) locally.\r\n\r\n**To Reproduce**\r\nSteps to reproduce the behavior. Bonus points if those are only SQL queries.\r\n1. Install (or download and execute) duckdb versions 0.26 or 0.27 from GitHub, the MacOS CLI binary in particular.\r\n2. Execute the above query (will fail before even reaching for the file so no problem about not having it)\r\n\r\n**Environment (please complete the following information):**\r\n - OS: Mac OS Big Sur 11.4\r\n - DuckDB Version 0.26 and 0.27\r\n\r\n**Before submitting**\r\n- [*] Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\r\n- [*] Have you tried this on the latest `master` branch? 
In case you cannot compile, you may find some binaries here: https://github.com/duckdb/duckdb/releases/tag/master-builds\r\n\r\nThis is working in 0.28 (master binary above) but failing in 0.26 and 0.27", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/2033/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/2033/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1973", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1973/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1973/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1973/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1973", + "id": 940537549, + "node_id": "MDU6SXNzdWU5NDA1Mzc1NDk=", + "number": 1973, + "title": "Indexes for sorted columns?", + "user": { + "login": "dforsber", + "id": 7554666, + "node_id": "MDQ6VXNlcjc1NTQ2NjY=", + "avatar_url": "https://avatars.githubusercontent.com/u/7554666?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/dforsber", + "html_url": "https://github.com/dforsber", + "followers_url": "https://api.github.com/users/dforsber/followers", + "following_url": "https://api.github.com/users/dforsber/following{/other_user}", + "gists_url": "https://api.github.com/users/dforsber/gists{/gist_id}", + "starred_url": "https://api.github.com/users/dforsber/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/dforsber/subscriptions", + "organizations_url": "https://api.github.com/users/dforsber/orgs", + "repos_url": "https://api.github.com/users/dforsber/repos", + "events_url": "https://api.github.com/users/dforsber/events{/privacy}", + "received_events_url": "https://api.github.com/users/dforsber/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": 
"https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": "https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2021-07-09T08:21:09Z", + "updated_at": "2021-08-31T08:14:57Z", + "closed_at": "2021-08-31T08:14:56Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "Is it possible to have indexes for sorted column ordering (top N)?\r\n\r\nSo, that e.g. a query of `CREATE INDEX idx_col1 t(col1); SELECT * FROM t ORDERY BY col1 LIMIT 2` would be using the index to pick up the first entries from it, instead of doing live sorting of `col1`.\r\n\r\nIf not, is there a way to implement similarly efficient JOIN that would use e.g. rowid from a table that already has sorted `col1`. Like `CREATE TABLE col1_sorted AS SELECT rowid AS rownum, col1 FROM t ORDER BY col1`, and then have a JOIN query that would do the Top N but with additional filtering, like `SELECT * FROM col1_sorted JOIN (SELECT * FROM t AS main WHERE col2 > 10 AND main.id=col1_sorted.id) LIMIT 10`. Sorry, the SQL is probably wrong and the sort order may not be preserved by the JOIN from the `col1_sorted`, but hopefully it clarifies the point :).", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1973/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1973/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1932", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1932/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1932/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1932/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1932", + "id": 932957765, + "node_id": "MDU6SXNzdWU5MzI5NTc3NjU=", + "number": 1932, + "title": "Invalid string encoding aborts Parquet import", + "user": { + "login": "erikcw", + "id": 113129, + "node_id": "MDQ6VXNlcjExMzEyOQ==", + "avatar_url": "https://avatars.githubusercontent.com/u/113129?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/erikcw", + "html_url": "https://github.com/erikcw", + "followers_url": "https://api.github.com/users/erikcw/followers", + "following_url": "https://api.github.com/users/erikcw/following{/other_user}", + "gists_url": "https://api.github.com/users/erikcw/gists{/gist_id}", + "starred_url": "https://api.github.com/users/erikcw/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/erikcw/subscriptions", + "organizations_url": "https://api.github.com/users/erikcw/orgs", + "repos_url": "https://api.github.com/users/erikcw/repos", + "events_url": "https://api.github.com/users/erikcw/events{/privacy}", + "received_events_url": 
"https://api.github.com/users/erikcw/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2021-06-29T17:44:16Z", + "updated_at": "2021-07-01T15:55:41Z", + "closed_at": "2021-07-01T15:55:41Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "I'm trying to import a series of parquet files into duckdb. Unfortunately, the import aborts with the following error:\r\n\r\n```python\r\nIn [51]: cursor.execute(\"INSERT INTO master_email SELECT * FROM 'master_email/*.parquet'\").df() \r\n --------------------------------------------------------------------------- RuntimeError Traceback (most recent call last) \r\n in \r\n----> 1 cursor.execute(\"INSERT INTO master_email SELECT * FROM 'master_email/*.parquet'\").df() \r\n \r\nRuntimeError: INTERNAL Error: Invalid string encoding found in Parquet file: value is not valid UTF8! 
\r\n\r\n```\r\n\r\nPandas (via pyarrow), Spark and Presto all read these same parquet files without issue.\r\n\r\nWould be nice to have the option to either ignore invalid rows, or at least include details about the bad record in the error so they can be manually cleaned up.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1932/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1932/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1898", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1898/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1898/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1898/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1898", + "id": 928394903, + "node_id": "MDU6SXNzdWU5MjgzOTQ5MDM=", + "number": 1898, + "title": "Can't connect to existing database with RStudio", + "user": { + "login": "tom-christie", + "id": 5084541, + "node_id": "MDQ6VXNlcjUwODQ1NDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/5084541?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/tom-christie", + "html_url": "https://github.com/tom-christie", + "followers_url": "https://api.github.com/users/tom-christie/followers", + "following_url": "https://api.github.com/users/tom-christie/following{/other_user}", + "gists_url": "https://api.github.com/users/tom-christie/gists{/gist_id}", + "starred_url": "https://api.github.com/users/tom-christie/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/tom-christie/subscriptions", + "organizations_url": "https://api.github.com/users/tom-christie/orgs", + "repos_url": "https://api.github.com/users/tom-christie/repos", + "events_url": "https://api.github.com/users/tom-christie/events{/privacy}", + "received_events_url": "https://api.github.com/users/tom-christie/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "open", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-06-23T15:38:43Z", + "updated_at": "2021-07-25T21:51:19Z", + "closed_at": null, + "author_association": "NONE", + "active_lock_reason": null, + "body": "**What does happen?**\r\nI can't connect to an existing database using R. 
When I try, I get this error:\r\n\r\n > conn <- dbConnect(duckdb::duckdb(), \"~/test.duckdb\")\r\n Error in l$contains : $ operator is invalid for atomic vectors\r\n\r\n**What should happen?**\r\n`conn` should be an object I can use to execute queries...but the error causes no `conn` object to be created.\r\n\r\n**To Reproduce**\r\n```\r\n conn <- dbConnect(duckdb::duckdb(), \"~/test.duckdb\")\r\n df <- data.frame(a=c(1,2,3),\r\n b=c('1','2','3'))\r\n dbWriteTable(conn, \"df\", df)\r\n dbDisconnect(conn, shutdown=TRUE) \r\n rm(conn)\r\n conn <- dbConnect(duckdb::duckdb(), \"~/test.duckdb\")\r\n```\r\n\r\n**Environment (please complete the following information):**\r\n - OS: MacOS 10.15.7\r\n - DuckDB Version: 0.2.7\r\n - R Version: 4.0.4\r\n\r\n\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1898/reactions", + "total_count": 1, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1898/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1834", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1834/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1834/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1834/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1834", + "id": 909552736, + "node_id": "MDU6SXNzdWU5MDk1NTI3MzY=", + "number": 1834, + "title": "Deleting with DELETE USING causes a segmentation fault", + "user": { + "login": "szarnyasg", + "id": 1402801, + "node_id": "MDQ6VXNlcjE0MDI4MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/1402801?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/szarnyasg", + "html_url": "https://github.com/szarnyasg", + "followers_url": "https://api.github.com/users/szarnyasg/followers", + "following_url": "https://api.github.com/users/szarnyasg/following{/other_user}", + "gists_url": "https://api.github.com/users/szarnyasg/gists{/gist_id}", + "starred_url": "https://api.github.com/users/szarnyasg/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/szarnyasg/subscriptions", + "organizations_url": "https://api.github.com/users/szarnyasg/orgs", + "repos_url": "https://api.github.com/users/szarnyasg/repos", + "events_url": "https://api.github.com/users/szarnyasg/events{/privacy}", + "received_events_url": "https://api.github.com/users/szarnyasg/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-06-02T14:16:53Z", + "updated_at": "2021-06-03T13:10:22Z", + "closed_at": "2021-06-03T09:59:34Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "**What does happen?**\r\nI am performing deletes with the `DELETE ... USING ...` (#1659) command. 
In some cases, it returns with a segmentation fault.\r\n\r\nI have also seen a few `Conflict on tuple deletion!` but the root cause of the two seems to be the same (sometimes, running the same script multiple times returns either a segfault or a conflict).\r\n\r\n**What should happen?**\r\nThe system should perform the delete operation on the tuples matching the condition.\r\n\r\n**To Reproduce**\r\n1. Grab the archive containing the data and the script from https://surfdrive.surf.nl/files/index.php/s/VA92agRErNV2EJl\r\n2. Run the SQL script:\r\n ```bash\r\n cat delete-segfault-mwe.sql | sed \"s#PATHVAR#`pwd`/data#\" | duckdb\r\n ```\r\n\r\n It returns a `Segmentation fault (core dumped)`\r\n\r\n**Environment (please complete the following information):**\r\n - OS: Fedora 34\r\n - DuckDB Version: 0.2.6, master (979f14ed48b865e0f775b2898640757b6302f4e7)\r\n\r\nThe content of the SQL script is the following:\r\n```sql\r\nCREATE TABLE Person_likes_Comment (creationDate timestamp without time zone not null, id bigint not null, likes_Comment bigint not null);\r\nCREATE TABLE Person_Delete_candidates (deletionDate timestamp without time zone not null, id bigint);\r\n\r\nCOPY Person_likes_Comment FROM 'PATHVAR/Person_likes_Comment.csv' (DELIMITER '|', TIMESTAMPFORMAT '%Y-%m-%dT%H:%M:%S.%g+00:00');\r\nCOPY Person_Delete_candidates FROM 'PATHVAR/Person_Delete_candidates.csv' (DELIMITER '|', HEADER, TIMESTAMPFORMAT '%Y-%m-%dT%H:%M:%S.%g+00:00');\r\n\r\nDELETE FROM Person_likes_Comment USING Person_Delete_candidates WHERE Person_Delete_candidates.id = Person_likes_Comment.id;\r\n```\r\n\r\nThe following query returns the tuples that should be deleted from the `Person_likes_Comment` (showing that there are no duplicates):\r\n```sql\r\nSELECT Person_likes_comment.* FROM Person_likes_Comment, Person_Delete_candidates WHERE Person_Delete_candidates.id = Person_likes_Comment.id;\r\n```\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1834/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1834/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1822", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1822/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1822/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1822/events", + "html_url": "https://github.com/duckdb/duckdb/pull/1822", + "id": 907427048, + "node_id": "MDExOlB1bGxSZXF1ZXN0NjU4Mjc2Njcw", + "number": 1822, + "title": "Python/JSON-Style Struct & List Syntax ", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + 
"organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-05-31T12:29:31Z", + "updated_at": "2021-07-19T07:27:47Z", + "closed_at": "2021-05-31T16:36:57Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/1822", + "html_url": "https://github.com/duckdb/duckdb/pull/1822", + "diff_url": "https://github.com/duckdb/duckdb/pull/1822.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/1822.patch", + "merged_at": "2021-05-31T16:36:57Z" + }, + "body": "This PR adds Python/JSON style syntax support for STRUCT and LIST entries, i.e. the following statements are now valid:\r\n\r\n```sql\r\nSELECT {'i': 3, 'j': 4};\r\n-- equivalent to STRUCT_PACK(i := 3, j := 4)\r\nSELECT [1,2,3];\r\n-- equivalent to LIST_VALUE(1, 2, 3)\r\nSELECT {'i': [1,2, 3], 'j': 4};\r\n-- equivalent to STRUCT_PACK(i := LIST_VALUE(1,2,3), j := 4)\r\n\r\n-- quotes are optional\r\nSELECT {i: 3, j: 4};\r\n-- double quotes are also allowed\r\nSELECT {\"i\": 3, \"j\": 4};\r\n```\r\n\r\nThis PR also changes the struct -> string conversion to follow the same style, so the string conversion can be round-tripped easily:\r\n\r\n```sql\r\nSELECT {'i': 3, 'j': 4}::VARCHAR;\r\n-- {'i': 3, 'j': 4}\r\n```", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1822/reactions", + "total_count": 2, + "+1": 2, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1822/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1732", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1732/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1732/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1732/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1732", + "id": 880441546, + "node_id": "MDU6SXNzdWU4ODA0NDE1NDY=", + "number": 1732, + "title": "Python client using a (hidden) global default connection instead of using the last created one.", + "user": { + "login": "pdet", + "id": 7377477, + "node_id": "MDQ6VXNlcjczNzc0Nzc=", + "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/pdet", + "html_url": "https://github.com/pdet", + "followers_url": "https://api.github.com/users/pdet/followers", + "following_url": "https://api.github.com/users/pdet/following{/other_user}", + "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}", + "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/pdet/subscriptions", + "organizations_url": "https://api.github.com/users/pdet/orgs", + "repos_url": "https://api.github.com/users/pdet/repos", + "events_url": 
"https://api.github.com/users/pdet/events{/privacy}", + "received_events_url": "https://api.github.com/users/pdet/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-05-08T07:33:08Z", + "updated_at": "2022-05-12T10:42:06Z", + "closed_at": "2022-05-12T10:42:06Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "body": "**What does happen?**\r\nThe problem is that the python client has this global default connection that exists in the background and is used before setting up a connection. From a user perspective, this is rather confusing and can cause undesired behaviors under the assumption you would be using the same connection.\r\nPerhaps we should change that so that it instead uses the last-created connection or something instead of this hidden global connection.\r\n\r\n**What should happen?**\r\nNot have python use this global default connection.\r\n\r\n**To Reproduce**\r\nIn this example we set the threads to 4 and force parallelism by using a connection from duckdb.connect(), however if we don't use duckdb_conn when creating the duckdb relation (last line), it will use the global default connection, that does not have the threads set to 4 or the force parallelism.\r\n```python\r\nduckdb_conn = duckdb.connect()\r\nduckdb_conn.execute(\"PRAGMA threads=4\")\r\nduckdb_conn.execute(\"PRAGMA force_parallelism\")\r\ndata = (pyarrow.array(np.random.randint(800, size=1000000), type=pyarrow.int32()))\r\ntbl = pyarrow.Table.from_batches(pyarrow.Table.from_arrays([data],['a']).to_batches(10000))\r\nrel = duckdb.from_arrow_table(tbl) \r\n```\r\n\r\n**Environment (please complete the following information):**\r\n - OS: Linux\r\n - DuckDB Version: Latest\r\n\r\n**Before submitting**\r\n- [x] Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?\r\n- [x] Have you tried this on the latest `master` branch? 
In case you cannot compile, you may find some binaries here: https://github.com/duckdb/duckdb/releases/tag/master-builds\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1732/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1732/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1719", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1719/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1719/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1719/events", + "html_url": "https://github.com/duckdb/duckdb/pull/1719", + "id": 877794135, + "node_id": "MDExOlB1bGxSZXF1ZXN0NjMxNjkxMDU2", + "number": 1719, + "title": "Add root expression profiler", + "user": { + "login": "azimafroozeh", + "id": 13484327, + "node_id": "MDQ6VXNlcjEzNDg0MzI3", + "avatar_url": "https://avatars.githubusercontent.com/u/13484327?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/azimafroozeh", + "html_url": "https://github.com/azimafroozeh", + "followers_url": "https://api.github.com/users/azimafroozeh/followers", + "following_url": "https://api.github.com/users/azimafroozeh/following{/other_user}", + "gists_url": "https://api.github.com/users/azimafroozeh/gists{/gist_id}", + "starred_url": "https://api.github.com/users/azimafroozeh/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/azimafroozeh/subscriptions", + "organizations_url": "https://api.github.com/users/azimafroozeh/orgs", + "repos_url": "https://api.github.com/users/azimafroozeh/repos", + "events_url": "https://api.github.com/users/azimafroozeh/events{/privacy}", + "received_events_url": "https://api.github.com/users/azimafroozeh/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-05-06T17:49:23Z", + "updated_at": "2021-05-17T10:35:18Z", + "closed_at": "2021-05-17T10:35:08Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/1719", + "html_url": "https://github.com/duckdb/duckdb/pull/1719", + "diff_url": "https://github.com/duckdb/duckdb/pull/1719.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/1719.patch", + "merged_at": "2021-05-17T10:35:08Z" + }, + "body": "#1495.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1719/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1719/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1627", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1627/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1627/comments", + "events_url": 
"https://api.github.com/repos/duckdb/duckdb/issues/1627/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1627", + "id": 859688439, + "node_id": "MDU6SXNzdWU4NTk2ODg0Mzk=", + "number": 1627, + "title": "Feature Request: Add support for offset parsing in timestamp string values", + "user": { + "login": "emres", + "id": 150102, + "node_id": "MDQ6VXNlcjE1MDEwMg==", + "avatar_url": "https://avatars.githubusercontent.com/u/150102?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/emres", + "html_url": "https://github.com/emres", + "followers_url": "https://api.github.com/users/emres/followers", + "following_url": "https://api.github.com/users/emres/following{/other_user}", + "gists_url": "https://api.github.com/users/emres/gists{/gist_id}", + "starred_url": "https://api.github.com/users/emres/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/emres/subscriptions", + "organizations_url": "https://api.github.com/users/emres/orgs", + "repos_url": "https://api.github.com/users/emres/repos", + "events_url": "https://api.github.com/users/emres/events{/privacy}", + "received_events_url": "https://api.github.com/users/emres/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2021-04-16T10:30:41Z", + "updated_at": "2021-04-26T06:12:22Z", + "closed_at": "2021-04-26T06:12:22Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": 
"Hello,\r\n\r\nI've exported a large PostgreSQL 12 table to a CSV file using the [PostgreSQL COPY](https://www.postgresql.org/docs/12/sql-copy.html) command, and I want to evaluate DuckDB's performance for some analytical queries for that data set.\r\n\r\nThis is an [IoT](https://en.wikipedia.org/wiki/Internet_of_things) type of sensor readings data and there's a field that stores the date and time information; the PostgreSQL data type of that field is `timestamp with time zone (timestamptz)` as described in \"[PostgreSQL: Documentation: 12: 8.5. Date/Time Types](https://www.postgresql.org/docs/12/datatype-datetime.html)\". \r\n\r\nPostgreSQL stores these values as [UTC](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) values and when I export the data to a CSV in my 'Europe/Brussels' time zone, a sample timestamp value in the CSV file is like:\r\n\r\n`2020-12-31 21:25:58.745232+02`\r\n\r\nthat is, with the **offset** added.\r\n\r\nWhen I check the TIMESTAMP functionality of DuckDB, I can see that it can parse timestamp values **without offset**, e.g. `'2020-12-31 21:25:58.745232'` :\r\n```\r\n\r\nD SELECT TIMESTAMP '2020-12-31 21:25:58.745232';\r\n┌───────────────────────────────────────────────┐\r\n│ CAST(2020-12-31 21:25:58.745232 AS TIMESTAMP) │\r\n├───────────────────────────────────────────────┤\r\n│ 2020-12-31 21:25:58.745232 │\r\n└───────────────────────────────────────────────┘\r\n```\r\n\r\nbut it can't do the similar operation for string values **including a time offset** such as `'2020-12-31 21:25:58.745232+02`:\r\n\r\n\r\n```\r\nD SELECT TIMESTAMP '2020-12-31 21:25:58.745232+02';\r\nError: Conversion Error: timestamp field value out of range: \"2020-12-31 21:25:58.745232+02\", expected format is (YYYY-MM-DD HH:MM:SS[.MS]\r\n```\r\n\r\nWhat do you think about this feature request? 
\r\n\r\nIf DuckDB had this feature, and if this was also supported during the automatic type inferring when importing CSV (as described in https://duckdb.org/docs/data/csv), it would be very straightforward to import CSV files generated by PostgreSQL from tables that have `timestamp with time zone (timestamptz)` fields.\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1627/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1627/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1624", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1624/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1624/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1624/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1624", + "id": 859445804, + "node_id": "MDU6SXNzdWU4NTk0NDU4MDQ=", + "number": 1624, + "title": "Crash when deleting rows", + "user": { + "login": "akdor1154", + "id": 6732831, + "node_id": "MDQ6VXNlcjY3MzI4MzE=", + "avatar_url": "https://avatars.githubusercontent.com/u/6732831?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/akdor1154", + "html_url": "https://github.com/akdor1154", + "followers_url": "https://api.github.com/users/akdor1154/followers", + "following_url": "https://api.github.com/users/akdor1154/following{/other_user}", + "gists_url": "https://api.github.com/users/akdor1154/gists{/gist_id}", + "starred_url": "https://api.github.com/users/akdor1154/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/akdor1154/subscriptions", + "organizations_url": "https://api.github.com/users/akdor1154/orgs", + "repos_url": "https://api.github.com/users/akdor1154/repos", + "events_url": "https://api.github.com/users/akdor1154/events{/privacy}", + "received_events_url": "https://api.github.com/users/akdor1154/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-04-16T04:34:05Z", + "updated_at": "2022-04-30T11:02:47Z", + "closed_at": "2022-04-30T11:02:46Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "Hiya,\r\nRan into this this morning.\r\nI have a table,\r\n```sql\r\ncreate table odap_tx (\r\n transaction_id text primary key,\r\n conformed_id text\r\n);\r\n```\r\n\r\nI also have a similar auxiliary table,\r\n```sql\r\ncreate table legacy_tx (\r\n transaction_id text primary key,\r\n new_consolidated_id text\r\n);\r\n```\r\n\r\nThe tables in question has about 6 million rows in them each. 
When I delete from `odap_tx`, I get a segfault:\r\n```sql\r\ndelete from odap_tx o\r\nwhere not exists (\r\n select l.transaction_id\r\n from legacy_tx l\r\n where l.transaction_id = o.transaction_id\r\n);\r\n```\r\n```\r\n#0 duckdb::VersionDeleteState::Delete (this=0x7fffffffa090, row_id=103424)\r\n at /home/jarrad/src/web/duckdb/src/storage/table/morsel_info.cpp:159\r\n#1 0x0000555556dafb98 in duckdb::MorselInfo::Delete (this=0x555557e05380, transaction=..., \r\n table=0x555557e4df40, row_ids=..., count=960)\r\n at /home/jarrad/src/web/duckdb/src/storage/table/morsel_info.cpp:140\r\n#2 0x00005555567b7362 in duckdb::DataTable::Delete (this=0x555557e4df40, table=..., \r\n context=..., row_identifiers=..., count=960)\r\n at /home/jarrad/src/web/duckdb/src/storage/data_table.cpp:842\r\n#3 0x0000555556d35935 in duckdb::PhysicalDelete::Sink (this=0x555557ea3570, context=..., \r\n state=..., lstate=..., input=...)\r\n at /home/jarrad/src/web/duckdb/src/execution/operator/persistent/physical_delete.cpp:27\r\n#4 0x00005555567985bb in duckdb::Pipeline::Execute (this=0x555557df98a0, task=...)\r\n at /home/jarrad/src/web/duckdb/src/parallel/pipeline.cpp:102\r\n#5 0x000055555679ae59 in duckdb::PipelineTask::Execute (this=0x555557ecb570)\r\n at /home/jarrad/src/web/duckdb/src/parallel/pipeline.cpp:28\r\n#6 0x0000555556796c7b in duckdb::Executor::Initialize (this=0x5555c2ae80b8, plan=0x555557ea3570)\r\n at /home/jarrad/src/web/duckdb/src/parallel/executor.cpp:49\r\n#7 0x000055555673f8bf in duckdb::ClientContext::ExecutePreparedStatement (this=0x5555c2ae7f40, \r\n lock=..., \r\n query=\"delete\\nfrom odap_tx o\\nwhere not exists (\\n select l.transaction_id\\n from legacy_tx l\\n where l.transaction_id = o.transaction_id\\n);\", \r\n statement_p=std::shared_ptr (use count 2, weak count 0) = {...}, bound_values=std::vector of length 0, capacity 0, allow_stream_result=false)\r\n at /home/jarrad/src/web/duckdb/src/main/client_context.cpp:218\r\n#8 0x000055555674124e in duckdb::ClientContext::RunStatementOrPreparedStatement (\r\n this=0x5555c2ae7f40, lock=..., \r\n query=\"delete\\nfrom odap_tx o\\nwhere not exists (\\n select l.transaction_id\\n from legacy_tx l\\n where l.transaction_id = o.transaction_id\\n);\", \r\n statement=std::unique_ptr = {...}, \r\n prepared=std::shared_ptr (use count 2, weak count 0) = {...}, \r\n values=0x555557eb01f8, allow_stream_result=false)\r\n at /home/jarrad/src/web/duckdb/src/main/client_context.cpp:395\r\n#9 0x0000555556740927 in duckdb::ClientContext::Execute (this=0x5555c2ae7f40, \r\n query=\"delete\\nfrom odap_tx o\\nwhere not exists (\\n select l.transaction_id\\n from legacy_tx l\\n where l.transaction_id = o.transaction_id\\n);\", \r\n prepared=std::shared_ptr (use count 2, weak count 0) = {...}, \r\n values=std::vector of length 0, capacity 0, allow_stream_result=false)\r\n at /home/jarrad/src/web/duckdb/src/main/client_context.cpp:334\r\n#10 0x0000555556748774 in duckdb::PreparedStatement::Execute (this=0x555557eaa930, \r\n values=std::vector of length 0, capacity 0, allow_stream_result=false)\r\n at /home/jarrad/src/web/duckdb/src/main/prepared_statement.cpp:45\r\n#11 0x0000555556419e54 in sqlite3_step (pStmt=0x555557eb01b0)\r\n at /home/jarrad/src/web/duckdb/tools/sqlite3_api_wrapper/sqlite3_api_wrapper.cpp:219\r\n#12 0x0000555556402a30 in exec_prepared_stmt_columnar (p=0x7fffffffcaa0, pStmt=0x555557eb01b0)\r\n at /home/jarrad/src/web/duckdb/tools/shell/shell.c:12708\r\n#13 0x0000555556403867 in exec_prepared_stmt (pArg=0x7fffffffcaa0, 
pStmt=0x555557eb01b0)\r\n at /home/jarrad/src/web/duckdb/tools/shell/shell.c:12884\r\n#14 0x0000555556404590 in shell_exec (pArg=0x7fffffffcaa0, \r\n zSql=0x555557eaa6f0 \"delete\\nfrom odap_tx o\\nwhere not exists (\\n select l.transaction_id\\n from legacy_tx l\\n where l.transaction_id = o.transaction_id\\n);\", \r\n pzErrMsg=0x7fffffffc928) at /home/jarrad/src/web/duckdb/tools/shell/shell.c:13200\r\n#15 0x00005555564131ff in runOneSqlLine (p=0x7fffffffcaa0, \r\n zSql=0x555557eaa6f0 \"delete\\nfrom odap_tx o\\nwhere not exists (\\n select l.transaction_id\\n from legacy_tx l\\n where l.transaction_id = o.transaction_id\\n);\", in=0x0, startline=1)\r\n at /home/jarrad/src/web/duckdb/tools/shell/shell.c:20022\r\n#16 0x0000555556413718 in process_input (p=0x7fffffffcaa0)\r\n at /home/jarrad/src/web/duckdb/tools/shell/shell.c:20122\r\n#17 0x000055555641528c in main (argc=2, argv=0x7fffffffdd98)\r\n at /home/jarrad/src/web/duckdb/tools/shell/shell.c:20906\r\n```\r\n\r\nThis is from a debug build of tag v0.2.5, and I noticed it in the 0.2.5 Python release (have not used previous released).\r\n\r\nThe table is created in the same session and in a fresh db as the deletion is attempted (so no cross version shenannigans).\r\n\r\nI can provide the database privately if needed, it's about 2GiB.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1624/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1624/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1617", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1617/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1617/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1617/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1617", + "id": 859058599, + "node_id": "MDU6SXNzdWU4NTkwNTg1OTk=", + "number": 1617, + "title": "Support for parquet fixed_len_byte_array columns", + "user": { + "login": "nelhage", + "id": 16725, + "node_id": "MDQ6VXNlcjE2NzI1", + "avatar_url": "https://avatars.githubusercontent.com/u/16725?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/nelhage", + "html_url": "https://github.com/nelhage", + "followers_url": "https://api.github.com/users/nelhage/followers", + "following_url": "https://api.github.com/users/nelhage/following{/other_user}", + "gists_url": "https://api.github.com/users/nelhage/gists{/gist_id}", + "starred_url": "https://api.github.com/users/nelhage/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/nelhage/subscriptions", + "organizations_url": "https://api.github.com/users/nelhage/orgs", + "repos_url": "https://api.github.com/users/nelhage/repos", + "events_url": "https://api.github.com/users/nelhage/events{/privacy}", + "received_events_url": "https://api.github.com/users/nelhage/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 976002061, + "node_id": "MDU6TGFiZWw5NzYwMDIwNjE=", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/enhancement", + "name": "enhancement", + "color": "a2eeef", + "default": true, + "description": "New feature or request" + }, + { + "id": 2202756058, + 
"node_id": "MDU6TGFiZWwyMjAyNzU2MDU4", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Parquet", + "name": "Parquet", + "color": "e5ca72", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2021-04-15T16:36:36Z", + "updated_at": "2021-04-19T14:24:37Z", + "closed_at": "2021-04-19T12:30:23Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "Attempting to read a `fixed_len_byte_array` column from a parquet file yields the very unhelpful error:\r\n\r\n```\r\nError: Not implemented Error: INVALID\r\n```\r\n\r\nReproducer:\r\n```\r\n$ curl -O https://nelhage.com/files/fixed.parquet\r\n % Total % Received % Xferd Average Speed Time Time Time Current\r\n Dload Upload Total Spent Left Speed\r\n100 135 100 135 0 0 480 0 --:--:-- --:--:-- --:--:-- 480\r\n$ duckdb test.duckdb \"select * from parquet_scan('fixed.parquet');\"\r\nError: Not implemented Error: INVALID\r\n```\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1617/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1617/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1570", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": 
"https://api.github.com/repos/duckdb/duckdb/issues/1570/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1570/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1570/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1570", + "id": 849568822, + "node_id": "MDU6SXNzdWU4NDk1Njg4MjI=", + "number": 1570, + "title": "Ignore Byte Order Mark (BOM) characters while csv loading ", + "user": { + "login": "chilarai", + "id": 425193, + "node_id": "MDQ6VXNlcjQyNTE5Mw==", + "avatar_url": "https://avatars.githubusercontent.com/u/425193?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/chilarai", + "html_url": "https://github.com/chilarai", + "followers_url": "https://api.github.com/users/chilarai/followers", + "following_url": "https://api.github.com/users/chilarai/following{/other_user}", + "gists_url": "https://api.github.com/users/chilarai/gists{/gist_id}", + "starred_url": "https://api.github.com/users/chilarai/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/chilarai/subscriptions", + "organizations_url": "https://api.github.com/users/chilarai/orgs", + "repos_url": "https://api.github.com/users/chilarai/repos", + "events_url": "https://api.github.com/users/chilarai/events{/privacy}", + "received_events_url": "https://api.github.com/users/chilarai/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-04-03T03:40:34Z", + "updated_at": "2021-10-25T19:27:42Z", + "closed_at": "2021-04-03T14:46:50Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "Many CSV files contain BOM at the start of the file ( hidden characters provided at the start of a CSV file to indicate the encoding type of the file). Now if such a CSV file is used to create a table using the following command, the first column in the table is always prepended by this BOM character. eg `ColumnName`\r\n\r\n```\r\nCREATE TABLE ontime AS SELECT * FROM read_csv_auto('test.csv');\r\n```\r\n\r\nIs there a way to ignore these characters while loading CSV files using the `read_csv_auto` method? 
Sample CSV file attached\r\n[People.zip](https://github.com/cwida/duckdb/files/6251859/People.zip)\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1570/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1570/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1558", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1558/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1558/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1558/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1558", + "id": 846952120, + "node_id": "MDU6SXNzdWU4NDY5NTIxMjA=", + "number": 1558, + "title": "SelectionVector::get_index should handle sel_vector == nullptr", + "user": { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-03-31T16:44:26Z", + "updated_at": "2021-07-15T18:00:43Z", + "closed_at": "2021-07-02T15:36:11Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "I was wondering whether we should change `SelectionVector::get_index` to pass through when `sel_vector` is `nullptr`. This would make `FlatVector::INCREMENTAL_SELECTION_VECTOR` obsolete and solve corner cases like [this one](https://github.com/cwida/duckdb/blob/a4340e84db5c36123e8bff4bb7dc4c05dc333c93/src/common/vector_operations/vector_copy.cpp#L213). This could also lead to some efficiencies if the loop tested for this condition and was able to perform more efficient batch operations like appending list or string dictionaries instead of extracting from them.\r\n\r\nThe only issue I can see is that it would add a test to every selection vector access, but in this century, the processor branch predictor should make that a non-issue. 
And if the `SelectionVector` was `const`, some compilers might promote the test out of the loop and effectively create two loops.\r\n\r\nI was also a bit concerned by the `SelectionVector(idx_t start, idx_t count)` method, which doesn't check that `count <= STANDARD_VECTOR_SIZE` or pass that into `Initialize`.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1558/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1558/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1551", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1551/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1551/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1551/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1551", + "id": 845075627, + "node_id": "MDU6SXNzdWU4NDUwNzU2Mjc=", + "number": 1551, + "title": "VectorOperations::Copy for LISTs grows buffers unnecessarily", + "user": { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-03-30T19:00:36Z", + "updated_at": "2021-04-01T13:07:03Z", + "closed_at": "2021-04-01T13:07:03Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "I ran into this while testing #1367 with `LIST`s. \r\n\r\nThe code for `VectorOperations::Copy` simply concatenates the two buffers and does some offset arithmetic. This is very wasteful when the caller is concatenating a small number of values through a selection vector. 
It is also not the way strings work, which seems inconsistent.", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1551/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1551/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1519", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1519/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1519/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1519/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1519", + "id": 836922948, + "node_id": "MDU6SXNzdWU4MzY5MjI5NDg=", + "number": 1519, + "title": "parquet file with hundreds of thousands of row groups crashes parquet_scan", + "user": { + "login": "Prussian1870", + "id": 6884766, + "node_id": "MDQ6VXNlcjY4ODQ3NjY=", + "avatar_url": "https://avatars.githubusercontent.com/u/6884766?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Prussian1870", + "html_url": "https://github.com/Prussian1870", + "followers_url": "https://api.github.com/users/Prussian1870/followers", + "following_url": "https://api.github.com/users/Prussian1870/following{/other_user}", + "gists_url": "https://api.github.com/users/Prussian1870/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Prussian1870/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Prussian1870/subscriptions", + "organizations_url": "https://api.github.com/users/Prussian1870/orgs", + "repos_url": "https://api.github.com/users/Prussian1870/repos", + "events_url": "https://api.github.com/users/Prussian1870/events{/privacy}", + "received_events_url": "https://api.github.com/users/Prussian1870/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 2202756058, + "node_id": "MDU6TGFiZWwyMjAyNzU2MDU4", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/Parquet", + "name": "Parquet", + "color": "e5ca72", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-03-20T22:19:50Z", + "updated_at": "2021-03-26T08:07:43Z", + "closed_at": "2021-03-26T08:07:43Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "Hi all:\r\n\r\nI'm working with python duckdb version 2.4.0 on windows 10.\r\n\r\nI have a 5 gig parquet file with 317,000 row groups. When I create a view based on the file(see below) it never returns. 
Note that PyArrow is able to read the file.\r\n\r\nCREATE VIEW jsonlogfile AS select * from parquet_scan('c:/working/testing/json.parquet')\r\n\r\nIs there a limit to the number of row groups?\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1519/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1519/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1511", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1511/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1511/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1511/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1511", + "id": 835198323, + "node_id": "MDU6SXNzdWU4MzUxOTgzMjM=", + "number": 1511, + "title": "The code needs more const", + "user": { + "login": "hawkfish", + "id": 13156216, + "node_id": "MDQ6VXNlcjEzMTU2MjE2", + "avatar_url": "https://avatars.githubusercontent.com/u/13156216?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hawkfish", + "html_url": "https://github.com/hawkfish", + "followers_url": "https://api.github.com/users/hawkfish/followers", + "following_url": "https://api.github.com/users/hawkfish/following{/other_user}", + "gists_url": "https://api.github.com/users/hawkfish/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hawkfish/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hawkfish/subscriptions", + "organizations_url": "https://api.github.com/users/hawkfish/orgs", + "repos_url": "https://api.github.com/users/hawkfish/repos", + "events_url": "https://api.github.com/users/hawkfish/events{/privacy}", + "received_events_url": "https://api.github.com/users/hawkfish/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-03-18T19:56:29Z", + "updated_at": "2021-09-21T18:35:15Z", + "closed_at": "2021-09-21T18:35:15Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "body": "As we move towards greater threading, it would be nice if we started using `const` more carefully. As of C++11, `const` really means \"thread safe\" and it would be great if we could start using it in objects that get passed around a lot.\r\n\r\nIn aid of this, it would be great if we could get rid of the formatting rule that require `const` instance variables to be in UPPERCASE. This does not improve readability IMHO and takes me back to the coding standards of the 1970s. In particular, this formatting rule makes moving code into a `Task` class somewhat onerous because you have to go and change all the cases of the variables. 
", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1511/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1511/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1302", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1302/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1302/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1302/events", + "html_url": "https://github.com/duckdb/duckdb/pull/1302", + "id": 787249407, + "node_id": "MDExOlB1bGxSZXF1ZXN0NTU1OTk2OTc3", + "number": 1302, + "title": "R package: Add ability to specify output timezone", + "user": { + "login": "ateucher", + "id": 2816635, + "node_id": "MDQ6VXNlcjI4MTY2MzU=", + "avatar_url": "https://avatars.githubusercontent.com/u/2816635?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/ateucher", + "html_url": "https://github.com/ateucher", + "followers_url": "https://api.github.com/users/ateucher/followers", + "following_url": "https://api.github.com/users/ateucher/following{/other_user}", + "gists_url": "https://api.github.com/users/ateucher/gists{/gist_id}", + "starred_url": "https://api.github.com/users/ateucher/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/ateucher/subscriptions", + "organizations_url": "https://api.github.com/users/ateucher/orgs", + "repos_url": "https://api.github.com/users/ateucher/repos", + "events_url": "https://api.github.com/users/ateucher/events{/privacy}", + "received_events_url": "https://api.github.com/users/ateucher/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2021-01-15T22:20:58Z", + "updated_at": "2021-02-18T17:15:08Z", + "closed_at": "2021-02-18T14:58:42Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/1302", + "html_url": "https://github.com/duckdb/duckdb/pull/1302", + "diff_url": "https://github.com/duckdb/duckdb/pull/1302.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/1302.patch", + "merged_at": "2021-02-18T14:58:42Z" + }, + "body": "Recently, @krlmlr made a [great addition to the RPostgres R package](https://github.com/r-dbi/RPostgres/pull/276), which added a `timezone_out` argument to the `dbConnect()` method, allowing a user to specify a timezone in which `TIMESTAMP` columns would be returned to R.\r\n\r\nThis PR adds that functionality to the duckdb R package, and additionally adds a `tz_out_convert` argument, controlling the conversion from `UTC` to the timezone specified in `timezone_out`. Options for `tz_out_convert` are `\"with\"` (default), which shows the timestamp in the desired timezone, or `\"force\"` which overrides the timestamp timezone with the desired timezone. 
With default arguments, the behaviour is the same as current, so this is non-breaking.\r\n\r\nA comprehensive test suite is included.\r\n\r\nExample of the new behaviour:\r\n\r\n``` r\r\nlibrary(duckdb)\r\n\r\n# Default behaviour (unchanged):\r\ncon <- dbConnect(duckdb())\r\nquery <- \"SELECT '1970-01-01 12:00:00'::TIMESTAMP AS ts\"\r\ndbGetQuery(con, query)[[1]]\r\n#> [1] \"1970-01-01 12:00:00 UTC\"\r\ndbDisconnect(con, shutdown = TRUE)\r\n\r\n# Specify a timezone_out, leaving tz_out_convert = 'with' (default)\r\ncon <- dbConnect(duckdb(), timezone_out = \"America/Vancouver\")\r\nquery <- \"SELECT '1970-01-01 12:00:00'::TIMESTAMP AS ts\"\r\ndbGetQuery(con, query)[[1]]\r\n#> [1] \"1970-01-01 04:00:00 PST\"\r\ndbDisconnect(con, shutdown = TRUE)\r\n\r\n# Specify a timezone_out, setting tz_out_convert = 'force'\r\ncon <- dbConnect(duckdb(), timezone_out = \"America/Vancouver\", tz_out_convert = \"force\")\r\n\r\nquery <- \"SELECT '1970-01-01 12:00:00'::TIMESTAMP AS ts\"\r\ndbGetQuery(con, query)[[1]]\r\n#> [1] \"1970-01-01 12:00:00 PST\"\r\ndbDisconnect(con, shutdown = TRUE)\r\n```\r\n\r\nCreated on 2021-01-15 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)\r\n\r\nNote: This adds the `lubridate` package as a dependency in \"Imports\", and `withr` to \"Suggests\" for testing.\r\n\r\nI know this was unsolicited, so I won't be offended if you're not keen on taking it. The use case is: I have a large dataset (csv) that was collected and stored in `\"PST\"` - loading it into `duckdb` assumes it is UTC and stores it as such, so when querying the database the results returned to R are in UTC. This added functionality allows a user to specify the timezone in which the data are returned, and whether or not the clock time should be adjusted or if timezone should be just overridden.\r\n\r\nThanks very much for duckdb!\r\n\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1302/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1302/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1244", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1244/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1244/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1244/events", + "html_url": "https://github.com/duckdb/duckdb/pull/1244", + "id": 773038797, + "node_id": "MDExOlB1bGxSZXF1ZXN0NTQ0MTk5Mjk5", + "number": 1244, + "title": "SHOW select queries", + "user": { + "login": "nantiamak", + "id": 14079764, + "node_id": "MDQ6VXNlcjE0MDc5NzY0", + "avatar_url": "https://avatars.githubusercontent.com/u/14079764?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/nantiamak", + "html_url": "https://github.com/nantiamak", + "followers_url": "https://api.github.com/users/nantiamak/followers", + "following_url": "https://api.github.com/users/nantiamak/following{/other_user}", + "gists_url": "https://api.github.com/users/nantiamak/gists{/gist_id}", + "starred_url": "https://api.github.com/users/nantiamak/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/nantiamak/subscriptions", + "organizations_url": "https://api.github.com/users/nantiamak/orgs", + 
"repos_url": "https://api.github.com/users/nantiamak/repos", + "events_url": "https://api.github.com/users/nantiamak/events{/privacy}", + "received_events_url": "https://api.github.com/users/nantiamak/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2020-12-22T15:38:26Z", + "updated_at": "2021-01-06T15:04:17Z", + "closed_at": "2021-01-06T15:04:17Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/1244", + "html_url": "https://github.com/duckdb/duckdb/pull/1244", + "diff_url": "https://github.com/duckdb/duckdb/pull/1244.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/1244.patch", + "merged_at": "2021-01-06T15:04:17Z" + }, + "body": "A first implementation of SHOW combined with a SELECT query (issue #674).\r\n\r\nReturned columns: column name, column type, not null, default value, primary key. ", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1244/reactions", + "total_count": 1, + "+1": 1, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1244/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1239", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1239/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1239/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1239/events", + "html_url": "https://github.com/duckdb/duckdb/pull/1239", + "id": 771558778, + "node_id": "MDExOlB1bGxSZXF1ZXN0NTQzMDQ4MDgy", + "number": 1239, + "title": "Fix error message", + "user": { + "login": "krlmlr", + "id": 1741643, + "node_id": "MDQ6VXNlcjE3NDE2NDM=", + "avatar_url": "https://avatars.githubusercontent.com/u/1741643?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/krlmlr", + "html_url": "https://github.com/krlmlr", + "followers_url": "https://api.github.com/users/krlmlr/followers", + "following_url": "https://api.github.com/users/krlmlr/following{/other_user}", + "gists_url": "https://api.github.com/users/krlmlr/gists{/gist_id}", + "starred_url": "https://api.github.com/users/krlmlr/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/krlmlr/subscriptions", + "organizations_url": "https://api.github.com/users/krlmlr/orgs", + "repos_url": "https://api.github.com/users/krlmlr/repos", + "events_url": "https://api.github.com/users/krlmlr/events{/privacy}", + "received_events_url": "https://api.github.com/users/krlmlr/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2020-12-20T10:29:25Z", + "updated_at": "2022-10-29T04:34:08Z", + "closed_at": "2021-01-06T15:38:30Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/1239", + "html_url": "https://github.com/duckdb/duckdb/pull/1239", + "diff_url": 
"https://github.com/duckdb/duckdb/pull/1239.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/1239.patch", + "merged_at": "2021-01-06T15:38:30Z" + }, + "body": "that occurs with certain parametrized queries.\n\n`uint8_t` is interpreted as `char` by the formatter routines. Are there other enums that are mapped to an `uint8_t` internally?", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1239/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1239/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1222", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1222/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1222/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1222/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1222", + "id": 765188844, + "node_id": "MDU6SXNzdWU3NjUxODg4NDQ=", + "number": 1222, + "title": "Crash on single-argument COALESCE", + "user": { + "login": "alanpaulkwan", + "id": 8321252, + "node_id": "MDQ6VXNlcjgzMjEyNTI=", + "avatar_url": "https://avatars.githubusercontent.com/u/8321252?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/alanpaulkwan", + "html_url": "https://github.com/alanpaulkwan", + "followers_url": "https://api.github.com/users/alanpaulkwan/followers", + "following_url": "https://api.github.com/users/alanpaulkwan/following{/other_user}", + "gists_url": "https://api.github.com/users/alanpaulkwan/gists{/gist_id}", + "starred_url": "https://api.github.com/users/alanpaulkwan/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/alanpaulkwan/subscriptions", + "organizations_url": "https://api.github.com/users/alanpaulkwan/orgs", + "repos_url": "https://api.github.com/users/alanpaulkwan/repos", + "events_url": "https://api.github.com/users/alanpaulkwan/events{/privacy}", + "received_events_url": "https://api.github.com/users/alanpaulkwan/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + 
"gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2020-12-13T08:37:17Z", + "updated_at": "2020-12-15T09:13:46Z", + "closed_at": "2020-12-13T11:54:11Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "I can PM someone the data. This is the valid query:\r\n```{r}\r\n\r\nres=dbGetQuery(con,\"select recipient_duns,count() as n_contracts,sum(coalesce(potential_total_value_of_award,0)) as award,\r\n max(action_date_fiscal_year) as year_max,\r\n min(action_date_fiscal_year) as year_min,\r\n sum(case when place_of_manufacture like '%OUTSIDE%U%S%' then 1 else 0 end) contracts_outside_usa,\r\n sum(case when place_of_manufacture like '%OUTSIDE%U%S%' then coalesce(potential_total_value_of_award,0) else 0 end) dollarval_contracts_outside_usa\r\n from parquet_scan('*201[5-9]*/*parquet') group by recipient_duns\")\r\n\r\n```\r\nThis is the errant query. It should just complain but rather R crashes giving me an option to core dump. The errant query is in the phrase coalesce(potential_total_value_of_award0) e.g. 
no comma\r\n\r\n```{r}\r\nres=dbGetQuery(con,\"select recipient_duns,count() as n_contracts,sum(coalesce(potential_total_value_of_award,0)) as award,\r\n max(action_date_fiscal_year) as year_max,\r\n min(action_date_fiscal_year) as year_min,\r\n sum(case when place_of_manufacture like '%OUTSIDE%U%S%' then 1 else 0 end) contracts_outside_usa,\r\n sum(case when place_of_manufacture like '%OUTSIDE%U%S%' then coalesce(potential_total_value_of_award0) else 0 end) dollarval_contracts_outside_usa\r\n from parquet_scan('*201[5-9]*/*parquet') group by recipient_duns\")", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1222/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1222/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1112", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1112/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1112/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1112/events", + "html_url": "https://github.com/duckdb/duckdb/pull/1112", + "id": 741580245, + "node_id": "MDExOlB1bGxSZXF1ZXN0NTE5ODY4MTU4", + "number": 1112, + "title": "Add DuckDB node.js API", + "user": { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2020-11-12T12:59:04Z", + "updated_at": "2020-11-14T06:55:09Z", + "closed_at": "2020-11-12T15:06:31Z", + "author_association": "MEMBER", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/1112", + "html_url": "https://github.com/duckdb/duckdb/pull/1112", + "diff_url": "https://github.com/duckdb/duckdb/pull/1112.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/1112.patch", + "merged_at": "2020-11-12T15:06:31Z" + }, + "body": "This adds a node.js API for DuckDB in `/tools/nodejs` due to popular demand. The API for this client is somewhat compliant to the SQLite node.js client for easier transition (and transition you must eventually). The npm package is called 'duckdb' thanks to @twilson63 who graciously allowed us to take over that package name. 
\r\n\r\nInstall the node.js API like so (for now, binaries & npm will follow)\r\n````bash\r\nnpm install duckdb\r\n````\r\n\r\n(this could take a while)\r\n\r\nThen load the package and create a database object:\r\n````JavaScript\r\nvar duckdb = require('duckdb');\r\n\r\nvar db = new duckdb.Database(':memory:'); // or a file name for a persistent DB\r\n````\r\nThen you can run a query:\r\n````JavaScript\r\ndb.all('SELECT 42 AS fortytwo', function(err, res) {\r\n if (err) {\r\n throw err;\r\n }\r\n console.log(res[0].fortytwo)\r\n});\r\n````\r\nOther available methods are `each`, where the callback is invoked for each row, `run` to execute a single statement without results and `exec`, which can execute several SQL commands at once but also does not return results. All those commands can work with prepared statements, taking the values for the parameters as additional arguments. For example like so:\r\n````JavaScript\r\ndb.all('SELECT ?::INTEGER AS fortytwo, ?::STRING as hello', 42, 'Hello, World', function(err, res) {\r\n if (err) {\r\n throw err;\r\n }\r\n console.log(res[0].fortytwo)\r\n console.log(res[0].hello)\r\n});\r\n````\r\n\r\nHowever, these are all shorthands for something much more elegant. A database can have multiple `Connection`s, those are created using `db.connect()`.\r\n````JavaScript\r\nvar con = db.connect();\r\n````\r\nYou can create multiple connections, each with their own transaction context.\r\n\r\n\r\n`Connection` objects also contain shorthands to directly call `run()`, `all()` and `each()` with parameters and callbacks, respectively, for example:\r\n````JavaScript\r\ncon.all('SELECT 42 AS fortytwo', function(err, res) {\r\n if (err) {\r\n throw err;\r\n }\r\n console.log(res[0].fortytwo)\r\n});\r\n````\r\n\r\nFrom connections, you can create prepared statements (and only that) using `con.prepare()`:\r\n\r\n````JavaScript\r\nvar stmt = con.prepare('select ?::INTEGER as fortytwo');\r\n```` \r\n\r\nTo execute this statement, you can call for example `all()` on the `stmt` object:\r\n````JavaScript\r\nstmt.all(42, function(err, res) {\r\n if (err) {\r\n throw err;\r\n }\r\n console.log(res[0].fortytwo)\r\n});\r\n````\r\n\r\nYou can also execute the prepared statement multiple times. 
This is for example useful to fill a table with data:\r\n````JavaScript\r\ncon.run('CREATE TABLE a (i INTEGER)');\r\nvar stmt = con.prepare('INSERT INTO a VALUES (?)');\r\nfor (var i = 0; i < 10; i++) {\r\n stmt.run(i);\r\n}\r\nstmt.finalize();\r\ncon.all('SELECT * FROM a', function(err, res) {\r\n if (err) {\r\n throw err;\r\n }\r\n console.log(res)\r\n});\r\n````\r\n\r\n`prepare()` can also take a callback which gets the prepared statement as an argument:\r\n````JavaScript\r\nvar stmt = con.prepare('select ?::INTEGER as fortytwo', function(err, stmt) {\r\n stmt.all(42, function(err, res) {\r\n if (err) {\r\n throw err;\r\n }\r\n console.log(res[0].fortytwo)\r\n });\r\n});\r\n````\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1112/reactions", + "total_count": 1, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 1, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1112/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1090", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1090/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1090/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1090/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1090", + "id": 738262260, + "node_id": "MDU6SXNzdWU3MzgyNjIyNjA=", + "number": 1090, + "title": "JDBC: cannot select BLOB columns", + "user": { + "login": "agarciadom", + "id": 46504, + "node_id": "MDQ6VXNlcjQ2NTA0", + "avatar_url": "https://avatars.githubusercontent.com/u/46504?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/agarciadom", + "html_url": "https://github.com/agarciadom", + "followers_url": "https://api.github.com/users/agarciadom/followers", + "following_url": "https://api.github.com/users/agarciadom/following{/other_user}", + "gists_url": "https://api.github.com/users/agarciadom/gists{/gist_id}", + "starred_url": "https://api.github.com/users/agarciadom/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/agarciadom/subscriptions", + "organizations_url": "https://api.github.com/users/agarciadom/orgs", + "repos_url": "https://api.github.com/users/agarciadom/repos", + "events_url": "https://api.github.com/users/agarciadom/events{/privacy}", + "received_events_url": "https://api.github.com/users/agarciadom/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + { + "id": 1977935149, + "node_id": "MDU6TGFiZWwxOTc3OTM1MTQ5", + "url": "https://api.github.com/repos/duckdb/duckdb/labels/JDBC", + "name": "JDBC", + "color": "48b210", + "default": false, + "description": "" + } + ], + "state": "closed", + "locked": false, + "assignee": { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": 
"https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + }, + "assignees": [ + { + "login": "hannes", + "id": 227792, + "node_id": "MDQ6VXNlcjIyNzc5Mg==", + "avatar_url": "https://avatars.githubusercontent.com/u/227792?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/hannes", + "html_url": "https://github.com/hannes", + "followers_url": "https://api.github.com/users/hannes/followers", + "following_url": "https://api.github.com/users/hannes/following{/other_user}", + "gists_url": "https://api.github.com/users/hannes/gists{/gist_id}", + "starred_url": "https://api.github.com/users/hannes/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/hannes/subscriptions", + "organizations_url": "https://api.github.com/users/hannes/orgs", + "repos_url": "https://api.github.com/users/hannes/repos", + "events_url": "https://api.github.com/users/hannes/events{/privacy}", + "received_events_url": "https://api.github.com/users/hannes/received_events", + "type": "User", + "site_admin": false + } + ], + "milestone": null, + "comments": 7, + "created_at": "2020-11-07T15:25:04Z", + "updated_at": "2021-10-09T18:34:13Z", + "closed_at": "2021-10-09T18:34:13Z", + "author_association": "NONE", + "active_lock_reason": null, + "body": "I have tried to write a SELECT query which mentions a BLOB column, among others. However, I receive the following error:\r\n\r\n```\r\n2020-11-07 15:18:25 [main] ERROR DuckNode - Could not fetch property d on node 1\r\njava.sql.SQLException: Unsupported result column type BLOB\r\n\tat org.duckdb.DuckDBNative.duckdb_jdbc_fetch(Native Method) ~[na:na]\r\n\tat org.duckdb.DuckDBResultSet.(DuckDBResultSet.java:45) ~[na:na]\r\n\tat org.duckdb.DuckDBPreparedStatement.execute(DuckDBPreparedStatement.java:90) ~[na:na]\r\n\tat org.duckdb.DuckDBPreparedStatement.executeQuery(DuckDBPreparedStatement.java:100) ~[na:na]\r\n\tat org.eclipse.hawk.duckdb.DuckNode.getProperty(DuckNode.java:252) ~[na:na]\r\n```\r\n\r\nShould I be casting that column to a VARCHAR instead?", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1090/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1090/timeline", + "performed_via_github_app": null, + "state_reason": "completed" + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1076", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1076/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1076/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1076/events", + "html_url": "https://github.com/duckdb/duckdb/pull/1076", + "id": 733721171, + "node_id": "MDExOlB1bGxSZXF1ZXN0NTEzNDMzOTEw", + "number": 1076, + "title": "Add Train Benchmark queries", + "user": { + "login": "szarnyasg", + "id": 1402801, + "node_id": "MDQ6VXNlcjE0MDI4MDE=", + "avatar_url": "https://avatars.githubusercontent.com/u/1402801?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/szarnyasg", + 
"html_url": "https://github.com/szarnyasg", + "followers_url": "https://api.github.com/users/szarnyasg/followers", + "following_url": "https://api.github.com/users/szarnyasg/following{/other_user}", + "gists_url": "https://api.github.com/users/szarnyasg/gists{/gist_id}", + "starred_url": "https://api.github.com/users/szarnyasg/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/szarnyasg/subscriptions", + "organizations_url": "https://api.github.com/users/szarnyasg/orgs", + "repos_url": "https://api.github.com/users/szarnyasg/repos", + "events_url": "https://api.github.com/users/szarnyasg/events{/privacy}", + "received_events_url": "https://api.github.com/users/szarnyasg/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2020-10-31T13:33:04Z", + "updated_at": "2020-11-08T12:17:08Z", + "closed_at": "2020-11-08T12:16:58Z", + "author_association": "CONTRIBUTOR", + "active_lock_reason": null, + "draft": false, + "pull_request": { + "url": "https://api.github.com/repos/duckdb/duckdb/pulls/1076", + "html_url": "https://github.com/duckdb/duckdb/pull/1076", + "diff_url": "https://github.com/duckdb/duckdb/pull/1076.diff", + "patch_url": "https://github.com/duckdb/duckdb/pull/1076.patch", + "merged_at": "2020-11-08T12:16:58Z" + }, + "body": "I took at porting the Train Benchmark MySQL/SQLite implementation to DuckDB.\r\n\r\nI co-designed & co-developed Train Benchmark during my PhD. There is an [OA journal paper](https://link.springer.com/article/10.1007%2Fs10270-016-0571-8) describing its specification.\r\n\r\nIn essence, it is a \"model validation\" benchmark, i.e. it performs OLAP-style queries on a model graph to check certain well-formedness properties in the graph. The set of features used by the queries is fairly small. The queries express subgraph matching with lots of joins and a few antijoins (when the query looks for the absence of an edge). They also use projection and filtering but there are no outer joins, aggregations, path queries, etc.\r\n\r\nThe difficulty of this benchmark is two-fold:\r\n\r\n1. The queries are global, so their complexity increases linearly with the data set size (or worse).\r\n\r\n Antijoin queries were particularly problematic for MySQL at the time, it [timed out](https://link.springer.com/article/10.1007/s10270-016-0571-8/figures/10) even for mid-sized graphs (1.1M nodes+edges) on the queries that have an antijoin (RoutSensor, SwitchMonitored, SemaphoreNeighbor).\r\n\r\n SQLite performed better, here's what we wrote:\r\n\r\n > [...] However, SQLite is surprisingly fast in several configurations. This may indicate that other technologies still have a lot of potential for performance enhancements.\r\n\r\n2. The full benchmark measures a \"continuous model validation\" scenario, i.e. the model graph is continuously updated (with both insert and deletes) and implementations need to return the current results for the query. Naturally, this favours incremental view maintenance approaches.\r\n\r\n The changes are issued by a Java/Groovy-based benchmark framework, so testing DuckDB with changes would require adding a DuckDB driver to the benchmark framework. 
This is doable but it needs a bit of work.\r\n\r\nSo far, my experience has been the following:\r\n\r\n* Porting the schema was fairly easy, I used the same tricks like the ones in LDBC SNB loader.\r\n* The queries worked out of the box.\r\n* The SF1 test data set is fairly small (64 kB), so I included it in the PR.\r\n", + "reactions": { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1076/reactions", + "total_count": 0, + "+1": 0, + "-1": 0, + "laugh": 0, + "hooray": 0, + "confused": 0, + "heart": 0, + "rocket": 0, + "eyes": 0 + }, + "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1076/timeline", + "performed_via_github_app": null, + "state_reason": null + }, + { + "url": "https://api.github.com/repos/duckdb/duckdb/issues/1056", + "repository_url": "https://api.github.com/repos/duckdb/duckdb", + "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/1056/labels{/name}", + "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/1056/comments", + "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/1056/events", + "html_url": "https://github.com/duckdb/duckdb/issues/1056", + "id": 729496809, + "node_id": "MDU6SXNzdWU3Mjk0OTY4MDk=", + "number": 1056, + "title": "Remove NullValue and friends", + "user": { + "login": "Mytherin", + "id": 3978469, + "node_id": "MDQ6VXNlcjM5Nzg0Njk=", + "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4", + "gravatar_id": "", + "url": "https://api.github.com/users/Mytherin", + "html_url": "https://github.com/Mytherin", + "followers_url": "https://api.github.com/users/Mytherin/followers", + "following_url": "https://api.github.com/users/Mytherin/following{/other_user}", + "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}", + "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}", + "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions", + "organizations_url": "https://api.github.com/users/Mytherin/orgs", + "repos_url": "https://api.github.com/users/Mytherin/repos", + "events_url": "https://api.github.com/users/Mytherin/events{/privacy}", + "received_events_url": "https://api.github.com/users/Mytherin/received_events", + "type": "User", + "site_admin": false + }, + "labels": [ + + ], + "state": "closed", + "locked": false, + "assignee": null, + "assignees": [ + + ], + "milestone": null, + "comments": 7, + "created_at": "2020-10-26T11:51:01Z", + "updated_at": "2021-09-24T13:55:21Z", + "closed_at": "2021-09-24T13:55:21Z", + "author_association": "COLLABORATOR", + "active_lock_reason": null, + "body": "We still use a special value in the domain to represent null values for integers in some places (specifically hash tables); this is a remnant from an earlier implementation of null values. These need to go because they make interfacing with other file formats much more difficult (now we need to check all int32 before importing a parquet file, for example). 
Instead the hash tables should have a bitmask at the start of every row that indicates null-ness of every value.",
+    "reactions": {
+      "url": "https://api.github.com/repos/duckdb/duckdb/issues/1056/reactions",
+      "total_count": 0,
+      "+1": 0,
+      "-1": 0,
+      "laugh": 0,
+      "hooray": 0,
+      "confused": 0,
+      "heart": 0,
+      "rocket": 0,
+      "eyes": 0
+    },
+    "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/1056/timeline",
+    "performed_via_github_app": null,
+    "state_reason": "completed"
+  },
+  {
+    "url": "https://api.github.com/repos/duckdb/duckdb/issues/998",
+    "repository_url": "https://api.github.com/repos/duckdb/duckdb",
+    "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/998/labels{/name}",
+    "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/998/comments",
+    "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/998/events",
+    "html_url": "https://github.com/duckdb/duckdb/issues/998",
+    "id": 719225407,
+    "node_id": "MDU6SXNzdWU3MTkyMjU0MDc=",
+    "number": 998,
+    "title": "CSV Reader Casting exceptions",
+    "user": {
+      "login": "pdet",
+      "id": 7377477,
+      "node_id": "MDQ6VXNlcjczNzc0Nzc=",
+      "avatar_url": "https://avatars.githubusercontent.com/u/7377477?v=4",
+      "gravatar_id": "",
+      "url": "https://api.github.com/users/pdet",
+      "html_url": "https://github.com/pdet",
+      "followers_url": "https://api.github.com/users/pdet/followers",
+      "following_url": "https://api.github.com/users/pdet/following{/other_user}",
+      "gists_url": "https://api.github.com/users/pdet/gists{/gist_id}",
+      "starred_url": "https://api.github.com/users/pdet/starred{/owner}{/repo}",
+      "subscriptions_url": "https://api.github.com/users/pdet/subscriptions",
+      "organizations_url": "https://api.github.com/users/pdet/orgs",
+      "repos_url": "https://api.github.com/users/pdet/repos",
+      "events_url": "https://api.github.com/users/pdet/events{/privacy}",
+      "received_events_url": "https://api.github.com/users/pdet/received_events",
+      "type": "User",
+      "site_admin": false
+    },
+    "labels": [
+
+    ],
+    "state": "closed",
+    "locked": false,
+    "assignee": {
+      "login": "Mytherin",
+      "id": 3978469,
+      "node_id": "MDQ6VXNlcjM5Nzg0Njk=",
+      "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4",
+      "gravatar_id": "",
+      "url": "https://api.github.com/users/Mytherin",
+      "html_url": "https://github.com/Mytherin",
+      "followers_url": "https://api.github.com/users/Mytherin/followers",
+      "following_url": "https://api.github.com/users/Mytherin/following{/other_user}",
+      "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}",
+      "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}",
+      "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions",
+      "organizations_url": "https://api.github.com/users/Mytherin/orgs",
+      "repos_url": "https://api.github.com/users/Mytherin/repos",
+      "events_url": "https://api.github.com/users/Mytherin/events{/privacy}",
+      "received_events_url": "https://api.github.com/users/Mytherin/received_events",
+      "type": "User",
+      "site_admin": false
+    },
+    "assignees": [
+      {
+        "login": "Mytherin",
+        "id": 3978469,
+        "node_id": "MDQ6VXNlcjM5Nzg0Njk=",
+        "avatar_url": "https://avatars.githubusercontent.com/u/3978469?v=4",
+        "gravatar_id": "",
+        "url": "https://api.github.com/users/Mytherin",
+        "html_url": "https://github.com/Mytherin",
+        "followers_url": "https://api.github.com/users/Mytherin/followers",
+        "following_url": "https://api.github.com/users/Mytherin/following{/other_user}",
+        "gists_url": "https://api.github.com/users/Mytherin/gists{/gist_id}",
+        "starred_url": "https://api.github.com/users/Mytherin/starred{/owner}{/repo}",
+        "subscriptions_url": "https://api.github.com/users/Mytherin/subscriptions",
+        "organizations_url": "https://api.github.com/users/Mytherin/orgs",
+        "repos_url": "https://api.github.com/users/Mytherin/repos",
+        "events_url": "https://api.github.com/users/Mytherin/events{/privacy}",
+        "received_events_url": "https://api.github.com/users/Mytherin/received_events",
+        "type": "User",
+        "site_admin": false
+      }
+    ],
+    "milestone": null,
+    "comments": 7,
+    "created_at": "2020-10-12T09:30:57Z",
+    "updated_at": "2021-09-24T13:55:33Z",
+    "closed_at": "2021-09-24T13:55:33Z",
+    "author_association": "MEMBER",
+    "active_lock_reason": null,
+    "body": "Should the auto-detect from the csv reader throw casting exceptions?",
+    "reactions": {
+      "url": "https://api.github.com/repos/duckdb/duckdb/issues/998/reactions",
+      "total_count": 0,
+      "+1": 0,
+      "-1": 0,
+      "laugh": 0,
+      "hooray": 0,
+      "confused": 0,
+      "heart": 0,
+      "rocket": 0,
+      "eyes": 0
+    },
+    "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/998/timeline",
+    "performed_via_github_app": null,
+    "state_reason": "completed"
+  },
+  {
+    "url": "https://api.github.com/repos/duckdb/duckdb/issues/989",
+    "repository_url": "https://api.github.com/repos/duckdb/duckdb",
+    "labels_url": "https://api.github.com/repos/duckdb/duckdb/issues/989/labels{/name}",
+    "comments_url": "https://api.github.com/repos/duckdb/duckdb/issues/989/comments",
+    "events_url": "https://api.github.com/repos/duckdb/duckdb/issues/989/events",
+    "html_url": "https://github.com/duckdb/duckdb/issues/989",
+    "id": 715579041,
+    "node_id": "MDU6SXNzdWU3MTU1NzkwNDE=",
+    "number": 989,
+    "title": "Documentation and/or function for reserved keywords in duckdb",
+    "user": {
+      "login": "mskyttner",
+      "id": 1715840,
+      "node_id": "MDQ6VXNlcjE3MTU4NDA=",
+      "avatar_url": "https://avatars.githubusercontent.com/u/1715840?v=4",
+      "gravatar_id": "",
+      "url": "https://api.github.com/users/mskyttner",
+      "html_url": "https://github.com/mskyttner",
+      "followers_url": "https://api.github.com/users/mskyttner/followers",
+      "following_url": "https://api.github.com/users/mskyttner/following{/other_user}",
+      "gists_url": "https://api.github.com/users/mskyttner/gists{/gist_id}",
+      "starred_url": "https://api.github.com/users/mskyttner/starred{/owner}{/repo}",
+      "subscriptions_url": "https://api.github.com/users/mskyttner/subscriptions",
+      "organizations_url": "https://api.github.com/users/mskyttner/orgs",
+      "repos_url": "https://api.github.com/users/mskyttner/repos",
+      "events_url": "https://api.github.com/users/mskyttner/events{/privacy}",
+      "received_events_url": "https://api.github.com/users/mskyttner/received_events",
+      "type": "User",
+      "site_admin": false
+    },
+    "labels": [
+
+    ],
+    "state": "closed",
+    "locked": false,
+    "assignee": null,
+    "assignees": [
+
+    ],
+    "milestone": null,
+    "comments": 7,
+    "created_at": "2020-10-06T11:20:53Z",
+    "updated_at": "2022-04-16T13:00:56Z",
+    "closed_at": "2022-04-16T13:00:56Z",
+    "author_association": "NONE",
+    "active_lock_reason": null,
+    "body": "What are reserved duckdb keywords which may not be used as the names of tables, columns/fields etc without first requiring quoting? I didn't find any docs for this at the website or any mention amongst the issues. \r\n\r\nComparing https://www.sqlite.org/lang_keywords.html to duckdb, using [a script here](https://gist.github.com/mskyttner/2f3d1a2a6964632f106e154c21cd8de3), it seems that several of those ~ 145 keywords that are \"reserved\" in SQLite can be used without \"quoting\" in duckdb as table names or field/column names. \r\n\r\nFor example these reserved keywords in SQLite appear to not require \"quoting\" in duckdb when used as table or column/field names:\r\n\r\n`abort, action, add, after, alter, always, attach, autoincrement, before, begin, between, by, cascade, commit, conflict, current, database, deferred, delete, detach, drop, each, escape, exclude, exclusive, exists, explain, fail, filter, first, following, generated, groups, if, ignore, immediate, index, indexed, insert, instead, key, last, match, no, nothing, nulls, of, others, over, partition, plan, pragma, preceding, query, raise, range, recursive, regexp, reindex, release, rename, replace, restrict, rollback, row, rows, savepoint, set, temp, temporary, ties, transaction, trigger, unbounded, update, vacuum, values, view, virtual, without\r\n`\r\n\r\nThese seem to be reserved though, and will require quoting if used as table or field names:\r\n\r\n`all, analyze, and, as, asc, case, cast, check, collate, column, constraint, create, cross, current_date, current_time, current_timestamp, default, deferrable, desc, distinct, do, else, end, except, for, foreign, from, full, glob, group, having, in, initially, inner, intersect, into, is, isnull, join, left, like, limit, natural, not, notnull, null, offset, on, or, order, outer, primary, references, right, select, table, then, to, union, unique, using, when, where, window, with`\r\n\r\nThere are probably more?\r\n\r\nIt would be nice to have a `duckdb_keywords()` function which would return reserved keywords, so one would know when quoting is required.",
+    "reactions": {
+      "url": "https://api.github.com/repos/duckdb/duckdb/issues/989/reactions",
+      "total_count": 0,
+      "+1": 0,
+      "-1": 0,
+      "laugh": 0,
+      "hooray": 0,
+      "confused": 0,
+      "heart": 0,
+      "rocket": 0,
+      "eyes": 0
+    },
+    "timeline_url": "https://api.github.com/repos/duckdb/duckdb/issues/989/timeline",
+    "performed_via_github_app": null,
+    "state_reason": "completed"
+  }
+]
\ No newline at end of file
diff --git a/tests/normalize/test_normalize.py b/tests/normalize/test_normalize.py
index e52aa2e9cf..b893d4fedc 100644
--- a/tests/normalize/test_normalize.py
+++ b/tests/normalize/test_normalize.py
@@ -283,6 +283,34 @@ def test_schema_changes(raw_normalize: Normalize, caps: DestinationCapabilitiesContext) -> None:
     assert {"_dlt_id", "_dlt_list_idx", "_dlt_parent_id", "str", "int", "bool", "int__v_text"} == set(doc__comp_table["columns"].keys())
 
 
+@pytest.mark.parametrize("caps", ALL_CAPS)
+def test_normalize_twice_with_flatten(raw_normalize: Normalize, caps: DestinationCapabilitiesContext) -> None:
+    mock_destination_caps(raw_normalize, caps)
+    load_id = extract_and_normalize_cases(raw_normalize, ["github.issues.load_page_5_duck"])
+    table_files = expect_load_package(raw_normalize.load_storage, load_id, ["issues", "issues__labels", "issues__assignees"])
+    assert len(table_files["issues"]) == 1
+    _, lines = get_line_from_file(raw_normalize.load_storage, table_files["issues"], 0)
+    # insert writer adds 2 lines
+    assert lines in (100, 102)
+
+    # check if schema contains a few crucial tables
+    def assert_schema(_schema: Schema):
+        assert "reactions___1" in _schema._schema_tables["issues"]["columns"]
+        assert "reactions__1" not in _schema._schema_tables["issues"]["columns"]
+
+    schema = raw_normalize.load_or_create_schema(raw_normalize.schema_storage, "github")
+    assert_schema(schema)
+
+    load_id = extract_and_normalize_cases(raw_normalize, ["github.issues.load_page_5_duck"])
+    table_files = expect_load_package(raw_normalize.load_storage, load_id, ["issues", "issues__labels", "issues__assignees"])
+    assert len(table_files["issues"]) == 1
+    _, lines = get_line_from_file(raw_normalize.load_storage, table_files["issues"], 0)
+    # insert writer adds 2 lines
+    assert lines in (100, 102)
+    schema = raw_normalize.load_or_create_schema(raw_normalize.schema_storage, "github")
+    assert_schema(schema)
+
+
 def test_group_worker_files() -> None:
     files = ["f%03d" % idx for idx in range(0, 100)]
 
@@ -335,10 +363,10 @@ def normalize_pending(normalize: Normalize, schema_name: str = "event") -> str:
     load_id = uniq_id()
     normalize.load_storage.create_temp_load_package(load_id)
     # pool not required for map_single
-    dest_cases = normalize.normalize_storage.storage.list_folder_files(NormalizeStorage.EXTRACTED_FOLDER)  # [f"{NormalizeStorage.EXTRACTED_FOLDER}/{c}.extracted.json" for c in cases]
+    files = normalize.normalize_storage.list_files_to_normalize_sorted()
     # create schema if it does not exist
-    Normalize.load_or_create_schema(normalize.schema_storage, schema_name)
-    normalize.spool_files(schema_name, load_id, normalize.map_single, dest_cases)
+    for schema_name, files_in_schema in normalize.normalize_storage.group_by_schema(files):
+        normalize.spool_files(schema_name, load_id, normalize.map_single, list(files_in_schema))
     return load_id
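
Why test_normalize_twice_with_flatten pins "reactions___1" with three underscores: each issue's "reactions" object in the fixture carries "+1" and "-1" keys; snake-case normalization maps "+1" to "_1", and the relational normalizer joins nested paths with "__", so the flattened column becomes "reactions" + "__" + "_1". The sketch below is a minimal stand-in, not dlt's actual normalizer; normalize_identifier, flatten, and the "__" separator are assumptions chosen to mirror the behavior the assertions describe.

import re

PATH_SEPARATOR = "__"  # assumed separator, consistent with table names like "issues__labels" above

def normalize_identifier(name: str) -> str:
    # crude stand-in for snake-case normalization: lower-case, non-alphanumerics become "_"
    return re.sub(r"[^a-z0-9_]", "_", name.lower())

def flatten(row: dict, parent: str = "") -> dict:
    # flatten nested dicts into column names by normalizing each key
    # and joining path segments with PATH_SEPARATOR
    flat: dict = {}
    for k, v in row.items():
        key = normalize_identifier(k)
        path = parent + PATH_SEPARATOR + key if parent else key
        if isinstance(v, dict):
            flat.update(flatten(v, path))
        else:
            flat[path] = v
    return flat

print(flatten({"reactions": {"+1": 0, "-1": 0}}))
# {'reactions___1': 0}

A "reactions__1" column would only appear if the leading underscore of the normalized "_1" were lost during path joining, which is exactly the regression the second assert guards against; running extract-and-normalize twice and re-checking the schema verifies the column names stay stable across repeated loads.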
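The reworked normalize_pending helper no longer spools every extracted file under a single hard-coded schema; it asks NormalizeStorage for the sorted file list, groups it by schema name, and spools each group separately. Below is a hedged sketch of that grouping, assuming extracted file names begin with a "<schema_name>." prefix; the real parsing lives inside NormalizeStorage, and group_by_schema here is a stand-in with an assumed signature.

from itertools import groupby
from typing import Iterator, List, Sequence, Tuple

def group_by_schema(files: Sequence[str]) -> Iterator[Tuple[str, Iterator[str]]]:
    # groupby only merges adjacent keys, which is why the caller fetches files
    # via list_files_to_normalize_sorted() rather than an unsorted listing
    return groupby(files, key=lambda f: f.split(".", 1)[0])

extracted: List[str] = [
    "event.users.1.extracted.json",
    "github.issues.load_page_5_duck.extracted.json",
]
for schema_name, files_in_schema in group_by_schema(sorted(extracted)):
    print(schema_name, list(files_in_schema))
# event ['event.users.1.extracted.json']
# github ['github.issues.load_page_5_duck.extracted.json']

Materializing each group with list(files_in_schema) before handing it to spool_files matters because groupby yields lazy sub-iterators that are invalidated once the outer loop advances, which is presumably why the new loop in the diff wraps the group in list(...) as well.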