From 4aaf5e3b702b888a20b2997398ce4f491e12ba93 Mon Sep 17 00:00:00 2001 From: Matt Seddon Date: Wed, 2 Oct 2024 10:31:36 +1000 Subject: [PATCH 1/2] remove ordering by sys__id --- src/datachain/catalog/catalog.py | 2 -- src/datachain/data_storage/warehouse.py | 4 ---- src/datachain/query/batch.py | 1 - src/datachain/query/dataset.py | 16 ---------------- 4 files changed, 23 deletions(-) diff --git a/src/datachain/catalog/catalog.py b/src/datachain/catalog/catalog.py index 0902daf24..dbae1aa9f 100644 --- a/src/datachain/catalog/catalog.py +++ b/src/datachain/catalog/catalog.py @@ -1325,8 +1325,6 @@ def ls_dataset_rows( if offset: q = q.offset(offset) - q = q.order_by("sys__id") - return q.to_db_records() def signed_url(self, source: str, path: str, client_config=None) -> str: diff --git a/src/datachain/data_storage/warehouse.py b/src/datachain/data_storage/warehouse.py index 6b5b753d4..74b25045d 100644 --- a/src/datachain/data_storage/warehouse.py +++ b/src/datachain/data_storage/warehouse.py @@ -215,10 +215,6 @@ def dataset_select_paginated( limit = query._limit paginated_query = query.limit(page_size) - if not paginated_query._order_by_clauses: - # default order by is order by `sys__id` - paginated_query = paginated_query.order_by(query.selected_columns.sys__id) - results = None offset = 0 num_yielded = 0 diff --git a/src/datachain/query/batch.py b/src/datachain/query/batch.py index 15fb19e31..8f24ec895 100644 --- a/src/datachain/query/batch.py +++ b/src/datachain/query/batch.py @@ -97,7 +97,6 @@ def __call__( ordered_query = query.order_by(None).order_by( PARTITION_COLUMN_ID, - "sys__id", *query._order_by_clauses, ) diff --git a/src/datachain/query/dataset.py b/src/datachain/query/dataset.py index 67559317c..a0049c5ea 100644 --- a/src/datachain/query/dataset.py +++ b/src/datachain/query/dataset.py @@ -591,10 +591,6 @@ def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]: return query, [] table = self.catalog.warehouse.create_pre_udf_table(query) q: Select = sqlalchemy.select(*table.c) - if query._order_by_clauses: - # we are adding ordering only if it's explicitly added by user in - # query part before adding signals - q = q.order_by(table.c.sys__id) return q, [table] def create_result_query( @@ -630,11 +626,6 @@ def q(*columns): else: res = sqlalchemy.select(*cols1).select_from(subq) - if query._order_by_clauses: - # if ordering is used in query part before adding signals, we - # will have it as order by id from select from pre-created udf table - res = res.order_by(subq.c.sys__id) - if self.partition_by is not None: subquery = res.subquery() res = sqlalchemy.select(*subquery.c).select_from(subquery) @@ -666,13 +657,6 @@ def create_udf_table(self, query: Select) -> "Table": def create_result_query( self, udf_table, query: Select ) -> tuple[QueryGeneratorFunc, list["sqlalchemy.Column"]]: - if not query._order_by_clauses: - # if we are not selecting all rows in UDF, we need to ensure that - # we get the same rows as we got as inputs of UDF since selecting - # without ordering can be non deterministic in some databases - c = query.selected_columns - query = query.order_by(c.sys__id) - udf_table_query = udf_table.select().subquery() udf_table_cols: list[sqlalchemy.Label[Any]] = [ label(c.name, c) for c in udf_table_query.columns From 6947d0a0c2b10e53056fc5ed0b238a3b59a35b05 Mon Sep 17 00:00:00 2001 From: Matt Seddon Date: Mon, 14 Oct 2024 09:41:07 +1100 Subject: [PATCH 2/2] add test for dataset preview order --- tests/func/test_datasets.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tests/func/test_datasets.py b/tests/func/test_datasets.py index 90766f605..fa091d2c9 100644 --- a/tests/func/test_datasets.py +++ b/tests/func/test_datasets.py @@ -775,6 +775,39 @@ def test_types(): assert r["binary_col"] == [0, 25] +def test_dataset_preview_order(test_session): + ids = list(range(10000)) + order = ids[::-1] + catalog = test_session.catalog + dataset_name = "test" + + DataChain.from_values(id=ids, order=order, session=test_session).order_by( + "order" + ).save(dataset_name) + + preview_values = [] + + for r in catalog.get_dataset(dataset_name).get_version(1).preview: + id = ids.pop() + o = order.pop() + entry = (id, o) + preview_values.append((id, o)) + assert (r["id"], r["order"]) == entry + + DataChain.from_dataset(dataset_name, session=test_session).save(dataset_name) + + for r in catalog.get_dataset(dataset_name).get_version(2).preview: + assert (r["id"], r["order"]) == preview_values.pop(0) + + DataChain.from_dataset(dataset_name, 2, session=test_session).order_by("id").save( + dataset_name + ) + + for r in catalog.get_dataset(dataset_name).get_version(3).preview: + assert r["id"] == ids.pop(0) + assert r["order"] == order.pop(0) + + def test_dataset_preview_last_modified(cloud_test_catalog, dogs_dataset): catalog = cloud_test_catalog.catalog