From e4a5b71c2b24d9ff16f720f204604da657152775 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 5 Jan 2024 16:58:13 +0400 Subject: [PATCH 01/62] Rename DeeplakeQuery to Indra. --- ..._query_dataset.py => indra_dataset_view.py} | 18 +++++++++--------- ...ke_query_tensor.py => indra_tensor_view.py} | 4 ++-- .../core/tests/test_deeplake_indra_dataset.py | 14 +++++++------- deeplake/core/tests/test_query.py | 2 +- .../vector_search/indra/search_algorithm.py | 4 ++-- deeplake/enterprise/libdeeplake_query.py | 10 +++++----- 6 files changed, 26 insertions(+), 26 deletions(-) rename deeplake/core/dataset/{deeplake_query_dataset.py => indra_dataset_view.py} (96%) rename deeplake/core/dataset/{deeplake_query_tensor.py => indra_tensor_view.py} (98%) diff --git a/deeplake/core/dataset/deeplake_query_dataset.py b/deeplake/core/dataset/indra_dataset_view.py similarity index 96% rename from deeplake/core/dataset/deeplake_query_dataset.py rename to deeplake/core/dataset/indra_dataset_view.py index 3875e97c6b..2ae60d885a 100644 --- a/deeplake/core/dataset/deeplake_query_dataset.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -30,10 +30,10 @@ import warnings -from deeplake.core.dataset.deeplake_query_tensor import DeepLakeQueryTensor +from deeplake.core.dataset.indra_tensor_view import IndraTensorView -class DeepLakeQueryDataset(Dataset): +class IndraDatasetView(Dataset): def __init__( self, deeplake_ds, @@ -97,7 +97,7 @@ def _get_tensor_from_root(self, fullpath): except: pass indra_tensor = tensor - return DeepLakeQueryTensor( + return IndraTensorView( deeplake_tensor, indra_tensor, index=self.index ) @@ -148,7 +148,7 @@ def __getitem__( if self.deeplake_ds is not None and self.deeplake_ds._has_group_in_root( fullpath ): - ret = DeepLakeQueryDataset( + ret = IndraDatasetView( deeplake_ds=self.deeplake_ds, indra_ds=self.indra_ds, index=self.index, @@ -179,7 +179,7 @@ def __getitem__( ) for x in item ] - ret = DeepLakeQueryDataset( + ret = IndraDatasetView( deeplake_ds=self.deeplake_ds, indra_ds=self.indra_ds, enabled_tensors=enabled_tensors, @@ -197,7 +197,7 @@ def __getitem__( warnings.warn( "Indexing by integer in a for loop, like `for i in range(len(ds)): ... ds[i]` can be quite slow. Use `for i, sample in enumerate(ds)` instead." 
) - ret = DeepLakeQueryDataset( + ret = IndraDatasetView( deeplake_ds=self.deeplake_ds, indra_ds=self.indra_ds[item], index=self.index[item], @@ -349,11 +349,11 @@ def _tensors( original_keys = set(original_tensors.keys()) for t in indra_tensors: if t.name in original_keys: - original_tensors[t.name] = DeepLakeQueryTensor( + original_tensors[t.name] = IndraTensorView( original_tensors[t.name], t, index=self.index ) else: - original_tensors[t.name] = DeepLakeQueryTensor( + original_tensors[t.name] = IndraTensorView( None, t, index=self.index ) return original_tensors @@ -391,4 +391,4 @@ def random_split(self, lengths: Sequence[Union[int, float]]): lengths = calculate_absolute_lengths(lengths, len(self)) vs = self.indra_ds.random_split(lengths) - return [DeepLakeQueryDataset(self.deeplake_ds, v) for v in vs] + return [IndraDatasetView(self.deeplake_ds, v) for v in vs] diff --git a/deeplake/core/dataset/deeplake_query_tensor.py b/deeplake/core/dataset/indra_tensor_view.py similarity index 98% rename from deeplake/core/dataset/deeplake_query_tensor.py rename to deeplake/core/dataset/indra_tensor_view.py index 3a369a57e5..5b0738959a 100644 --- a/deeplake/core/dataset/deeplake_query_tensor.py +++ b/deeplake/core/dataset/indra_tensor_view.py @@ -11,7 +11,7 @@ import json -class DeepLakeQueryTensor(tensor.Tensor): +class IndraTensorView(tensor.Tensor): def __init__( self, deeplake_tensor, @@ -57,7 +57,7 @@ def __getitem__( indra_tensor = self.indra_tensor[item] - return DeepLakeQueryTensor( + return IndraTensorView( self.deeplake_tensor, indra_tensor, index=self.index[item], diff --git a/deeplake/core/tests/test_deeplake_indra_dataset.py b/deeplake/core/tests/test_deeplake_indra_dataset.py index 973ff353ac..6a5b2535b4 100644 --- a/deeplake/core/tests/test_deeplake_indra_dataset.py +++ b/deeplake/core/tests/test_deeplake_indra_dataset.py @@ -6,7 +6,7 @@ EmptyTokenException, ) -from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset +from deeplake.core.dataset.indra_dataset_view import IndraDatasetView import random import math import pytest @@ -23,7 +23,7 @@ def test_indexing(local_auth_ds_generator): deeplake_ds.label.append(int(100 * random.uniform(0.0, 1.0))) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = DeepLakeQueryDataset(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) assert len(deeplake_indra_ds) == len(indra_ds) @@ -70,7 +70,7 @@ def test_save_view(local_auth_ds_generator): deeplake_ds.commit("First") indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = DeepLakeQueryDataset(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) deeplake_indra_ds.save_view() assert ( deeplake_indra_ds.base_storage["queries.json"] @@ -108,7 +108,7 @@ def test_load_view(local_auth_ds_generator): deeplake_ds.commit("First") indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = DeepLakeQueryDataset(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) with pytest.raises(Exception): dataloader = deeplake_indra_ds.pytorch() @@ -158,7 +158,7 @@ def test_query(local_auth_ds_generator): deeplake_ds.image.append(np.random.randint(0, 255, (100, 200, 3), np.uint8)) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = DeepLakeQueryDataset(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = 
IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) view = deeplake_indra_ds.query("SELECT * GROUP BY label") assert len(view) == 10 @@ -193,7 +193,7 @@ def test_metadata(local_auth_ds_generator): ) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = DeepLakeQueryDataset(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) assert deeplake_indra_ds.label.htype == "generic" assert deeplake_indra_ds.label.dtype == np.int32 assert deeplake_indra_ds.label.sample_compression == None @@ -219,7 +219,7 @@ def test_accessing_data(local_auth_ds_generator): deeplake_ds.label.append(int(100 * random.uniform(0.0, 1.0))) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = DeepLakeQueryDataset(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) assert np.all( np.isclose(deeplake_indra_ds.label.numpy(), deeplake_indra_ds["label"].numpy()) diff --git a/deeplake/core/tests/test_query.py b/deeplake/core/tests/test_query.py index 07223112ad..389e280397 100644 --- a/deeplake/core/tests/test_query.py +++ b/deeplake/core/tests/test_query.py @@ -1,6 +1,6 @@ import deeplake from deeplake.tests.common import requires_libdeeplake -from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset +from deeplake.core.dataset.indra_dataset_view import IndraDatasetView from deeplake.client.client import DeepLakeBackendClient import pytest import numpy as np diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index 0b3ad24bea..e39494f1f6 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -5,7 +5,7 @@ from deeplake.core.vectorstore.vector_search.indra import query from deeplake.core.vectorstore.vector_search import utils from deeplake.core.dataset import Dataset as DeepLakeDataset -from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset +from deeplake.core.dataset.indra_dataset_view import IndraDatasetView class SearchBasic(ABC): @@ -105,7 +105,7 @@ class SearchIndra(SearchBasic): def _get_view(self, tql_query, runtime: Optional[Dict] = None): indra_dataset = self._get_indra_dataset() indra_view = indra_dataset.query(tql_query) - view = DeepLakeQueryDataset( + view = IndraDatasetView( deeplake_ds=self.deeplake_dataset, indra_ds=indra_view ) view._tql_query = tql_query diff --git a/deeplake/enterprise/libdeeplake_query.py b/deeplake/enterprise/libdeeplake_query.py index 42c8c05b1f..57ec98bb94 100644 --- a/deeplake/enterprise/libdeeplake_query.py +++ b/deeplake/enterprise/libdeeplake_query.py @@ -1,5 +1,5 @@ from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake -from deeplake.core.dataset.deeplake_query_dataset import DeepLakeQueryDataset +from deeplake.core.dataset.indra_dataset_view import IndraDatasetView from typing import Optional, Union from deeplake.constants import INDRA_DATASET_SAMPLES_THRESHOLD @@ -35,7 +35,7 @@ def query(dataset, query_string: str): >>> ds_train = deeplake.load('hub://activeloop/coco-train') >>> query_ds_train = query(ds_train, "(select * where contains(categories, 'car') limit 1000) union (select * where contains(categories, 'motorcycle') limit 1000)") """ - if isinstance(dataset, DeepLakeQueryDataset): + if isinstance(dataset, IndraDatasetView): ds = 
dataset.indra_ds elif dataset.libdeeplake_dataset is not None: ds = dataset.libdeeplake_dataset @@ -49,11 +49,11 @@ def query(dataset, query_string: str): dsv = ds.query(query_string) from deeplake.enterprise.convert_to_libdeeplake import INDRA_API - if not isinstance(dataset, DeepLakeQueryDataset) and INDRA_API.tql.parse(query_string).is_filter and len(dsv.indexes) < INDRA_DATASET_SAMPLES_THRESHOLD: # type: ignore + if not isinstance(dataset, IndraDatasetView) and INDRA_API.tql.parse(query_string).is_filter and len(dsv.indexes) < INDRA_DATASET_SAMPLES_THRESHOLD: # type: ignore indexes = list(dsv.indexes) return dataset.no_view_dataset[indexes] else: - view = DeepLakeQueryDataset(deeplake_ds=dataset, indra_ds=dsv) + view = IndraDatasetView(deeplake_ds=dataset, indra_ds=dsv) view._tql_query = query_string if hasattr(dataset, "is_actually_cloud"): view.is_actually_cloud = dataset.is_actually_cloud @@ -158,6 +158,6 @@ def universal_query(query_string: str, token: Optional[str]): api = import_indra_api() dsv = api.tql.query(query_string, token) - view = DeepLakeQueryDataset(deeplake_ds=None, indra_ds=dsv) + view = IndraDatasetView(deeplake_ds=None, indra_ds=dsv) view._tql_query = query_string return view From ad30bb893a2ae85cb1be380306ccd3d78a496ccb Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 5 Jan 2024 17:34:13 +0400 Subject: [PATCH 02/62] Fixed black. --- deeplake/core/dataset/indra_dataset_view.py | 8 ++------ ...st_deeplake_indra_dataset.py => test_indra_dataset.py} | 0 .../vectorstore/vector_search/indra/search_algorithm.py | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) rename deeplake/core/tests/{test_deeplake_indra_dataset.py => test_indra_dataset.py} (100%) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 2ae60d885a..bcb9798bc0 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -97,9 +97,7 @@ def _get_tensor_from_root(self, fullpath): except: pass indra_tensor = tensor - return IndraTensorView( - deeplake_tensor, indra_tensor, index=self.index - ) + return IndraTensorView(deeplake_tensor, indra_tensor, index=self.index) def pytorch( self, @@ -353,9 +351,7 @@ def _tensors( original_tensors[t.name], t, index=self.index ) else: - original_tensors[t.name] = IndraTensorView( - None, t, index=self.index - ) + original_tensors[t.name] = IndraTensorView(None, t, index=self.index) return original_tensors def __str__(self): diff --git a/deeplake/core/tests/test_deeplake_indra_dataset.py b/deeplake/core/tests/test_indra_dataset.py similarity index 100% rename from deeplake/core/tests/test_deeplake_indra_dataset.py rename to deeplake/core/tests/test_indra_dataset.py diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index e39494f1f6..956aee99c9 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -105,9 +105,7 @@ class SearchIndra(SearchBasic): def _get_view(self, tql_query, runtime: Optional[Dict] = None): indra_dataset = self._get_indra_dataset() indra_view = indra_dataset.query(tql_query) - view = IndraDatasetView( - deeplake_ds=self.deeplake_dataset, indra_ds=indra_view - ) + view = IndraDatasetView(deeplake_ds=self.deeplake_dataset, indra_ds=indra_view) view._tql_query = tql_query return view From 6ca7f5c2062bdcaf65d68eab9517c8bcae879c06 Mon 
Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 22 Jan 2024 19:20:07 +0000 Subject: [PATCH 03/62] Added indra storage provider. --- deeplake/core/storage/indra.py | 53 ++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 deeplake/core/storage/indra.py diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py new file mode 100644 index 0000000000..5916df4ded --- /dev/null +++ b/deeplake/core/storage/indra.py @@ -0,0 +1,53 @@ +from deeplake.core.storage.provider import StorageProvider +from indra.api import storage +from typing import Optional, Union + +class IndraProvider(StorageProvider): + """Provider class for using Indra storage provider.""" + + def __init__( + self, + root: Union[str, storage.provider], + read_only: Optional[bool] = False, + **kwargs, + ): + if isinstance(root, str): + self.core = storage.create(root, read_only, **kwargs) + else: + self.core = root + + def subdir(self, path: str, read_only: bool = False): + return IndraProvider(self.core.subdir(path, read_only)) + + def __setitem__(self, path, content): + self.check_readonly() + self.core.set(path, content) + + def __getitem__(self, path): + return bytes(self.core.get(path)) + + def get_bytes( + self, path, start_byte: Optional[int] = None, end_byte: Optional[int] = None + ): + s = start_byte or 0 + e = end_byte or 0 + return bytes(self.core.get(path, s, e)) + + def get_object_size(self, path: str) -> int: + return self.core.length(path) + + def __delitem__(self, path): + return self.core.remove(path) + + def _all_keys(self): + return self.core.list("") + + def __len__(self): + return len(self.core.list("")) + + def __iter__(self): + return iter(self.core.list("")) + + def clear(self, prefix=""): + self.core.clear(prefix) + From fed36e6c647b6625db891895cc9084284eacf93a Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 22 Jan 2024 20:05:23 +0000 Subject: [PATCH 04/62] Careful switch to indra provider. 
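For context on the provider introduced in PATCH 03: it is meant to be a drop-in, mapping-style StorageProvider over the native engine. A minimal usage sketch, assuming the proprietary `indra.api.storage` module is installed and that `storage.create` accepts a plain filesystem path as shown above (the path below is hypothetical):

```python
# Hedged usage sketch for the IndraProvider added in PATCH 03; the byte-level
# get/set calls mirror the mapping interface defined in the diff above.
from deeplake.core.storage.indra import IndraProvider

provider = IndraProvider("/tmp/indra_demo", read_only=False)  # hypothetical path
provider["dataset_meta.json"] = b'{"tensors": []}'  # routes to core.set(...)
meta = provider["dataset_meta.json"]                # routes to bytes(core.get(...))
print(meta, provider.get_object_size("dataset_meta.json"))
print(list(provider))                               # keys come from core.list("")
```

Keeping the class a thin shim over `self.core` lets the same wrapper serve both fresh paths and pre-built native providers, which is why the constructor accepts `Union[str, storage.provider]`.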
--- deeplake/core/storage/indra.py | 17 +++++++++++++---- deeplake/util/storage.py | 11 ++++++++--- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 5916df4ded..0469670649 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -2,6 +2,7 @@ from indra.api import storage from typing import Optional, Union + class IndraProvider(StorageProvider): """Provider class for using Indra storage provider.""" @@ -24,17 +25,26 @@ def __setitem__(self, path, content): self.core.set(path, content) def __getitem__(self, path): - return bytes(self.core.get(path)) + try: + return bytes(self.core.get(path)) + except RuntimeError as e: + raise KeyError(path) def get_bytes( self, path, start_byte: Optional[int] = None, end_byte: Optional[int] = None ): s = start_byte or 0 e = end_byte or 0 - return bytes(self.core.get(path, s, e)) + try: + return bytes(self.core.get(path, s, e)) + except RuntimeError as e: + raise KeyError(path) def get_object_size(self, path: str) -> int: - return self.core.length(path) + try: + return self.core.length(path) + except RuntimeError as e: + raise KeyError(path) def __delitem__(self, path): return self.core.remove(path) @@ -50,4 +60,3 @@ def __iter__(self): def clear(self, prefix=""): self.core.clear(prefix) - diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index f9877f8796..d834b3fe6d 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -54,9 +54,14 @@ def storage_provider_from_path( if creds is None: creds = {} if path.startswith("hub://"): - storage: StorageProvider = storage_provider_from_hub_path( - path, read_only, db_engine=db_engine, token=token, creds=creds - ) + if read_only and not db_engine: + from deeplake.core.storage.indra import IndraProvider + + storage = IndraProvider(path, read_only=True, token=token, creds=creds) + else: + storage: StorageProvider = storage_provider_from_hub_path( + path, read_only, db_engine=db_engine, token=token, creds=creds + ) else: if path.startswith("s3://"): creds_used = "PLATFORM" From 3766f29e4bc4659aa646f8d92900bc28bb2c8719 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 22 Jan 2024 22:20:24 +0000 Subject: [PATCH 05/62] Fixed lint. --- deeplake/util/storage.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index d834b3fe6d..b093c56cab 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -57,9 +57,9 @@ def storage_provider_from_path( if read_only and not db_engine: from deeplake.core.storage.indra import IndraProvider - storage = IndraProvider(path, read_only=True, token=token, creds=creds) + storage: StorageProvider = IndraProvider(path, read_only=True, token=token, creds=creds) else: - storage: StorageProvider = storage_provider_from_hub_path( + storage = storage_provider_from_hub_path( path, read_only, db_engine=db_engine, token=token, creds=creds ) else: From ffee1aeabe513d5c3405a175c4a21c81c9cad4eb Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 22 Jan 2024 22:31:29 +0000 Subject: [PATCH 06/62] One more fix. 
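A note on the error translation in PATCH 04: the rest of the codebase treats storage providers as dict-like objects and uses KeyError as the miss signal, so native RuntimeErrors must not leak through `__getitem__`. A small illustration of the contract being preserved (sketch only; `provider` is any StorageProvider-compatible object):

```python
# KeyError is the cache-miss signal expected by provider consumers; after
# PATCH 04 the IndraProvider satisfies it instead of leaking RuntimeError.
def read_or_default(provider, key: str, default: bytes = b"") -> bytes:
    try:
        return provider[key]    # raises KeyError on a missing object
    except KeyError:            # a raw RuntimeError would bypass this handler
        return default
```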
--- deeplake/core/storage/indra.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 0469670649..a8e4616154 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -1,5 +1,5 @@ from deeplake.core.storage.provider import StorageProvider -from indra.api import storage +from indra.api import storage # type: ignore from typing import Optional, Union From 291c6509bf462b4d7f1519ccce349f7b54d040c3 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Tue, 23 Jan 2024 05:50:15 +0000 Subject: [PATCH 07/62] Fixed black. --- deeplake/core/storage/indra.py | 2 +- deeplake/util/storage.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index a8e4616154..97edadb5ae 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -1,5 +1,5 @@ from deeplake.core.storage.provider import StorageProvider -from indra.api import storage # type: ignore +from indra.api import storage # type: ignore from typing import Optional, Union diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index b093c56cab..f576e98b35 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -57,7 +57,9 @@ def storage_provider_from_path( if read_only and not db_engine: from deeplake.core.storage.indra import IndraProvider - storage: StorageProvider = IndraProvider(path, read_only=True, token=token, creds=creds) + storage: StorageProvider = IndraProvider( + path, read_only=True, token=token, creds=creds + ) else: storage = storage_provider_from_hub_path( path, read_only, db_engine=db_engine, token=token, creds=creds From 6516620d1a08db16d6e7413630502b075f02af88 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Tue, 23 Jan 2024 11:14:16 +0000 Subject: [PATCH 08/62] Switch local read only to Indra. --- deeplake/core/storage/indra.py | 3 +++ deeplake/util/storage.py | 8 +++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 97edadb5ae..635c48292d 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -17,6 +17,9 @@ def __init__( else: self.core = root + def copy(self): + return IndraProvider(self.core) + def subdir(self, path: str, read_only: bool = False): return IndraProvider(self.core.subdir(path, read_only)) diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index f576e98b35..acc8e9aa92 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -53,10 +53,9 @@ def storage_provider_from_path( """ if creds is None: creds = {} + from deeplake.core.storage.indra import IndraProvider if path.startswith("hub://"): if read_only and not db_engine: - from deeplake.core.storage.indra import IndraProvider - storage: StorageProvider = IndraProvider( path, read_only=True, token=token, creds=creds ) @@ -110,7 +109,10 @@ def storage_provider_from_path( storage = MemoryProvider(path) else: if not os.path.exists(path) or os.path.isdir(path): - storage = LocalProvider(path) + if read_only: + storage = IndraProvider(path) + else: + storage = LocalProvider(path) else: raise ValueError( f"Local path {path} must be a path to a local directory" From d1bff43d268306d3d48d224a9e7b85969601e8dc Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Tue, 23 Jan 2024 11:55:35 +0000 Subject: [PATCH 09/62] Fixed black. 
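PATCH 08 makes read-only opens of existing local directories route through the native provider, while writable opens keep using LocalProvider. An illustrative, hedged check of that dispatch, using the keyword names from the `storage_provider_from_path` signature shown above (the path is hypothetical and must exist as a directory or not at all):

```python
# Expected routing after PATCH 08: read-only local paths -> IndraProvider,
# writable local paths -> LocalProvider. Sketch, not a definitive contract.
from deeplake.util.storage import storage_provider_from_path

ro = storage_provider_from_path("/tmp/indra_demo", creds={}, read_only=True)
rw = storage_provider_from_path("/tmp/indra_demo", creds={}, read_only=False)
print(type(ro).__name__, type(rw).__name__)  # IndraProvider vs. LocalProvider
```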
--- deeplake/util/storage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index acc8e9aa92..15887b5e0b 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -54,6 +54,7 @@ def storage_provider_from_path( if creds is None: creds = {} from deeplake.core.storage.indra import IndraProvider + if path.startswith("hub://"): if read_only and not db_engine: storage: StorageProvider = IndraProvider( From 284283615db96f3b1080e7203e51cfacfadd7bcb Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 2 Feb 2024 20:27:05 +0000 Subject: [PATCH 10/62] Added v4 arg. --- deeplake/api/dataset.py | 7 ++++ deeplake/constants.py | 2 + deeplake/core/storage/indra.py | 2 + .../core/vectorstore/deeplake_vectorstore.py | 3 ++ deeplake/util/storage.py | 38 +++++++++++-------- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index 2869b7100c..cca1fb9158 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -45,6 +45,7 @@ DEFAULT_READONLY, DATASET_META_FILENAME, DATASET_LOCK_FILENAME, + USE_V4, ) from deeplake.util.access_method import ( check_access_method, @@ -375,6 +376,7 @@ def empty( lock_timeout: Optional[int] = 0, verbose: bool = True, index_params: Optional[Dict[str, Union[int, str]]] = None, + v4: bool = USE_V4, ) -> Dataset: """Creates an empty dataset @@ -399,6 +401,7 @@ def empty( lock_timeout (int): Number of seconds to wait before throwing a LockException. If None, wait indefinitely lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally. index_params: Optional[Dict[str, Union[int, str]]]: Index parameters used while creating vector store, passed down to dataset. + v4 (bool): Flag indicating whether v4 api should be used to create the dataset. Defaults to false Returns: Dataset: Dataset created using the arguments provided. @@ -438,6 +441,7 @@ def empty( token=token, memory_cache_size=memory_cache_size, local_cache_size=local_cache_size, + v4=v4, ) feature_report_path( @@ -500,6 +504,7 @@ def load( access_method: str = "stream", unlink: bool = False, reset: bool = False, + v4: bool = USE_V4, check_integrity: Optional[bool] = None, lock_timeout: Optional[int] = 0, lock_enabled: Optional[bool] = True, @@ -570,6 +575,7 @@ def load( setting ``reset=True`` will reset HEAD changes and load the previous version. check_integrity (bool, Optional): Performs an integrity check by default (None) if the dataset has 20 or fewer tensors. Set to ``True`` to force integrity check, ``False`` to skip integrity check. + v4 (bool): Flag indicating whether v4 api should be used to create the dataset. Defaults to false .. # noqa: DAR101 @@ -616,6 +622,7 @@ def load( token=token, memory_cache_size=memory_cache_size, local_cache_size=local_cache_size, + v4=v4, ) feature_report_path( path, diff --git a/deeplake/constants.py b/deeplake/constants.py index c669d0df82..8ab8235888 100644 --- a/deeplake/constants.py +++ b/deeplake/constants.py @@ -349,3 +349,5 @@ # Size of dataset view to expose as indra dataset wrapper. 
INDRA_DATASET_SAMPLES_THRESHOLD = 10000000 + +USE_V4 = os.environ.get("DEEPLAKE_USE_V4", "false").strip().lower() == "true" diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 635c48292d..42e70cd118 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -25,6 +25,8 @@ def subdir(self, path: str, read_only: bool = False): def __setitem__(self, path, content): self.check_readonly() + if not isinstance(content, bytes): + content = bytes(content) self.core.set(path, content) def __getitem__(self, path): diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index 8deda2b81f..f220b06318 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -12,6 +12,7 @@ DEFAULT_VECTORSTORE_TENSORS, MAX_BYTES_PER_MINUTE, TARGET_BYTE_SIZE, + USE_V4, ) from deeplake.util.bugout_reporter import feature_report_path from deeplake.util.exceptions import DeepMemoryAccessError @@ -41,6 +42,7 @@ def __init__( org_id: Optional[str] = None, logger: logging.Logger = logger, branch: str = "main", + v4: bool = USE_V4, **kwargs: Any, ) -> None: """Creates an empty VectorStore or loads an existing one if it exists at the specified ``path``. @@ -98,6 +100,7 @@ def __init__( - If 'ENV' is passed, credentials are fetched from the environment variables. This is also the case when creds is not passed for cloud datasets. For datasets connected to hub cloud, specifying 'ENV' will override the credentials fetched from Activeloop and use local ones. runtime (Dict, optional): Parameters for creating the Vector Store in Deep Lake's Managed Tensor Database. Not applicable when loading an existing Vector Store. To create a Vector Store in the Managed Tensor Database, set `runtime = {"tensor_db": True}`. branch (str): Branch name to use for the Vector Store. Defaults to "main". + v4 (bool): Flag indicating whether v4 api should be used to create the underlying dataset. Defaults to false **kwargs (Any): Additional keyword arguments. diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index 15887b5e0b..4f76db78f5 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -28,6 +28,7 @@ def storage_provider_from_path( token: Optional[str] = None, is_hub_path: bool = False, db_engine: bool = False, + v4: bool = False, ): """Construct a StorageProvider given a path. 
@@ -53,17 +54,17 @@ def storage_provider_from_path( """ if creds is None: creds = {} - from deeplake.core.storage.indra import IndraProvider - if path.startswith("hub://"): - if read_only and not db_engine: - storage: StorageProvider = IndraProvider( - path, read_only=True, token=token, creds=creds - ) - else: - storage = storage_provider_from_hub_path( - path, read_only, db_engine=db_engine, token=token, creds=creds - ) + if v4: + from deeplake.core.storage.indra import IndraProvider + + storage: StorageProvider = IndraProvider( + path, read_only=True, token=token, creds=creds + ) + elif path.startswith("hub://"): + storage = storage_provider_from_hub_path( + path, read_only, db_engine=db_engine, token=token, creds=creds + ) else: if path.startswith("s3://"): creds_used = "PLATFORM" @@ -110,10 +111,7 @@ def storage_provider_from_path( storage = MemoryProvider(path) else: if not os.path.exists(path) or os.path.isdir(path): - if read_only: - storage = IndraProvider(path) - else: - storage = LocalProvider(path) + storage = LocalProvider(path) else: raise ValueError( f"Local path {path} must be a path to a local directory" @@ -133,7 +131,7 @@ def get_dataset_credentials( mode: Optional[str], db_engine: bool, ): - # this will give the proper url (s3, gcs, etc) and corresponding creds, depending on where the dataset is stored. + # this will give the proper url(s3, gcs, etc) and corresponding creds, depending on where the dataset is stored. try: url, final_creds, mode, expiration, repo = client.get_dataset_credentials( org_id, ds_name, mode=mode, db_engine={"enabled": db_engine} @@ -152,6 +150,7 @@ def storage_provider_from_hub_path( db_engine: bool = False, token: Optional[str] = None, creds: Optional[Union[dict, str]] = None, + v4: bool = False, ): path, org_id, ds_name, subdir = process_hub_path(path) client = DeepLakeBackendClient(token=token) @@ -195,7 +194,12 @@ def storage_provider_from_hub_path( print(msg) storage = storage_provider_from_path( - path=url, creds=final_creds, read_only=read_only, is_hub_path=True, token=token + path=url, + creds=final_creds, + read_only=read_only, + is_hub_path=True, + token=token, + v4=v4, ) storage.creds_used = creds_used if creds_used == "PLATFORM": @@ -212,6 +216,7 @@ def get_storage_and_cache_chain( memory_cache_size, local_cache_size, db_engine=False, + v4=False, ): """ Returns storage provider and cache chain for a given path, according to arguments passed. @@ -236,6 +241,7 @@ def get_storage_and_cache_chain( creds=creds, read_only=read_only, token=token, + v4=v4, ) memory_cache_size_bytes = memory_cache_size * MB local_cache_size_bytes = local_cache_size * MB From abfed5b50dfb9a2e484a3065b04487999a067607 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sat, 3 Feb 2024 05:33:51 +0000 Subject: [PATCH 11/62] Fixed darglint. --- deeplake/util/storage.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index 4f76db78f5..a555e939a0 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -40,6 +40,7 @@ def storage_provider_from_path( token (str): token for authentication into activeloop. is_hub_path (bool): Whether the path points to a Deep Lake dataset. db_engine (bool): Whether to use Activeloop DB Engine. Only applicable for hub:// paths. + v4 (bool): If true creates v4 storage provider. Returns: If given a path starting with s3:// returns the S3Provider. 
@@ -230,6 +231,7 @@ def get_storage_and_cache_chain( memory_cache_size (int): The size of the in-memory cache to use. local_cache_size (int): The size of the local cache to use. db_engine (bool): Whether to use Activeloop DB Engine, only applicable for hub:// paths. + v4 (bool): If true creates v4 storage provider. Returns: A tuple of the storage provider and the storage chain. From de92f246228f6549cda1d29e57039dcc542ad206 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 4 Feb 2024 23:26:13 +0400 Subject: [PATCH 12/62] Fix. --- deeplake/core/dataset/indra_dataset_view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 57fd065795..9904d2737e 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -42,7 +42,7 @@ def __init__( enabled_tensors=None, index: Optional[Index] = None, ): - if isinstance(deeplake_ds, DeepLakeQueryDataset): + if isinstance(deeplake_ds, IndraDatasetView): deeplake_ds = deeplake_ds.deeplake_ds d: Dict[str, Any] = {} d["deeplake_ds"] = deeplake_ds From 6260cb7418e1d56fe6a911ae9a7958a880fd3697 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Tue, 13 Feb 2024 11:57:29 +0000 Subject: [PATCH 13/62] Minor. --- deeplake/core/storage/indra.py | 2 +- deeplake/util/storage.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 42e70cd118..af92e899a8 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -32,7 +32,7 @@ def __setitem__(self, path, content): def __getitem__(self, path): try: return bytes(self.core.get(path)) - except RuntimeError as e: + except Exception as e: raise KeyError(path) def get_bytes( diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index 0379c0f053..c5d5fc5143 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -60,7 +60,7 @@ def storage_provider_from_path( from deeplake.core.storage.indra import IndraProvider storage: StorageProvider = IndraProvider( - path, read_only=True, token=token, creds=creds + path, read_only=read_only, token=token, creds=creds ) elif path.startswith("hub://"): storage = storage_provider_from_hub_path( From 36fef4d9e1570e5dd55e948cb61c73a9bbe2908b Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Wed, 14 Feb 2024 10:39:51 +0000 Subject: [PATCH 14/62] Cleanup indra provider. 
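One provider detail worth pinning down before the cleanup in PATCH 14: `get_bytes` forwards `(start_byte, end_byte)` to `core.get(path, s, e)` with `None` coerced to 0 by the `or 0` defaulting from PATCH 03, so 0 appears to act as "unbounded" on the native side. A hedged sketch of range reads (hypothetical path and key; the byte semantics are an assumption, not verified behavior of the closed-source engine):

```python
# Range reads through IndraProvider.get_bytes; end_byte=None becomes 0, which
# the native core.get seems to treat as "read to the end". Assumption only.
from deeplake.core.storage.indra import IndraProvider

provider = IndraProvider("/tmp/indra_demo", read_only=True)  # hypothetical path
head = provider.get_bytes("dataset_meta.json", start_byte=0, end_byte=16)
rest = provider.get_bytes("dataset_meta.json", start_byte=16)  # end defaults to 0
```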
--- deeplake/core/storage/indra.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index af92e899a8..49501be65d 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -25,15 +25,10 @@ def subdir(self, path: str, read_only: bool = False): def __setitem__(self, path, content): self.check_readonly() - if not isinstance(content, bytes): - content = bytes(content) - self.core.set(path, content) + self.core.set(path, bytes(content)) def __getitem__(self, path): - try: - return bytes(self.core.get(path)) - except Exception as e: - raise KeyError(path) + return bytes(self.core.get(path)) def get_bytes( self, path, start_byte: Optional[int] = None, end_byte: Optional[int] = None @@ -46,10 +41,7 @@ def get_bytes( raise KeyError(path) def get_object_size(self, path: str) -> int: - try: - return self.core.length(path) - except RuntimeError as e: - raise KeyError(path) + return self.core.length(path) def __delitem__(self, path): return self.core.remove(path) From da2af7e688486ce020726459aaf2935b8c724e04 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 15 Feb 2024 18:49:00 +0000 Subject: [PATCH 15/62] Fixes. --- deeplake/core/storage/indra.py | 4 ++++ deeplake/enterprise/convert_to_libdeeplake.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 49501be65d..0dc415b5d4 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -17,6 +17,10 @@ def __init__( else: self.core = root + @property + def path(self): + return self.core.path + def copy(self): return IndraProvider(self.core) diff --git a/deeplake/enterprise/convert_to_libdeeplake.py b/deeplake/enterprise/convert_to_libdeeplake.py index 93a35ab9f0..127a270603 100644 --- a/deeplake/enterprise/convert_to_libdeeplake.py +++ b/deeplake/enterprise/convert_to_libdeeplake.py @@ -4,6 +4,7 @@ from deeplake.core.storage.gcs import GCSProvider from deeplake.enterprise.util import raise_indra_installation_error # type: ignore from deeplake.core.storage import S3Provider +from deeplake.core.storage.indra import IndraProvider from deeplake.core.storage.azure import AzureProvider from deeplake.util.remove_cache import get_base_storage from deeplake.util.exceptions import EmptyTokenException @@ -45,6 +46,11 @@ def import_indra_api(): INDRA_INSTALLED = bool(importlib.util.find_spec("indra")) +def _get_indra_ds_from_native_provider(provider: IndraProvider): + api = import_indra_api() + return api.dataset(provider.core) + + def _get_indra_ds_from_azure_provider( path: str, token: str, @@ -182,6 +188,10 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): libdeeplake_dataset = _get_indra_ds_from_azure_provider( path=path, token=token, provider=provider ) + elif isinstance(provider, IndraProvider): + libdeeplake_dataset = _get_indra_ds_from_native_provider( + provider=provider + ) else: raise ValueError("Unknown storage provider for hub:// dataset") From 1d371b57061b8ec69148d0e7999466dd1f539f56 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 15 Feb 2024 19:00:32 +0000 Subject: [PATCH 16/62] Bump libdeeplake version. 
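The fast path added in PATCH 15 skips a whole round of credential plumbing: when a dataset's base storage is already an IndraProvider, its native handle goes straight to the engine. A condensed sketch of that dispatch (names mirror those in `convert_to_libdeeplake.py`; the S3/GCS/Azure branches are elided):

```python
# Condensed dispatch after PATCH 15: IndraProvider-backed datasets are opened
# directly from their native handle; cloud branches are omitted in this sketch.
from deeplake.core.storage.indra import IndraProvider
from deeplake.enterprise.convert_to_libdeeplake import import_indra_api
from deeplake.util.remove_cache import get_base_storage

def to_libdeeplake_sketch(ds):
    provider = get_base_storage(ds.storage)
    if isinstance(provider, IndraProvider):
        return import_indra_api().dataset(provider.core)  # native fast path
    raise NotImplementedError("cloud-provider branches elided here")
```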
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8970052075..6aeb5a47f4 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def libdeeplake_available(): extras_require["all"] = [req_map[r] for r in all_extras] if libdeeplake_available(): - libdeeplake = "libdeeplake==0.0.99" + libdeeplake = "libdeeplake==0.0.100" extras_require["enterprise"] = [libdeeplake, "pyjwt"] extras_require["all"].append(libdeeplake) install_requires.append(libdeeplake) From e40107955b93d676304f614726afca5243741310 Mon Sep 17 00:00:00 2001 From: zaaram <aram@activeloop.dev> Date: Thu, 15 Feb 2024 23:45:53 +0400 Subject: [PATCH 17/62] v4 CI Tests --- .github/workflows/test-push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-push.yml b/.github/workflows/test-push.yml index 1bf9b2438c..1025de7985 100644 --- a/.github/workflows/test-push.yml +++ b/.github/workflows/test-push.yml @@ -129,7 +129,7 @@ jobs: test: name: Test needs: setup - uses: activeloopai/shared-github-actions/.github/workflows/full_test.yml@main + uses: activeloopai/shared-github-actions/.github/workflows/full_test.yml@v4 if: github.repository == 'activeloopai/deeplake' with: repo: ${{ github.repository }} From 7e78642bbdd656c9c6be421b4c6ebdc9995946fc Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 18 Feb 2024 14:32:46 +0000 Subject: [PATCH 18/62] Bump libdeeplake version for test. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 30be2e8b52..ec51c17caa 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def libdeeplake_available(): extras_require["all"] = [req_map[r] for r in all_extras] if libdeeplake_available(): - libdeeplake = "libdeeplake==0.0.101" + libdeeplake = "libdeeplake==0.0.102" extras_require["enterprise"] = [libdeeplake, "pyjwt"] extras_require["all"].append(libdeeplake) install_requires.append(libdeeplake) From 2599b9b1e2bc3a5e53318e1f82ca026bcf964539 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 19 Feb 2024 13:22:55 +0000 Subject: [PATCH 19/62] Reimplement rename with deepcopy+delete. --- deeplake/api/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index cca1fb9158..ba40d72191 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -811,10 +811,10 @@ def rename( feature_report_path(old_path, "rename", {}, token=token) - ds = deeplake.load(old_path, verbose=False, token=token, creds=creds) - ds.rename(new_path) + deeplake.deepcopy(old_path, new_path, verbose=False, token=token, creds=creds) + deeplake.delete(old_path, token=token, creds=creds) - return ds # type: ignore + return deeplake.load(new_path, verbose=False, token=token, creds=creds) @staticmethod @spinner From c268f324b4204ab3c5135a21b119af8e15909f2d Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Tue, 20 Feb 2024 18:37:54 +0000 Subject: [PATCH 20/62] Bump libdeeplake version.
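PATCH 19 changes `rename` from an in-place operation into copy-then-delete, which is simpler across storage backends but no longer atomic and temporarily doubles storage. Call-site usage is unchanged (paths below are hypothetical):

```python
# After PATCH 19, deeplake.rename deep-copies to the new path, deletes the
# old dataset, and returns a fresh load of the copy.
import deeplake

ds = deeplake.rename("hub://my_org/old_name", "hub://my_org/new_name")
print(ds.path)  # hub://my_org/new_name
```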
--- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ec51c17caa..0b7e987d30 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def libdeeplake_available(): extras_require["all"] = [req_map[r] for r in all_extras] if libdeeplake_available(): - libdeeplake = "libdeeplake==0.0.102" + libdeeplake = "libdeeplake==0.0.103" extras_require["enterprise"] = [libdeeplake, "pyjwt"] extras_require["all"].append(libdeeplake) install_requires.append(libdeeplake) From 03f0f2d10fa77ba3aab5a19eca429508b637475b Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 25 Feb 2024 14:55:11 +0000 Subject: [PATCH 21/62] Switch to batch request for indra tensor bytes. --- deeplake/core/dataset/deeplake_query_tensor.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/deeplake/core/dataset/deeplake_query_tensor.py b/deeplake/core/dataset/deeplake_query_tensor.py index 3a369a57e5..76a31de272 100644 --- a/deeplake/core/dataset/deeplake_query_tensor.py +++ b/deeplake/core/dataset/deeplake_query_tensor.py @@ -80,18 +80,13 @@ def text(self, fetch_chunks: bool = False): """Return text data. Only applicable for tensors with 'text' base htype.""" if self.ndim == 1: return self.indra_tensor.bytes().decode() - return list( - self.indra_tensor[i].bytes().decode() for i in range(len(self.indra_tensor)) - ) + return list(b.decode() for b in self.indra_tensor.bytes()) def dict(self, fetch_chunks: bool = False): """Return json data. Only applicable for tensors with 'json' base htype.""" if self.ndim == 1: return json.loads(self.indra_tensor.bytes().decode()) - return list( - json.loads(self.indra_tensor[i].bytes().decode()) - for i in range(len(self.indra_tensor)) - ) + return list(json.loads(b.decode()) for b in self.indra_tensor.bytes()) @property def dtype(self): From 6c6b0dbe0f2d70f0e73af2b87f6a408842402680 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 25 Feb 2024 18:39:18 +0000 Subject: [PATCH 22/62] Bump libdeeplake version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 30be2e8b52..9e706957bd 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def libdeeplake_available(): extras_require["all"] = [req_map[r] for r in all_extras] if libdeeplake_available(): - libdeeplake = "libdeeplake==0.0.101" + libdeeplake = "libdeeplake==0.0.104" extras_require["enterprise"] = [libdeeplake, "pyjwt"] extras_require["all"].append(libdeeplake) install_requires.append(libdeeplake) From c3c5964843e5a2ad61198ad9f8f8929d47f6ee99 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 26 Feb 2024 10:34:35 +0000 Subject: [PATCH 23/62] Fixed tests. --- deeplake/core/dataset/deeplake_query_tensor.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/deeplake/core/dataset/deeplake_query_tensor.py b/deeplake/core/dataset/deeplake_query_tensor.py index 76a31de272..2ec182793d 100644 --- a/deeplake/core/dataset/deeplake_query_tensor.py +++ b/deeplake/core/dataset/deeplake_query_tensor.py @@ -78,14 +78,20 @@ def numpy( def text(self, fetch_chunks: bool = False): """Return text data. 
Only applicable for tensors with 'text' base htype.""" + bs = self.indra_tensor.bytes() if self.ndim == 1: - return self.indra_tensor.bytes().decode() - return list(b.decode() for b in self.indra_tensor.bytes()) + return bs.decode() + if isinstance(bs, bytes): + return [bs.decode()] + return list(b.decode() for b in bs) def dict(self, fetch_chunks: bool = False): """Return json data. Only applicable for tensors with 'json' base htype.""" + bs = self.indra_tensor.bytes() if self.ndim == 1: - return json.loads(self.indra_tensor.bytes().decode()) + return json.loads(bs.decode()) + if isinstance(bs, bytes): + return [json.loads(bs.decode())] return list(json.loads(b.decode()) for b in self.indra_tensor.bytes()) @property From 5de55847e48d789ec050d620e103af601035ad99 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 29 Feb 2024 11:05:43 +0000 Subject: [PATCH 24/62] Replace v4 flag with indra flag. --- deeplake/api/dataset.py | 14 +++++++------- deeplake/constants.py | 2 +- .../core/vectorstore/deeplake_vectorstore.py | 6 +++--- deeplake/util/storage.py | 16 ++++++++-------- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index 32f159dcd8..4bf4d99d0c 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -44,7 +44,7 @@ DEFAULT_READONLY, DATASET_META_FILENAME, DATASET_LOCK_FILENAME, - USE_V4, + USE_INDRA, ) from deeplake.util.access_method import ( check_access_method, @@ -380,7 +380,7 @@ def empty( lock_timeout: Optional[int] = 0, verbose: bool = True, index_params: Optional[Dict[str, Union[int, str]]] = None, - v4: bool = USE_V4, + indra: bool = USE_INDRA, ) -> Dataset: """Creates an empty dataset @@ -405,7 +405,7 @@ def empty( lock_timeout (int): Number of seconds to wait before throwing a LockException. If None, wait indefinitely lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally. index_params: Optional[Dict[str, Union[int, str]]]: Index parameters used while creating vector store, passed down to dataset. - v4 (bool): Flag indicating whether v4 api should be used to create the dataset. Defaults to false + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false Returns: Dataset: Dataset created using the arguments provided. @@ -445,7 +445,7 @@ def empty( token=token, memory_cache_size=memory_cache_size, local_cache_size=local_cache_size, - v4=v4, + indra=indra, ) feature_report_path( @@ -513,7 +513,7 @@ def load( access_method: str = "stream", unlink: bool = False, reset: bool = False, - v4: bool = USE_V4, + indra: bool = USE_INDRA, check_integrity: Optional[bool] = None, lock_timeout: Optional[int] = 0, lock_enabled: Optional[bool] = True, @@ -584,7 +584,7 @@ def load( setting ``reset=True`` will reset HEAD changes and load the previous version. check_integrity (bool, Optional): Performs an integrity check by default (None) if the dataset has 20 or fewer tensors. Set to ``True`` to force integrity check, ``False`` to skip integrity check. - v4 (bool): Flag indicating whether v4 api should be used to create the dataset. Defaults to false + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false .. 
# noqa: DAR101 @@ -631,7 +631,7 @@ def load( token=token, memory_cache_size=memory_cache_size, local_cache_size=local_cache_size, - v4=v4, + indra=indra, ) feature_report_path( path, diff --git a/deeplake/constants.py b/deeplake/constants.py index 4295a3eb23..e7c1f34612 100644 --- a/deeplake/constants.py +++ b/deeplake/constants.py @@ -353,4 +353,4 @@ # Size of dataset view to expose as indra dataset wrapper. INDRA_DATASET_SAMPLES_THRESHOLD = 10000000 -USE_V4 = os.environ.get("DEEPLAKE_USE_V4", "false").strip().lower() == "true" +USE_INDRA = os.environ.get("DEEPLAKE_USE_INDRA", "false").strip().lower() == "true" diff --git a/deeplake/core/vectorstore/deeplake_vectorstore.py b/deeplake/core/vectorstore/deeplake_vectorstore.py index b4f276d728..0378ca3d70 100644 --- a/deeplake/core/vectorstore/deeplake_vectorstore.py +++ b/deeplake/core/vectorstore/deeplake_vectorstore.py @@ -12,7 +12,7 @@ DEFAULT_VECTORSTORE_TENSORS, MAX_BYTES_PER_MINUTE, TARGET_BYTE_SIZE, - USE_V4, + USE_INDRA, ) from deeplake.util.bugout_reporter import feature_report_path from deeplake.util.exceptions import DeepMemoryAccessError @@ -42,7 +42,7 @@ def __init__( org_id: Optional[str] = None, logger: logging.Logger = logger, branch: str = "main", - v4: bool = USE_V4, + indra: bool = USE_INDRA, **kwargs: Any, ) -> None: """Creates an empty VectorStore or loads an existing one if it exists at the specified ``path``. @@ -105,7 +105,7 @@ def __init__( - If 'ENV' is passed, credentials are fetched from the environment variables. This is also the case when creds is not passed for cloud datasets. For datasets connected to hub cloud, specifying 'ENV' will override the credentials fetched from Activeloop and use local ones. runtime (Dict, optional): Parameters for creating the Vector Store in Deep Lake's Managed Tensor Database. Not applicable when loading an existing Vector Store. To create a Vector Store in the Managed Tensor Database, set `runtime = {"tensor_db": True}`. branch (str): Branch name to use for the Vector Store. Defaults to "main". - v4 (bool): Flag indicating whether v4 api should be used to create the underlying dataset. Defaults to false + indra (bool): Flag indicating whether indra api should be used to create the underlying dataset. Defaults to false **kwargs (dict): Additional keyword arguments. .. diff --git a/deeplake/util/storage.py b/deeplake/util/storage.py index c5d5fc5143..6c0acdd505 100644 --- a/deeplake/util/storage.py +++ b/deeplake/util/storage.py @@ -28,7 +28,7 @@ def storage_provider_from_path( token: Optional[str] = None, is_hub_path: bool = False, db_engine: bool = False, - v4: bool = False, + indra: bool = False, ): """Construct a StorageProvider given a path. @@ -40,7 +40,7 @@ def storage_provider_from_path( token (str): token for authentication into activeloop. is_hub_path (bool): Whether the path points to a Deep Lake dataset. db_engine (bool): Whether to use Activeloop DB Engine. Only applicable for hub:// paths. - v4 (bool): If true creates v4 storage provider. + indra (bool): If true creates indra storage provider. Returns: If given a path starting with s3:// returns the S3Provider. 
@@ -56,7 +56,7 @@ def storage_provider_from_path( if creds is None: creds = {} - if v4: + if indra: from deeplake.core.storage.indra import IndraProvider storage: StorageProvider = IndraProvider( @@ -153,7 +153,7 @@ def storage_provider_from_hub_path( db_engine: bool = False, token: Optional[str] = None, creds: Optional[Union[dict, str]] = None, - v4: bool = False, + indra: bool = False, ): path, org_id, ds_name, subdir = process_hub_path(path) client = DeepLakeBackendClient(token=token) @@ -202,7 +202,7 @@ def storage_provider_from_hub_path( read_only=read_only, is_hub_path=True, token=token, - v4=v4, + indra=indra, ) storage.creds_used = creds_used if creds_used == "PLATFORM": @@ -219,7 +219,7 @@ def get_storage_and_cache_chain( memory_cache_size, local_cache_size, db_engine=False, - v4=False, + indra=False, ): """ Returns storage provider and cache chain for a given path, according to arguments passed. @@ -233,7 +233,7 @@ def get_storage_and_cache_chain( memory_cache_size (int): The size of the in-memory cache to use. local_cache_size (int): The size of the local cache to use. db_engine (bool): Whether to use Activeloop DB Engine, only applicable for hub:// paths. - v4 (bool): If true creates v4 storage provider. + indra (bool): If true creates indra storage provider. Returns: A tuple of the storage provider and the storage chain. @@ -245,7 +245,7 @@ def get_storage_and_cache_chain( creds=creds, read_only=read_only, token=token, - v4=v4, + indra=indra, ) memory_cache_size_bytes = memory_cache_size * MB local_cache_size_bytes = local_cache_size * MB From 1bb14b456b43bdf04192086314a0b21889c49aee Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 1 Mar 2024 09:49:53 +0000 Subject: [PATCH 25/62] Indra read only view. --- deeplake/api/dataset.py | 6 +++++ deeplake/core/dataset/indra_dataset_view.py | 29 ++++++++++++++++----- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index 4bf4d99d0c..dd020e4070 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -14,6 +14,7 @@ from deeplake.client.client import DeepLakeBackendClient from deeplake.client.log import logger from deeplake.core.dataset import Dataset, dataset_factory +from deeplake.core.dataset.indra_dataset_view import IndraDatasetView from deeplake.core.tensor import Tensor from deeplake.core.meta.dataset_meta import DatasetMeta from deeplake.util.connect_dataset import connect_dataset_entry @@ -652,6 +653,11 @@ def load( f"A Deep Lake dataset does not exist at the given path ({path}). Check the path provided or in case you want to create a new dataset, use deeplake.empty()." 
) + if indra and read_only: + from indra import api + ids = api.load_from_storage(storage.core) + return IndraDatasetView(indra_ds=ids) + dataset_kwargs: Dict[str, Union[None, str, bool, int, Dict]] = { "path": path, "read_only": read_only, diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 9904d2737e..7f6b3e79fe 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -36,8 +36,8 @@ class IndraDatasetView(Dataset): def __init__( self, - deeplake_ds, indra_ds, + deeplake_ds=None, group_index="", enabled_tensors=None, index: Optional[Index] = None, @@ -55,9 +55,6 @@ def __init__( if deeplake_ds is not None else None ) - d["version_state"] = ( - deeplake_ds.version_state if deeplake_ds is not None else {} - ) d["_index"] = ( index or deeplake_ds.index if deeplake_ds is not None @@ -79,6 +76,22 @@ def meta(self): def path(self): return self.deeplake_ds.path if self.deeplake_ds is not None else "" + @property + def version_state(self) -> Dict: + return self.indra_ds.version_state + + @property + def branches(self): + return self.indra_ds.branches + + @property + def commits(self) -> List[Dict]: + return self.indra_ds.commits + + @property + def commit_id(self) -> str + return self.indra_ds.commit_id + @property def libdeeplake_dataset(self): return self.indra_ds @@ -89,9 +102,11 @@ def merge(self, *args, **kwargs): ) def checkout(self, address: str, create: bool = False): - raise InvalidOperationError( - "checkout", "checkout method cannot be called on a Dataset view." - ) + if create: + raise InvalidOperationError( + "checkout", "Cannot create new branch on Dataset View." + ) + self.indra_ds.checkout(address) def _get_tensor_from_root(self, fullpath): tensors = self.indra_ds.tensors From cb61ed99608ef9dc1ae3a9325245a9401c069b5b Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 1 Mar 2024 10:51:35 +0000 Subject: [PATCH 26/62] Fixed error. --- deeplake/core/dataset/indra_dataset_view.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 7f6b3e79fe..e2eca20af0 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -89,7 +89,7 @@ def commits(self) -> List[Dict]: return self.indra_ds.commits @property - def commit_id(self) -> str + def commit_id(self) -> str: return self.indra_ds.commit_id @property From 6b0f2cf9d3039be18750ed3cd1b5226c656fa563 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 1 Mar 2024 13:03:30 +0000 Subject: [PATCH 27/62] Fixed black. --- deeplake/api/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index dd020e4070..c7a2aaf8c1 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -655,6 +655,7 @@ def load( if indra and read_only: from indra import api + ids = api.load_from_storage(storage.core) return IndraDatasetView(indra_ds=ids) From f9800fd25efd634b3590772747554b1d810bf000 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 1 Mar 2024 13:33:50 +0000 Subject: [PATCH 28/62] Fix mypy. 
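With the branch added in PATCH 25, a read-only load with `indra=True` bypasses regular Dataset construction and returns an IndraDatasetView over `api.load_from_storage(...)`, which now forwards version-control accessors to the native dataset. A hedged sketch (requires the proprietary `indra` package; the path is hypothetical):

```python
# Read-only indra load after PATCH 25: the returned view proxies commit and
# branch state to the native indra dataset.
import deeplake

view = deeplake.load("hub://my_org/some_dataset", read_only=True, indra=True)
print(view.commit_id)   # forwarded to indra_ds.commit_id
print(view.branches)    # forwarded to indra_ds.branches
view.checkout("main")   # allowed; checkout(..., create=True) raises
```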
--- deeplake/api/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index c7a2aaf8c1..cd721cf9fc 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -654,7 +654,7 @@ def load( ) if indra and read_only: - from indra import api + from indra import api # type: ignore ids = api.load_from_storage(storage.core) return IndraDatasetView(indra_ds=ids) From cc8d6f1d3759ffbcf130ee9dc7c17d089028ae2f Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 3 Mar 2024 19:52:45 +0000 Subject: [PATCH 29/62] Fixes. --- deeplake/core/dataset/indra_dataset_view.py | 24 +++++++++++---------- deeplake/core/tests/test_indra_dataset.py | 6 +++--- deeplake/enterprise/libdeeplake_query.py | 4 ++-- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index e2eca20af0..5d73caf0b1 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -48,19 +48,24 @@ def __init__( d["deeplake_ds"] = deeplake_ds d["indra_ds"] = indra_ds d["group_index"] = ( - group_index or deeplake_ds.group_index if deeplake_ds is not None else "" + group_index or deeplake_ds.group_index + if hasattr(deeplake_ds, "group_index") + else "" ) d["enabled_tensors"] = ( enabled_tensors or deeplake_ds.enabled_tensors - if deeplake_ds is not None + if hasattr(deeplake_ds, "enabled_tensors") else None ) d["_index"] = ( index or deeplake_ds.index - if deeplake_ds is not None + if hasattr(deeplake_ds, "index") else Index(item=slice(None)) ) self.__dict__.update(d) + self._view_base = None + self._read_only = True + self._locked_out = False @property def read_only(self): @@ -78,7 +83,10 @@ def path(self): @property def version_state(self) -> Dict: - return self.indra_ds.version_state + try: + return self.indra_ds.version_state + except: + return dict() @property def branches(self): @@ -210,12 +218,6 @@ def __getitem__( ret = self[x] return ret else: - if not is_iteration and isinstance(item, int): - is_iteration = check_if_iteration(self._indexing_history, item) - if is_iteration and SHOW_ITERATION_WARNING: - warnings.warn( - "Indexing by integer in a for loop, like `for i in range(len(ds)): ... ds[i]` can be quite slow. Use `for i, sample in enumerate(ds)` instead." 
- ) ret = IndraDatasetView( deeplake_ds=self.deeplake_ds, indra_ds=self.indra_ds[item], @@ -408,4 +410,4 @@ def random_split(self, lengths: Sequence[Union[int, float]]): lengths = calculate_absolute_lengths(lengths, len(self)) vs = self.indra_ds.random_split(lengths) - return [IndraDatasetView(self.deeplake_ds, v) for v in vs] + return [IndraDatasetView(deeplake_ds=self.deeplake_ds, indra_ds=v) for v in vs] diff --git a/deeplake/core/tests/test_indra_dataset.py b/deeplake/core/tests/test_indra_dataset.py index 0a9196e1f0..c38ac1c5c4 100644 --- a/deeplake/core/tests/test_indra_dataset.py +++ b/deeplake/core/tests/test_indra_dataset.py @@ -244,11 +244,11 @@ def test_sequences_accessing_data(local_auth_ds_generator): assert len(deeplake_indra_ds) == 2 assert deeplake_indra_ds.image.shape == (2, None, None, 10, 3) assert deeplake_indra_ds[0].image.shape == (101, 10, 10, 3) - assert deeplake_indra_ds[0, 0].image.shape == (10, 10, 3) + assert deeplake_indra_ds[0, 0].image.shape == (1, 10, 10, 3) assert len(deeplake_indra_ds[0].image.numpy()) == 101 assert deeplake_indra_ds[1].image.shape == (99, None, 10, 3) - assert deeplake_indra_ds[1, 0].image.shape == (10, 10, 3) - assert deeplake_indra_ds[1, 98].image.shape == (20, 10, 3) + assert deeplake_indra_ds[1, 0].image.shape == (1, 10, 10, 3) + assert deeplake_indra_ds[1, 98].image.shape == (1, 20, 10, 3) assert len(deeplake_indra_ds[1].image.numpy()) == 99 assert deeplake_indra_ds[1].image.numpy()[0].shape == (10, 10, 3) assert deeplake_indra_ds[1].image.numpy()[98].shape == (20, 10, 3) diff --git a/deeplake/enterprise/libdeeplake_query.py b/deeplake/enterprise/libdeeplake_query.py index 57ec98bb94..3efcb50218 100644 --- a/deeplake/enterprise/libdeeplake_query.py +++ b/deeplake/enterprise/libdeeplake_query.py @@ -53,7 +53,7 @@ def query(dataset, query_string: str): indexes = list(dsv.indexes) return dataset.no_view_dataset[indexes] else: - view = IndraDatasetView(deeplake_ds=dataset, indra_ds=dsv) + view = IndraDatasetView(indra_ds=dsv) view._tql_query = query_string if hasattr(dataset, "is_actually_cloud"): view.is_actually_cloud = dataset.is_actually_cloud @@ -158,6 +158,6 @@ def universal_query(query_string: str, token: Optional[str]): api = import_indra_api() dsv = api.tql.query(query_string, token) - view = IndraDatasetView(deeplake_ds=None, indra_ds=dsv) + view = IndraDatasetView(indra_ds=dsv) view._tql_query = query_string return view From 34b7d4b34de0142d4a26db0e68da4e3b15436459 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 3 Mar 2024 20:27:28 +0000 Subject: [PATCH 30/62] Step to get rid of deeplake_ds. 
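The end goal is for IndraDatasetView to stand on the indra dataset alone, so this step stops leaning on the wrapped hub dataset: htype resolution asks indra_tensor.is_link directly, the view carries its own _query_string, and the tests construct views from the indra handle only. A minimal sketch of the intended construction, mirroring the updated tests (deeplake_ds stands for any loaded hub dataset):

    from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake
    from deeplake.core.dataset.indra_dataset_view import IndraDatasetView

    indra_ds = dataset_to_libdeeplake(deeplake_ds)
    view = IndraDatasetView(indra_ds=indra_ds)  # no deeplake_ds argument anymore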
--- deeplake/core/dataset/indra_dataset_view.py | 1 + deeplake/core/dataset/indra_tensor_view.py | 2 +- deeplake/core/tests/test_indra_dataset.py | 12 ++++++------ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 5d73caf0b1..ccd6b65486 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -66,6 +66,7 @@ def __init__( self._view_base = None self._read_only = True self._locked_out = False + self._query_string = None @property def read_only(self): diff --git a/deeplake/core/dataset/indra_tensor_view.py b/deeplake/core/dataset/indra_tensor_view.py index e3f0a939d7..43a7df735a 100644 --- a/deeplake/core/dataset/indra_tensor_view.py +++ b/deeplake/core/dataset/indra_tensor_view.py @@ -103,7 +103,7 @@ def htype(self): htype = self.indra_tensor.htype if self.indra_tensor.is_sequence: htype = f"sequence[{htype}]" - if self.deeplake_tensor.is_link: + if self.indra_tensor.is_link: htype = f"link[{htype}]" return htype diff --git a/deeplake/core/tests/test_indra_dataset.py b/deeplake/core/tests/test_indra_dataset.py index c38ac1c5c4..6b8d8825d9 100644 --- a/deeplake/core/tests/test_indra_dataset.py +++ b/deeplake/core/tests/test_indra_dataset.py @@ -23,7 +23,7 @@ def test_indexing(local_auth_ds_generator): deeplake_ds.label.append(int(100 * random.uniform(0.0, 1.0))) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(indra_ds=indra_ds) assert len(deeplake_indra_ds) == len(indra_ds) @@ -70,7 +70,7 @@ def test_save_view(local_auth_ds_generator): deeplake_ds.commit("First") indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(indra_ds=indra_ds) deeplake_indra_ds.save_view() assert ( deeplake_indra_ds.base_storage["queries.json"] @@ -108,7 +108,7 @@ def test_load_view(local_auth_ds_generator): deeplake_ds.commit("First") indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(indra_ds=indra_ds) with pytest.raises(Exception): dataloader = deeplake_indra_ds.pytorch() @@ -158,7 +158,7 @@ def test_query(local_auth_ds_generator): deeplake_ds.image.append(np.random.randint(0, 255, (100, 200, 3), np.uint8)) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(indra_ds=indra_ds) view = deeplake_indra_ds.query("SELECT * GROUP BY label") assert len(view) == 10 @@ -193,7 +193,7 @@ def test_metadata(local_auth_ds_generator): ) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(indra_ds=indra_ds) assert deeplake_indra_ds.label.htype == "generic" assert deeplake_indra_ds.label.dtype == np.int32 assert deeplake_indra_ds.label.sample_compression == None @@ -219,7 +219,7 @@ def test_accessing_data(local_auth_ds_generator): deeplake_ds.label.append(int(100 * random.uniform(0.0, 1.0))) indra_ds = dataset_to_libdeeplake(deeplake_ds) - deeplake_indra_ds = IndraDatasetView(deeplake_ds=deeplake_ds, indra_ds=indra_ds) + deeplake_indra_ds = IndraDatasetView(indra_ds=indra_ds) assert np.all( 
np.isclose(deeplake_indra_ds.label.numpy(), deeplake_indra_ds["label"].numpy()) From 93da017dbc91c1790a3e4661ecedaade9f3cbff9 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 3 Mar 2024 21:11:37 +0000 Subject: [PATCH 31/62] Last. --- .../core/vectorstore/vector_search/indra/search_algorithm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py index 956aee99c9..e36e411d14 100644 --- a/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py +++ b/deeplake/core/vectorstore/vector_search/indra/search_algorithm.py @@ -105,7 +105,7 @@ class SearchIndra(SearchBasic): def _get_view(self, tql_query, runtime: Optional[Dict] = None): indra_dataset = self._get_indra_dataset() indra_view = indra_dataset.query(tql_query) - view = IndraDatasetView(deeplake_ds=self.deeplake_dataset, indra_ds=indra_view) + view = IndraDatasetView(indra_ds=indra_view) view._tql_query = tql_query return view From d6675b2ffd99ad885b30ea46d011ad3c61579c26 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 4 Mar 2024 09:28:50 +0000 Subject: [PATCH 32/62] Remove deeplake_ds. --- deeplake/core/dataset/indra_dataset_view.py | 76 +++------------------ deeplake/core/dataset/indra_tensor_view.py | 38 ++++------- 2 files changed, 21 insertions(+), 93 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index ccd6b65486..ee71d18892 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -37,31 +37,15 @@ class IndraDatasetView(Dataset): def __init__( self, indra_ds, - deeplake_ds=None, group_index="", enabled_tensors=None, index: Optional[Index] = None, ): - if isinstance(deeplake_ds, IndraDatasetView): - deeplake_ds = deeplake_ds.deeplake_ds d: Dict[str, Any] = {} - d["deeplake_ds"] = deeplake_ds d["indra_ds"] = indra_ds - d["group_index"] = ( - group_index or deeplake_ds.group_index - if hasattr(deeplake_ds, "group_index") - else "" - ) - d["enabled_tensors"] = ( - enabled_tensors or deeplake_ds.enabled_tensors - if hasattr(deeplake_ds, "enabled_tensors") - else None - ) - d["_index"] = ( - index or deeplake_ds.index - if hasattr(deeplake_ds, "index") - else Index(item=slice(None)) - ) + d["group_index"] = "" + d["enabled_tensors"] = None + d["_index"] = Index(item=slice(None)) self.__dict__.update(d) self._view_base = None self._read_only = True @@ -70,17 +54,15 @@ def __init__( @property def read_only(self): - if self.deeplake_ds is not None: - return self.deeplake_ds.read_only return True @property def meta(self): - return self.deeplake_ds.meta if self.deeplake_ds is not None else DatasetMeta() + return DatasetMeta() @property def path(self): - return self.deeplake_ds.path if self.deeplake_ds is not None else "" + return "" @property def version_state(self) -> Dict: @@ -122,12 +104,8 @@ def _get_tensor_from_root(self, fullpath): for tensor in tensors: if tensor.name == fullpath: deeplake_tensor = None - try: - deeplake_tensor = self.deeplake_ds.__getattr__(fullpath) - except: - pass indra_tensor = tensor - return IndraTensorView(deeplake_tensor, indra_tensor, index=self.index) + return IndraTensorView(indra_tensor, index=self.index) def pytorch( self, @@ -173,15 +151,6 @@ def __getitem__( tensor = self._get_tensor_from_root(fullpath) if tensor is not None: return tensor - if self.deeplake_ds is not None and 
self.deeplake_ds._has_group_in_root( - fullpath - ): - ret = IndraDatasetView( - deeplake_ds=self.deeplake_ds, - indra_ds=self.indra_ds, - index=self.index, - group_index=posixpath.join(self.group_index, item), - ) elif "/" in item: splt = posixpath.split(item) ret = self[splt[0]][splt[1]] @@ -208,7 +177,6 @@ def __getitem__( for x in item ] ret = IndraDatasetView( - deeplake_ds=self.deeplake_ds, indra_ds=self.indra_ds, enabled_tensors=enabled_tensors, index=self.index, @@ -220,7 +188,6 @@ def __getitem__( return ret else: ret = IndraDatasetView( - deeplake_ds=self.deeplake_ds, indra_ds=self.indra_ds[item], index=self.index[item], ) @@ -234,13 +201,6 @@ def __getitem__( def __getattr__(self, key): try: return self.__getitem__(key) - except TensorDoesNotExistError as ke: - try: - return getattr(self.deeplake_ds, key) - except AttributeError: - raise AttributeError( - f"'{self.__class__}' object has no attribute '{key}'" - ) from ke except AttributeError: return getattr(self.indra_ds, key) @@ -346,11 +306,6 @@ def sample_indices(self): def _all_tensors_filtered( self, include_hidden: bool = True, include_disabled=True ) -> List[str]: - if self.deeplake_ds is not None: - return self.deeplake_ds._all_tensors_filtered( - include_hidden, include_disabled - ) - indra_tensors = self.indra_ds.tensors return list(t.name for t in indra_tensors) @@ -358,24 +313,9 @@ def _tensors( self, include_hidden: bool = True, include_disabled=True ) -> Dict[str, Tensor]: """All tensors belonging to this group, including those within sub groups. Always returns the sliced tensors.""" - original_tensors = ( - self.deeplake_ds._tensors(include_hidden, include_disabled) - if self.deeplake_ds is not None - else {} - ) indra_tensors = self.indra_ds.tensors - indra_keys = set(t.name for t in indra_tensors) - original_tensors = { - k: v for k, v in original_tensors.items() if k in indra_keys or v.hidden - } - original_keys = set(original_tensors.keys()) for t in indra_tensors: - if t.name in original_keys: - original_tensors[t.name] = IndraTensorView( - original_tensors[t.name], t, index=self.index - ) - else: - original_tensors[t.name] = IndraTensorView(None, t, index=self.index) + original_tensors[t.name] = IndraTensorView(t, index=self.index) return original_tensors def __str__(self): @@ -411,4 +351,4 @@ def random_split(self, lengths: Sequence[Union[int, float]]): lengths = calculate_absolute_lengths(lengths, len(self)) vs = self.indra_ds.random_split(lengths) - return [IndraDatasetView(deeplake_ds=self.deeplake_ds, indra_ds=v) for v in vs] + return [IndraDatasetView(indra_ds=v) for v in vs] diff --git a/deeplake/core/dataset/indra_tensor_view.py b/deeplake/core/dataset/indra_tensor_view.py index 43a7df735a..cf78f49a1f 100644 --- a/deeplake/core/dataset/indra_tensor_view.py +++ b/deeplake/core/dataset/indra_tensor_view.py @@ -14,20 +14,14 @@ class IndraTensorView(tensor.Tensor): def __init__( self, - deeplake_tensor, indra_tensor, index: Optional[Index] = None, is_iteration: bool = False, ): - self.deeplake_tensor = deeplake_tensor self.indra_tensor = indra_tensor self.is_iteration = is_iteration - self.key = ( - deeplake_tensor.key - if hasattr(deeplake_tensor, "key") - else indra_tensor.name - ) + self.key = indra_tensor.name self.first_dim = None @@ -35,14 +29,11 @@ def __init__( def __getattr__(self, key): try: - return getattr(self.deeplake_tensor, key) + return getattr(self.indra_tensor, key) except AttributeError: - try: - return getattr(self.indra_tensor, key) - except AttributeError: - raise AttributeError( - 
f"'{self.__class__}' object has no attribute '{key}'" - ) + raise AttributeError( + f"'{self.__class__}' object has no attribute '{key}'" + ) def __getitem__( self, @@ -58,7 +49,6 @@ def __getitem__( indra_tensor = self.indra_tensor[item] return IndraTensorView( - self.deeplake_tensor, indra_tensor, index=self.index[item], is_iteration=is_iteration, @@ -186,16 +176,14 @@ def ndim(self): @property def meta(self): """Metadata of the tensor.""" - if self.deeplake_tensor is None: - return TensorMeta( - htype=self.indra_tensor.htype, - dtype=self.indra_tensor.dtype, - sample_compression=self.indra_tensor.sample_compression, - chunk_compression=None, - is_sequence=self.indra_tensor.is_sequence, - is_link=False, - ) - return self.deeplake_tensor.chunk_engine.tensor_meta + return TensorMeta( + htype=self.indra_tensor.htype, + dtype=self.indra_tensor.dtype, + sample_compression=self.indra_tensor.sample_compression, + chunk_compression=None, + is_sequence=self.indra_tensor.is_sequence, + is_link=False, + ) @property def base_htype(self): From 80afefdaa8d5495270c21630dfe4cd071d4a9d18 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 4 Mar 2024 09:29:12 +0000 Subject: [PATCH 33/62] Fixed black. --- deeplake/core/dataset/indra_tensor_view.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/deeplake/core/dataset/indra_tensor_view.py b/deeplake/core/dataset/indra_tensor_view.py index cf78f49a1f..4a2c3bf0b2 100644 --- a/deeplake/core/dataset/indra_tensor_view.py +++ b/deeplake/core/dataset/indra_tensor_view.py @@ -31,9 +31,7 @@ def __getattr__(self, key): try: return getattr(self.indra_tensor, key) except AttributeError: - raise AttributeError( - f"'{self.__class__}' object has no attribute '{key}'" - ) + raise AttributeError(f"'{self.__class__}' object has no attribute '{key}'") def __getitem__( self, From f262dbd9a5d2f8b403cba45ec6ec3bd8e7494682 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 4 Mar 2024 14:41:13 +0000 Subject: [PATCH 34/62] Fixed failures. 
--- deeplake/core/dataset/indra_dataset_view.py | 33 ++++++++++++------- deeplake/enterprise/convert_to_libdeeplake.py | 9 +++-- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index ee71d18892..17d32caf79 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -48,13 +48,22 @@ def __init__( d["_index"] = Index(item=slice(None)) self.__dict__.update(d) self._view_base = None + self._view_entry = None self._read_only = True self._locked_out = False self._query_string = None + try: + from deeplake.core.storage.indra import IndraProvider + self.storage = IndraProvider(indra_ds.storage) + except: + pass @property def read_only(self): - return True + try: + return self.indra_ds.storage.read_only + except: + return True @property def meta(self): @@ -153,7 +162,7 @@ def __getitem__( return tensor elif "/" in item: splt = posixpath.split(item) - ret = self[splt[0]][splt[1]] + return self[splt[0]][splt[1]] else: raise TensorDoesNotExistError(item) elif isinstance(item, (int, slice, list, tuple, Index, type(Ellipsis))): @@ -176,7 +185,7 @@ def __getitem__( ) for x in item ] - ret = IndraDatasetView( + return IndraDatasetView( indra_ds=self.indra_ds, enabled_tensors=enabled_tensors, index=self.index, @@ -187,22 +196,21 @@ def __getitem__( ret = self[x] return ret else: - ret = IndraDatasetView( + return IndraDatasetView( indra_ds=self.indra_ds[item], index=self.index[item], ) else: raise InvalidKeyTypeError(item) - if hasattr(self, "_view_entry"): - ret._view_entry = self._view_entry - return ret - def __getattr__(self, key): try: - return self.__getitem__(key) + ret = self.__getitem__(key) except AttributeError: - return getattr(self.indra_ds, key) + ret = getattr(self.indra_ds, key) + if ret is not None: + ret._view_entry = self._view_entry + return ret def __len__(self): return len(self.indra_ds) @@ -314,9 +322,10 @@ def _tensors( ) -> Dict[str, Tensor]: """All tensors belonging to this group, including those within sub groups. 
Always returns the sliced tensors.""" indra_tensors = self.indra_ds.tensors + ret = {} for t in indra_tensors: - original_tensors[t.name] = IndraTensorView(t, index=self.index) - return original_tensors + ret[t.name] = IndraTensorView(t, index=self.index) + return ret def __str__(self): path_str = "" diff --git a/deeplake/enterprise/convert_to_libdeeplake.py b/deeplake/enterprise/convert_to_libdeeplake.py index 127a270603..9b9d8ef04d 100644 --- a/deeplake/enterprise/convert_to_libdeeplake.py +++ b/deeplake/enterprise/convert_to_libdeeplake.py @@ -156,6 +156,7 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): """Convert a hub 2.x dataset object to a libdeeplake dataset object.""" try_flushing(hub2_dataset) api = import_indra_api() + from deeplake.core.storage.indra import IndraProvider path: str = hub2_dataset.path token = ( @@ -166,7 +167,11 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): ) if token is None or token == "": raise EmptyTokenException - if hub2_dataset.libdeeplake_dataset is None: + if hub2_dataset.libdeeplake_dataset is not None: + libdeeplake_dataset = hub2_dataset.libdeeplake_dataset + elif isinstance(hub2_dataset.storage.next_storage, IndraProvider): + libdeeplake_dataset = api.load_from_storage(hub2_dataset.storage.next_storage.core) + else: libdeeplake_dataset = None if path.startswith("gdrive://"): raise ValueError("Gdrive datasets are not supported for libdeeplake") @@ -221,8 +226,6 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): libdeeplake_dataset = api.dataset(path, token=token, org_id=org_id) hub2_dataset.libdeeplake_dataset = libdeeplake_dataset - else: - libdeeplake_dataset = hub2_dataset.libdeeplake_dataset assert libdeeplake_dataset is not None libdeeplake_dataset._max_cache_size = max( From c4e42ff5c2ce0c4cdf77e9b153b3f9381412a448 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 4 Mar 2024 16:04:14 +0000 Subject: [PATCH 35/62] Further adaptations. --- deeplake/core/dataset/dataset.py | 2 +- deeplake/core/dataset/indra_dataset_view.py | 14 ++++++-- deeplake/core/storage/indra.py | 39 ++++++++++++++++++++- 3 files changed, 50 insertions(+), 5 deletions(-) diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py index 5f975769d6..be98240020 100644 --- a/deeplake/core/dataset/dataset.py +++ b/deeplake/core/dataset/dataset.py @@ -3784,7 +3784,7 @@ def _save_view( ) from e else: raise ReadOnlyModeError( - "Cannot save view in read only dataset. Speicify a path to save the view in a different location." + "Cannot save view in read only dataset. Specify a path to save the view in a different location." 
) else: vds = self._save_view_in_subdir( diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 17d32caf79..77d9fc3277 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -45,6 +45,7 @@ def __init__( d["indra_ds"] = indra_ds d["group_index"] = "" d["enabled_tensors"] = None + d["verbose"] = False d["_index"] = Index(item=slice(None)) self.__dict__.update(d) self._view_base = None @@ -76,7 +77,10 @@ def path(self): @property def version_state(self) -> Dict: try: - return self.indra_ds.version_state + state = self.indra_ds.version_state + for k, v in state['full_tensors'].items(): + state['full_tensors'][k] = IndraTensorView(v, index=self.index) + return state except: return dict() @@ -202,14 +206,14 @@ def __getitem__( ) else: raise InvalidKeyTypeError(item) + raise AttributeError("Dataset has no attribute - {item}") def __getattr__(self, key): try: ret = self.__getitem__(key) except AttributeError: ret = getattr(self.indra_ds, key) - if ret is not None: - ret._view_entry = self._view_entry + ret._view_entry = self._view_entry return ret def __len__(self): @@ -298,6 +302,10 @@ def dataloader(self, ignore_errors: bool = False, verbose: bool = False): def no_view_dataset(self): return self + @property + def base_storage(self): + return self.storage + @property def index(self): return self._index diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 0dc415b5d4..bcb2bafde8 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -1,6 +1,8 @@ from deeplake.core.storage.provider import StorageProvider +from deeplake.core.partial_reader import PartialReader +from deeplake.core.storage.deeplake_memory_object import DeepLakeMemoryObject from indra.api import storage # type: ignore -from typing import Optional, Union +from typing import Optional, Union, Dict class IndraProvider(StorageProvider): @@ -44,6 +46,41 @@ def get_bytes( except RuntimeError as e: raise KeyError(path) + def get_deeplake_object( + self, + path: str, + expected_class, + meta: Optional[Dict] = None, + url=False, + partial_bytes: int = 0, + ): + if partial_bytes != 0: + assert issubclass(expected_class, BaseChunk) + if path in self.lru_sizes: + return self[path] + buff = self.get_bytes(path, 0, partial_bytes) + obj = expected_class.frombuffer(buff, meta, partial=True) + obj.data_bytes = PartialReader(self, path, header_offset=obj.header_bytes) + return obj + + item = self[path] + if isinstance(item, DeepLakeMemoryObject): + if type(item) != expected_class: + raise ValueError( + f"'{path}' was expected to have the class '{expected_class.__name__}'. Instead, got: '{type(item)}'." + ) + return item + + if isinstance(item, (bytes, memoryview)): + obj = ( + expected_class.frombuffer(item) + if meta is None + else expected_class.frombuffer(item, meta) + ) + return obj + + raise ValueError(f"Item at '{path}' got an invalid type: '{type(item)}'.") + def get_object_size(self, path: str) -> int: return self.core.length(path) From 390ee9e10ec040e5eb30991c3134956b611c8a3f Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 4 Mar 2024 16:06:47 +0000 Subject: [PATCH 36/62] Fixed linter. 
--- deeplake/core/dataset/indra_dataset_view.py | 7 ++++--- deeplake/enterprise/convert_to_libdeeplake.py | 5 ++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 77d9fc3277..69ee789679 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -55,6 +55,7 @@ def __init__( self._query_string = None try: from deeplake.core.storage.indra import IndraProvider + self.storage = IndraProvider(indra_ds.storage) except: pass @@ -78,8 +79,8 @@ def path(self): def version_state(self) -> Dict: try: state = self.indra_ds.version_state - for k, v in state['full_tensors'].items(): - state['full_tensors'][k] = IndraTensorView(v, index=self.index) + for k, v in state["full_tensors"].items(): + state["full_tensors"][k] = IndraTensorView(v, index=self.index) return state except: return dict() @@ -327,7 +328,7 @@ def _all_tensors_filtered( def _tensors( self, include_hidden: bool = True, include_disabled=True - ) -> Dict[str, Tensor]: + ) -> Dict[str, IndraTensorView]: """All tensors belonging to this group, including those within sub groups. Always returns the sliced tensors.""" indra_tensors = self.indra_ds.tensors ret = {} diff --git a/deeplake/enterprise/convert_to_libdeeplake.py b/deeplake/enterprise/convert_to_libdeeplake.py index 9b9d8ef04d..2902a85194 100644 --- a/deeplake/enterprise/convert_to_libdeeplake.py +++ b/deeplake/enterprise/convert_to_libdeeplake.py @@ -157,6 +157,7 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): try_flushing(hub2_dataset) api = import_indra_api() from deeplake.core.storage.indra import IndraProvider + path: str = hub2_dataset.path token = ( @@ -170,7 +171,9 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): if hub2_dataset.libdeeplake_dataset is not None: libdeeplake_dataset = hub2_dataset.libdeeplake_dataset elif isinstance(hub2_dataset.storage.next_storage, IndraProvider): - libdeeplake_dataset = api.load_from_storage(hub2_dataset.storage.next_storage.core) + libdeeplake_dataset = api.load_from_storage( + hub2_dataset.storage.next_storage.core + ) else: libdeeplake_dataset = None if path.startswith("gdrive://"): From c70a940f6457fd80cb6139f41058b3a523bec348 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 4 Mar 2024 16:50:58 +0000 Subject: [PATCH 37/62] Fix mypy. --- deeplake/core/storage/indra.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index bcb2bafde8..86150d8fb5 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -55,9 +55,6 @@ def get_deeplake_object( partial_bytes: int = 0, ): if partial_bytes != 0: - assert issubclass(expected_class, BaseChunk) - if path in self.lru_sizes: - return self[path] buff = self.get_bytes(path, 0, partial_bytes) obj = expected_class.frombuffer(buff, meta, partial=True) obj.data_bytes = PartialReader(self, path, header_offset=obj.header_bytes) From 46aa379ea19b38c946773e2828091239d917c634 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Tue, 5 Mar 2024 11:42:33 +0000 Subject: [PATCH 38/62] More. 
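This round threads an indra flag through deeplake.dataset()/init (default taken from USE_INDRA), rebuilds Index objects on demand from indra_ds.indexes instead of caching one on the view, teaches IndexEntry.subscriptable about indra's IndexMappingInt64 mapping type, and gives IndraProvider a path/original_path pair. A sketch of the new entry point, assuming indra is installed and the path is readable:

    import deeplake

    ds = deeplake.dataset("s3://bucket/ds", read_only=True, indra=True)
    ds.index  # derived from the underlying indra indexes on each access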
--- deeplake/api/dataset.py | 3 +++ deeplake/core/dataset/indra_dataset_view.py | 17 ++++++++--------- deeplake/core/dataset/indra_tensor_view.py | 10 +--------- deeplake/core/index/index.py | 3 +++ deeplake/core/storage/indra.py | 5 +++++ deeplake/core/tests/test_indra_dataset.py | 4 ++-- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index cd721cf9fc..ae44dec928 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -104,6 +104,7 @@ def init( lock_enabled: Optional[bool] = True, lock_timeout: Optional[int] = 0, index_params: Optional[Dict[str, Union[int, str]]] = None, + indra: bool = USE_INDRA, ): """Returns a :class:`~deeplake.core.dataset.Dataset` object referencing either a new or existing dataset. @@ -176,6 +177,7 @@ def init( lock_timeout (int): Number of seconds to wait before throwing a LockException. If None, wait indefinitely lock_enabled (bool): If true, the dataset manages a write lock. NOTE: Only set to False if you are managing concurrent access externally index_params: Optional[Dict[str, Union[int, str]]] = None : The index parameters used while creating vector store is passed down to dataset. + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false .. # noqa: DAR101 @@ -228,6 +230,7 @@ def init( token=token, memory_cache_size=memory_cache_size, local_cache_size=local_cache_size, + indra=indra, ) feature_report_path(path, "dataset", {"Overwrite": overwrite}, token=token) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 69ee789679..dc67f76368 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -39,14 +39,12 @@ def __init__( indra_ds, group_index="", enabled_tensors=None, - index: Optional[Index] = None, ): d: Dict[str, Any] = {} d["indra_ds"] = indra_ds d["group_index"] = "" d["enabled_tensors"] = None d["verbose"] = False - d["_index"] = Index(item=slice(None)) self.__dict__.update(d) self._view_base = None self._view_entry = None @@ -73,14 +71,17 @@ def meta(self): @property def path(self): - return "" + try: + return self.storage.original_path + except: + return "" @property def version_state(self) -> Dict: try: state = self.indra_ds.version_state for k, v in state["full_tensors"].items(): - state["full_tensors"][k] = IndraTensorView(v, index=self.index) + state["full_tensors"][k] = IndraTensorView(v) return state except: return dict() @@ -119,7 +120,7 @@ def _get_tensor_from_root(self, fullpath): if tensor.name == fullpath: deeplake_tensor = None indra_tensor = tensor - return IndraTensorView(indra_tensor, index=self.index) + return IndraTensorView(indra_tensor) def pytorch( self, @@ -193,7 +194,6 @@ def __getitem__( return IndraDatasetView( indra_ds=self.indra_ds, enabled_tensors=enabled_tensors, - index=self.index, ) elif isinstance(item, tuple) and len(item) and isinstance(item[0], str): ret = self @@ -203,7 +203,6 @@ def __getitem__( else: return IndraDatasetView( indra_ds=self.indra_ds[item], - index=self.index[item], ) else: raise InvalidKeyTypeError(item) @@ -309,7 +308,7 @@ def base_storage(self): @property def index(self): - return self._index + return Index(self.indra_ds.indexes) @property def sample_indices(self): @@ -333,7 +332,7 @@ def _tensors( indra_tensors = self.indra_ds.tensors ret = {} for t in indra_tensors: - ret[t.name] = IndraTensorView(t, index=self.index) + ret[t.name] = IndraTensorView(t) return ret 
def __str__(self): diff --git a/deeplake/core/dataset/indra_tensor_view.py b/deeplake/core/dataset/indra_tensor_view.py index 4a2c3bf0b2..fea0f805a0 100644 --- a/deeplake/core/dataset/indra_tensor_view.py +++ b/deeplake/core/dataset/indra_tensor_view.py @@ -15,7 +15,6 @@ class IndraTensorView(tensor.Tensor): def __init__( self, indra_tensor, - index: Optional[Index] = None, is_iteration: bool = False, ): self.indra_tensor = indra_tensor @@ -25,8 +24,6 @@ def __init__( self.first_dim = None - self._index = index or Index(self.indra_tensor.index) - def __getattr__(self, key): try: return getattr(self.indra_tensor, key) @@ -44,11 +41,8 @@ def __getitem__( if isinstance(item, tuple) or item is Ellipsis: item = replace_ellipsis_with_slices(item, self.ndim) - indra_tensor = self.indra_tensor[item] - return IndraTensorView( - indra_tensor, - index=self.index[item], + self.indra_tensor[item], is_iteration=is_iteration, ) @@ -150,8 +144,6 @@ def shape(self): @property def index(self): - if self._index is not None: - return self._index return Index(self.indra_tensor.indexes) @property diff --git a/deeplake/core/index/index.py b/deeplake/core/index/index.py index 688241a045..2909aec276 100644 --- a/deeplake/core/index/index.py +++ b/deeplake/core/index/index.py @@ -198,6 +198,9 @@ def __getitem__(self, item: IndexValue): def subscriptable(self): """Returns whether an IndexEntry can be further subscripted.""" + from indra import api + if isinstance(self.value, api.core.IndexMappingInt64): + return self.value.subscriptable() return not isinstance(self.value, int) def indices(self, length: int): diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index 86150d8fb5..b4a8d4c01b 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -18,11 +18,16 @@ def __init__( self.core = storage.create(root, read_only, **kwargs) else: self.core = root + self.root = self.path @property def path(self): return self.core.path + @property + def original_path(self): + return self.core.original_path + def copy(self): return IndraProvider(self.core) diff --git a/deeplake/core/tests/test_indra_dataset.py b/deeplake/core/tests/test_indra_dataset.py index 6b8d8825d9..912b9d3200 100644 --- a/deeplake/core/tests/test_indra_dataset.py +++ b/deeplake/core/tests/test_indra_dataset.py @@ -116,7 +116,7 @@ def test_load_view(local_auth_ds_generator): query_str = "select * group by label" view = deeplake_ds.query(query_str) view_path = view.save_view() - view_id = view_path.split("/")[-1] + view_id = view_path.split("/")[-2] view = deeplake_ds.load_view(view_id) dataloader = view[:3].dataloader().pytorch() @@ -130,7 +130,7 @@ def test_load_view(local_auth_ds_generator): view = deeplake_ds[0:50].query(query_str) view_path = view.save_view() - view_id = view_path.split("/")[-1] + view_id = view_path.split("/")[-2] view = deeplake_ds.load_view(view_id) dataloader = view[:3].dataloader().pytorch() From a997e1c5a6d09a216dbbcd0029a651fa7f85727d Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Tue, 5 Mar 2024 12:10:08 +0000 Subject: [PATCH 39/62] Fixed linter. 
--- deeplake/core/index/index.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deeplake/core/index/index.py b/deeplake/core/index/index.py index 2909aec276..d39e3b6897 100644 --- a/deeplake/core/index/index.py +++ b/deeplake/core/index/index.py @@ -198,7 +198,8 @@ def __getitem__(self, item: IndexValue): def subscriptable(self): """Returns whether an IndexEntry can be further subscripted.""" - from indra import api + from indra import api # type: ignore + if isinstance(self.value, api.core.IndexMappingInt64): return self.value.subscriptable() return not isinstance(self.value, int) From 598d7d088843c4fd22b98945f9e062bf81a80767 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 7 Mar 2024 08:06:53 +0000 Subject: [PATCH 40/62] Bump libdeeplake version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9e706957bd..3386974341 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def libdeeplake_available(): extras_require["all"] = [req_map[r] for r in all_extras] if libdeeplake_available(): - libdeeplake = "libdeeplake==0.0.104" + libdeeplake = "libdeeplake==0.0.105" extras_require["enterprise"] = [libdeeplake, "pyjwt"] extras_require["all"].append(libdeeplake) install_requires.append(libdeeplake) From a09bc4f380601907819eeeacd1806adc8a4b4a42 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 7 Mar 2024 11:49:42 +0000 Subject: [PATCH 41/62] Final fixes for indra adaptors. --- deeplake/core/dataset/indra_dataset_view.py | 11 ++---- deeplake/core/dataset/indra_tensor_view.py | 2 + deeplake/enterprise/convert_to_libdeeplake.py | 39 ++++++++++++------- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index dc67f76368..5eb2cfc159 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -55,16 +55,10 @@ def __init__( from deeplake.core.storage.indra import IndraProvider self.storage = IndraProvider(indra_ds.storage) + self._read_only = self.storage.read_only except: pass - @property - def read_only(self): - try: - return self.indra_ds.storage.read_only - except: - return True - @property def meta(self): return DatasetMeta() @@ -114,6 +108,9 @@ def checkout(self, address: str, create: bool = False): ) self.indra_ds.checkout(address) + def flush(self): + pass + def _get_tensor_from_root(self, fullpath): tensors = self.indra_ds.tensors for tensor in tensors: diff --git a/deeplake/core/dataset/indra_tensor_view.py b/deeplake/core/dataset/indra_tensor_view.py index fea0f805a0..530f3aa603 100644 --- a/deeplake/core/dataset/indra_tensor_view.py +++ b/deeplake/core/dataset/indra_tensor_view.py @@ -54,6 +54,8 @@ def numpy( return r else: try: + if self.index.values[0].subscriptable(): + r = r[0] return np.array(r) except ValueError: raise DynamicTensorNumpyError(self.name, self.index, "shape") diff --git a/deeplake/enterprise/convert_to_libdeeplake.py b/deeplake/enterprise/convert_to_libdeeplake.py index 2902a85194..5d1c4f94f1 100644 --- a/deeplake/enterprise/convert_to_libdeeplake.py +++ b/deeplake/enterprise/convert_to_libdeeplake.py @@ -65,15 +65,16 @@ def _get_indra_ds_from_azure_provider( sas_token = provider.get_sas_token() expiration = str(provider.expiration) if provider.expiration else None - return api.dataset( + storage = IndraProvider( path, - origin_path=provider.root, + read_only=provider.read_only, token=token, 
account_name=account_name, account_key=account_key, sas_token=sas_token, expiration=expiration, ) + return _get_indra_ds_from_native_provider(storage) def _get_indra_ds_from_gcp_provider( @@ -94,10 +95,11 @@ def _get_indra_ds_from_gcp_provider( scheme = creds.get("scheme", "") retry_limit_seconds = creds.get("retry_limit_seconds", "") - return api.dataset( + storage = IndraProvider( path, - origin_path=provider.root, + read_only=provider.read_only, token=token, + origin_path=provider.root, anon=anon, expiration=expiration, access_token=access_token, @@ -106,6 +108,7 @@ def _get_indra_ds_from_gcp_provider( scheme=scheme, retry_limit_seconds=retry_limit_seconds, ) + return _get_indra_ds_from_native_provider(storage) def _get_indra_ds_from_s3_provider( @@ -121,10 +124,11 @@ def _get_indra_ds_from_s3_provider( creds_used = provider.creds_used if creds_used == "PLATFORM": provider._check_update_creds() - return api.dataset( + storage = IndraProvider( path, - origin_path=provider.root, + read_only=provider.read_only, token=token, + origin_path=provider.root, aws_access_key_id=provider.aws_access_key_id, aws_secret_access_key=provider.aws_secret_access_key, aws_session_token=provider.aws_session_token, @@ -132,31 +136,35 @@ def _get_indra_ds_from_s3_provider( endpoint_url=provider.endpoint_url, expiration=str(provider.expiration), ) + return _get_indra_ds_from_native_provider(storage) elif creds_used == "ENV": - return api.dataset( + storage = IndraProvider( path, - origin_path=provider.root, + read_only=provider.read_only, token=token, + origin_path=provider.root, profile_name=provider.profile_name, ) + return _get_indra_ds_from_native_provider(storage) elif creds_used == "DICT": - return api.dataset( + storage = IndraProvider( path, - origin_path=provider.root, + read_only=provider.read_only, token=token, + origin_path=provider.root, aws_access_key_id=provider.aws_access_key_id, aws_secret_access_key=provider.aws_secret_access_key, aws_session_token=provider.aws_session_token, region_name=provider.aws_region, endpoint_url=provider.endpoint_url, ) + return _get_indra_ds_from_native_provider(storage) def dataset_to_libdeeplake(hub2_dataset: Dataset): """Convert a hub 2.x dataset object to a libdeeplake dataset object.""" try_flushing(hub2_dataset) api = import_indra_api() - from deeplake.core.storage.indra import IndraProvider path: str = hub2_dataset.path @@ -171,9 +179,7 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): if hub2_dataset.libdeeplake_dataset is not None: libdeeplake_dataset = hub2_dataset.libdeeplake_dataset elif isinstance(hub2_dataset.storage.next_storage, IndraProvider): - libdeeplake_dataset = api.load_from_storage( - hub2_dataset.storage.next_storage.core - ) + libdeeplake_dataset = api.dataset(hub2_dataset.storage.next_storage.core) else: libdeeplake_dataset = None if path.startswith("gdrive://"): @@ -226,7 +232,10 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): org_id = ( org_id or jwt.decode(token, options={"verify_signature": False})["id"] ) - libdeeplake_dataset = api.dataset(path, token=token, org_id=org_id) + storage = IndraProvider( + path, read_only=hub2_dataset.read_only, token=token, org_id=org_id + ) + libdeeplake_dataset = api.dataset(storage.core) hub2_dataset.libdeeplake_dataset = libdeeplake_dataset From f4512af253027d57cf8d8b9af5d4834353417ba0 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 7 Mar 2024 21:36:07 +0000 Subject: [PATCH 42/62] token and cache_size errors fixed. 
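Two fixes from running against real storage: the view exposes the token of its IndraProvider, so code that reads ds.token keeps working, and the cache-size propagation in dataset_to_libdeeplake is guarded with hasattr because IndraProvider-backed storages have no cache_size attribute.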
--- deeplake/core/dataset/indra_dataset_view.py | 1 + deeplake/core/storage/indra.py | 4 ++++ deeplake/enterprise/convert_to_libdeeplake.py | 7 ++++--- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 5eb2cfc159..3d61a68838 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -56,6 +56,7 @@ def __init__( self.storage = IndraProvider(indra_ds.storage) self._read_only = self.storage.read_only + self._token = self.storage.token except: pass diff --git a/deeplake/core/storage/indra.py b/deeplake/core/storage/indra.py index b4a8d4c01b..c692fc880c 100644 --- a/deeplake/core/storage/indra.py +++ b/deeplake/core/storage/indra.py @@ -28,6 +28,10 @@ def path(self): def original_path(self): return self.core.original_path + @property + def token(self): + return self.core.token + def copy(self): return IndraProvider(self.core) diff --git a/deeplake/enterprise/convert_to_libdeeplake.py b/deeplake/enterprise/convert_to_libdeeplake.py index 5d1c4f94f1..be062dc458 100644 --- a/deeplake/enterprise/convert_to_libdeeplake.py +++ b/deeplake/enterprise/convert_to_libdeeplake.py @@ -240,9 +240,10 @@ def dataset_to_libdeeplake(hub2_dataset: Dataset): hub2_dataset.libdeeplake_dataset = libdeeplake_dataset assert libdeeplake_dataset is not None - libdeeplake_dataset._max_cache_size = max( - hub2_dataset.storage.cache_size, libdeeplake_dataset._max_cache_size - ) + if hasattr(hub2_dataset.storage, "cache_size"): + libdeeplake_dataset._max_cache_size = max( + hub2_dataset.storage.cache_size, libdeeplake_dataset._max_cache_size + ) commit_id = hub2_dataset.pending_commit_id libdeeplake_dataset.checkout(commit_id) slice_ = hub2_dataset.index.values[0].value From fd5287cb93cf3ff6d95a4a289af2735197202487 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Fri, 8 Mar 2024 20:14:30 +0000 Subject: [PATCH 43/62] Bump libdeeplake version. --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3386974341..c563602480 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,7 @@ def libdeeplake_available(): extras_require["all"] = [req_map[r] for r in all_extras] if libdeeplake_available(): - libdeeplake = "libdeeplake==0.0.105" + libdeeplake = "libdeeplake==0.0.106" extras_require["enterprise"] = [libdeeplake, "pyjwt"] extras_require["all"].append(libdeeplake) install_requires.append(libdeeplake) From ec92f04f4068ed8f03823c2ee73d9ed5baba9562 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Mon, 11 Mar 2024 22:28:29 +0000 Subject: [PATCH 44/62] Handle index in non-linear views. 
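Non-linear views, such as the result of a GROUP BY query, may not expose row-level indexes, so Index(self.indra_ds.indexes) can raise; both the dataset view and the tensor view now fall back to a plain slice over their own length. Illustratively, reusing the query form from the tests:

    view = deeplake_indra_ds.query("SELECT * GROUP BY label")
    view.index  # falls back to Index(slice(0, len(view))) when indexes are unavailable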
--- deeplake/core/dataset/indra_dataset_view.py | 5 ++++- deeplake/core/dataset/indra_tensor_view.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py index 3d61a68838..734e627d88 100644 --- a/deeplake/core/dataset/indra_dataset_view.py +++ b/deeplake/core/dataset/indra_dataset_view.py @@ -306,7 +306,10 @@ def base_storage(self): @property def index(self): - return Index(self.indra_ds.indexes) + try: + return Index(self.indra_ds.indexes) + except: + return Index(slice(0, len(self))) @property def sample_indices(self): diff --git a/deeplake/core/dataset/indra_tensor_view.py b/deeplake/core/dataset/indra_tensor_view.py index 530f3aa603..63e748f1ff 100644 --- a/deeplake/core/dataset/indra_tensor_view.py +++ b/deeplake/core/dataset/indra_tensor_view.py @@ -146,7 +146,10 @@ def shape(self): @property def index(self): - return Index(self.indra_tensor.indexes) + try: + return Index(self.indra_tensor.indexes) + except: + return Index(slice(0, len(self))) @property def shape_interval(self): From 251d05742473c5251b31bb6ce5885871fff88aa8 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 17 Mar 2024 08:52:16 +0000 Subject: [PATCH 45/62] Prepare indra materialization usage. --- deeplake/core/dataset/dataset.py | 57 +++++++++++++++++++++-------- deeplake/core/dataset/view_entry.py | 2 - 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py index be98240020..18fcb9a19e 100644 --- a/deeplake/core/dataset/dataset.py +++ b/deeplake/core/dataset/dataset.py @@ -4588,6 +4588,39 @@ def visualize( def __contains__(self, tensor: str): return tensor in self.tensors + def _optimize_and_copy_view( + self, + info, + path: str, + new_path: str, + tensors: Optional[List[str]] = None, + external=False, + unlink=True, + num_workers=0, + scheduler="threaded", + progressbar=True, + ): + tql_query = info.get("tql_query") + if tql_query is not None: + raise Exception("Optimizing nonlinear query views is not supported") + + vds = self._sub_ds(".queries/" + path, verbose=False) + view = vds._get_view(not external) + new_path = path + "_OPTIMIZED" + optimized = self._sub_ds(".queries/" + new_path, empty=True, verbose=False) + view._copy( + optimized, + tensors=tensors, + overwrite=True, + unlink=unlink, + create_vds_index_tensor=True, + num_workers=num_workers, + scheduler=scheduler, + progressbar=progressbar, + ) + optimized.info.update(vds.info.__getstate__()) + return (vds, optimized) + def _optimize_saved_view( self, id: str, @@ -4614,32 +4647,26 @@ def _optimize_saved_view( # Already optimized return info path = info.get("path", info["id"]) - vds = self._sub_ds(".queries/" + path, verbose=False) - view = vds._get_view(not external) new_path = path + "_OPTIMIZED" - optimized = self._sub_ds( - ".queries/" + new_path, empty=True, verbose=False - ) - view._copy( - optimized, + old, new = self._optimize_and_copy_view( + info, + path, + new_path, tensors=tensors, - overwrite=True, unlink=unlink, - create_vds_index_tensor=True, num_workers=num_workers, scheduler=scheduler, progressbar=progressbar, ) - optimized.info.update(vds.info.__getstate__()) - optimized.info["virtual-datasource"] = False - optimized.info["path"] = new_path - optimized.flush() + new.info["virtual-datasource"] = False + new.info["path"] = new_path + new.flush() info["virtual-datasource"] = False info["path"] = new_path self._write_queries_json(qjson) 
- vds.base_storage.disable_readonly() + old.base_storage.disable_readonly() try: - vds.base_storage.clear() + old.base_storage.clear() except Exception as e: warnings.warn( f"Error while deleting old view after writing optimized version: {e}" diff --git a/deeplake/core/dataset/view_entry.py b/deeplake/core/dataset/view_entry.py index 4b65eb4c50..831d99b97b 100644 --- a/deeplake/core/dataset/view_entry.py +++ b/deeplake/core/dataset/view_entry.py @@ -120,8 +120,6 @@ def optimize( Exception: When query view cannot be optimized. """ - if not self.tql_query is None: - raise Exception("Optimizing nonlinear query views is not supported") self.info = self._ds._optimize_saved_view( self.info["id"], tensors=tensors, From bb5e8334cc83d8b78b916a2334a8153588c05974 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 17 Mar 2024 11:50:03 +0000 Subject: [PATCH 46/62] Materialize indra view. --- deeplake/core/dataset/dataset.py | 30 +++++++++++++++-------------- deeplake/core/dataset/view_entry.py | 2 +- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py index 18fcb9a19e..fb907d50fb 100644 --- a/deeplake/core/dataset/dataset.py +++ b/deeplake/core/dataset/dataset.py @@ -4601,23 +4601,25 @@ def _optimize_and_copy_view( progressbar=True, ): tql_query = info.get("tql_query") - if tql_query is not None: - raise Exception("Optimizing nonlinear query views is not supported") - vds = self._sub_ds(".queries/" + path, verbose=False) view = vds._get_view(not external) new_path = path + "_OPTIMIZED" - optimized = self._sub_ds(".queries/" + new_path, empty=True, verbose=False) - view._copy( - optimized, - tensors=tensors, - overwrite=True, - unlink=unlink, - create_vds_index_tensor=True, - num_workers=num_workers, - scheduler=scheduler, - progressbar=progressbar, - ) + if tql_query is not None: + view = view.query(tql_query) + view.indra_ds.materialize(new_path, tensors, True) + optimized = self._sub_ds(".queries/" + new_path, empty=False, verbose=False) + else: + optimized = self._sub_ds(".queries/" + new_path, empty=True, verbose=False) + view._copy( + optimized, + tensors=tensors, + overwrite=True, + unlink=unlink, + create_vds_index_tensor=True, + num_workers=num_workers, + scheduler=scheduler, + progressbar=progressbar, + ) optimized.info.update(vds.info.__getstate__()) return (vds, optimized) diff --git a/deeplake/core/dataset/view_entry.py b/deeplake/core/dataset/view_entry.py index 831d99b97b..24b94e14ce 100644 --- a/deeplake/core/dataset/view_entry.py +++ b/deeplake/core/dataset/view_entry.py @@ -74,7 +74,7 @@ def load(self, verbose=True): if self.virtual: ds = ds._get_view(inherit_creds=not self._external) - if not self.tql_query is None: + if self.virtual and not self.tql_query is None: query_str = self.tql_query ds = ds.query(query_str) From cae1c23a0f1dc8ea178126555357285f931657f1 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 17 Mar 2024 13:09:51 +0000 Subject: [PATCH 47/62] Added indra view load test with optimize=True --- deeplake/core/tests/test_indra_dataset.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/deeplake/core/tests/test_indra_dataset.py b/deeplake/core/tests/test_indra_dataset.py index 912b9d3200..361a956c64 100644 --- a/deeplake/core/tests/test_indra_dataset.py +++ b/deeplake/core/tests/test_indra_dataset.py @@ -142,6 +142,19 @@ def test_load_view(local_auth_ds_generator): assert iss == [0, 1, 2] assert np.all(indra_ds.image.numpy() == 
deeplake_indra_ds.image.numpy()) + query_str = "select label where label > 0" + view = deeplake_ds.query(query_str) + view_path = view.save_view() + view_id = view_path.split("/")[-2] + view = deeplake_ds.load_view(view_id, optimize=True) + + + dataloader = view.dataloader().pytorch() + count = 0 + for i, batch in enumerate(dataloader): + assert batch["label"][0] > 0 + count += 1 + assert count == 90 @requires_libdeeplake def test_query(local_auth_ds_generator): From 74965ddc6eed83b3b86dd2bc8b51210f06bf5eb4 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 17 Mar 2024 13:14:43 +0000 Subject: [PATCH 48/62] Fixed black. --- deeplake/core/tests/test_indra_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deeplake/core/tests/test_indra_dataset.py b/deeplake/core/tests/test_indra_dataset.py index 361a956c64..2da1204be1 100644 --- a/deeplake/core/tests/test_indra_dataset.py +++ b/deeplake/core/tests/test_indra_dataset.py @@ -148,14 +148,14 @@ def test_load_view(local_auth_ds_generator): view_id = view_path.split("/")[-2] view = deeplake_ds.load_view(view_id, optimize=True) - dataloader = view.dataloader().pytorch() - count = 0 + count = 0 for i, batch in enumerate(dataloader): assert batch["label"][0] > 0 count += 1 assert count == 90 + @requires_libdeeplake def test_query(local_auth_ds_generator): from deeplake.enterprise.convert_to_libdeeplake import dataset_to_libdeeplake From 32e85633735ff85f305075a9b68f8035ff1e64c9 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 21 Mar 2024 16:27:33 +0000 Subject: [PATCH 49/62] Added indra flag to ingest api. --- deeplake/api/dataset.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index 60f9787a1a..7bd1fb1ad4 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -1509,6 +1509,7 @@ def ingest_coco( num_workers: int = 0, token: Optional[str] = None, connect_kwargs: Optional[Dict] = None, + indra: bool = USE_INDRA, **dataset_kwargs, ) -> Dataset: """Ingest images and annotations in COCO format to a Deep Lake Dataset. The source data can be stored locally or in the cloud. @@ -1600,7 +1601,12 @@ def ingest_coco( structure = unstructured.prepare_structure(inspect_limit) ds = deeplake.empty( - dest, creds=dest_creds, verbose=False, token=token, **dataset_kwargs + dest, + creds=dest_creds, + verbose=False, + token=token, + indra=indra, + **dataset_kwargs, ) if connect_kwargs is not None: connect_kwargs["token"] = token or connect_kwargs.get("token") @@ -1631,6 +1637,7 @@ def ingest_yolo( num_workers: int = 0, token: Optional[str] = None, connect_kwargs: Optional[Dict] = None, + indra: bool = USE_INDRA, **dataset_kwargs, ) -> Dataset: """Ingest images and annotations (bounding boxes or polygons) in YOLO format to a Deep Lake Dataset. The source data can be stored locally or in the cloud. 
@@ -1726,7 +1733,12 @@ def ingest_yolo( structure = unstructured.prepare_structure() ds = deeplake.empty( - dest, creds=dest_creds, verbose=False, token=token, **dataset_kwargs + dest, + creds=dest_creds, + verbose=False, + token=token, + indra=indra, + **dataset_kwargs, ) if connect_kwargs is not None: connect_kwargs["token"] = token or connect_kwargs.get("token") @@ -1857,6 +1869,7 @@ def ingest_classification( dest_creds=dest_creds, progressbar=progressbar, token=token, + indra=indra, **dataset_kwargs, ) return ds @@ -1879,7 +1892,12 @@ def ingest_classification( unstructured = ImageClassification(source=src) ds = deeplake.empty( - dest, creds=dest_creds, token=token, verbose=False, **dataset_kwargs + dest, + creds=dest_creds, + token=token, + verbose=False, + indra=indra, + **dataset_kwargs, ) if connect_kwargs is not None: connect_kwargs["token"] = token or connect_kwargs.get("token") @@ -1974,6 +1992,7 @@ def ingest_kaggle( progressbar=progressbar, summary=summary, shuffle=shuffle, + indra=indra, **dataset_kwargs, ) @@ -2064,7 +2083,12 @@ def ingest_dataframe( dest = convert_pathlib_to_string_if_needed(dest) ds = deeplake.empty( - dest, creds=dest_creds, token=token, verbose=False, **dataset_kwargs + dest, + creds=dest_creds, + token=token, + verbose=False, + indra=indra, + **dataset_kwargs, ) if connect_kwargs is not None: connect_kwargs["token"] = token or connect_kwargs.get("token") From fde3acc349682e8f254b3021e6486fbc03b3c952 Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Thu, 21 Mar 2024 17:10:05 +0000 Subject: [PATCH 50/62] Fix. --- deeplake/api/dataset.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py index c05b7e76c4..9fe4bf3504 100644 --- a/deeplake/api/dataset.py +++ b/deeplake/api/dataset.py @@ -1562,6 +1562,7 @@ def ingest_coco( num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default. token (Optional[str]): The token to use for accessing the dataset and/or connecting it to Deep Lake. connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`. + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`. Returns: @@ -1685,6 +1686,7 @@ def ingest_yolo( num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default. token (Optional[str]): The token to use for accessing the dataset and/or connecting it to Deep Lake. connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`. + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`. Returns: @@ -1767,6 +1769,7 @@ def ingest_classification( shuffle: bool = True, token: Optional[str] = None, connect_kwargs: Optional[Dict] = None, + indra: bool = USE_INDRA, **dataset_kwargs, ) -> Dataset: """Ingest a dataset of images from a local folder to a Deep Lake Dataset. Images should be stored in subfolders by class name. 
@@ -1787,6 +1790,7 @@ def ingest_classification( shuffle (bool): Shuffles the input data prior to ingestion. Since data arranged in folders by class is highly non-random, shuffling is important in order to produce optimal results when training. Defaults to ``True``. token (Optional[str]): The token to use for accessing the dataset. connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`. + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function see :func:`deeplake.empty`. Returns: @@ -1927,6 +1931,7 @@ def ingest_kaggle( progressbar: bool = True, summary: bool = True, shuffle: bool = True, + indra: bool = USE_INDRA, **dataset_kwargs, ) -> Dataset: """Download and ingest a kaggle dataset and store it as a structured dataset to destination. @@ -1946,6 +1951,7 @@ def ingest_kaggle( progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default. summary (bool): Generates ingestion summary. Set to ``True`` by default. shuffle (bool): Shuffles the input data prior to ingestion. Since data arranged in folders by class is highly non-random, shuffling is important in order to produce optimal results when training. Defaults to ``True``. + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.dataset`. Returns: @@ -2008,6 +2014,7 @@ def ingest_dataframe( progressbar: bool = True, token: Optional[str] = None, connect_kwargs: Optional[Dict] = None, + indra: bool = USE_INDRA, **dataset_kwargs, ): """Convert pandas dataframe to a Deep Lake Dataset. The contents of the dataframe can be parsed literally, or can be treated as links to local or cloud files. @@ -2057,6 +2064,7 @@ def ingest_dataframe( progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default. token (Optional[str]): The token to use for accessing the dataset. connect_kwargs (Optional[Dict]): A dictionary containing arguments to be passed to the dataset connect method. See :meth:`Dataset.connect`. + indra (bool): Flag indicating whether indra api should be used to create the dataset. Defaults to false **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`. Returns: From 6059c6ee27f0990c3b4e56748c32a1eb63b862ed Mon Sep 17 00:00:00 2001 From: khustup2 <sasun@activeloop.ai> Date: Sun, 24 Mar 2024 12:34:23 +0000 Subject: [PATCH 51/62] Added ingest dataframe with indra. 
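When the indra flag is set, ingest_dataframe writes through api.dataset_writer and hands back an IndraDatasetView reloaded via api.load_from_storage; the ingestion tests wrap .numpy() calls in np.array(...) because the view may return a list for text columns. Rough usage, assuming a local destination path:

    import deeplake
    import pandas as pd

    df = pd.DataFrame({"label": [1, 2, 3], "name": ["a", "b", "c"]})
    ds = deeplake.ingest_dataframe(df, "./frame_ds", indra=True)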
---
 deeplake/api/dataset.py               | 26 ++++++++++++++++++--------
 deeplake/auto/tests/test_ingestion.py |  4 ++--
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/deeplake/api/dataset.py b/deeplake/api/dataset.py
index 9fe4bf3504..5706ee2656 100644
--- a/deeplake/api/dataset.py
+++ b/deeplake/api/dataset.py
@@ -2089,20 +2089,30 @@ def ingest_dataframe(
         structured = DataFrame(src, column_params, src_creds, creds_key)
 
         dest = convert_pathlib_to_string_if_needed(dest)
-        ds = deeplake.empty(
-            dest,
-            creds=dest_creds,
-            token=token,
-            verbose=False,
-            indra=indra,
-            **dataset_kwargs,
-        )
+        if indra:
+            from indra import api
+
+            ds = api.dataset_writer(
+                dest, creds=dest_creds, token=token, **dataset_kwargs
+            )
+        else:
+            ds = deeplake.empty(
+                dest,
+                creds=dest_creds,
+                token=token,
+                verbose=False,
+                **dataset_kwargs,
+            )
         if connect_kwargs is not None:
             connect_kwargs["token"] = token or connect_kwargs.get("token")
             ds.connect(**connect_kwargs)
 
         structured.fill_dataset(ds, progressbar)  # type: ignore
 
+        if indra:
+            ids = api.load_from_storage(ds.storage)
+            return IndraDatasetView(indra_ds=ids)
+
         return ds  # type: ignore
 
     @staticmethod
diff --git a/deeplake/auto/tests/test_ingestion.py b/deeplake/auto/tests/test_ingestion.py
index b8432325e8..f11c9a3c5d 100644
--- a/deeplake/auto/tests/test_ingestion.py
+++ b/deeplake/auto/tests/test_ingestion.py
@@ -231,7 +231,7 @@ def test_csv(memory_ds: Dataset, dataframe_ingestion_data: dict):
     assert ds[tensors_names[2]].htype == "text"
     assert ds[tensors_names[2]].dtype == str
     np.testing.assert_array_equal(
-        ds[tensors_names[2]].numpy().reshape(-1), df[df_keys[2]].values
+        np.array(ds[tensors_names[2]].numpy()).reshape(-1), df[df_keys[2]].values
     )
 
 
@@ -273,7 +273,7 @@ def test_dataframe_basic(
     assert ds[df_keys[2]].htype == "text"
     assert ds[df_keys[2]].dtype == str
     np.testing.assert_array_equal(
-        ds[df_keys[2]].numpy().reshape(-1), df[df_keys[2]].values
+        np.array(ds[df_keys[2]].numpy()).reshape(-1), df[df_keys[2]].values
     )
 
From 7d13829fec1dc8d991af749cf14c62e68ad6b72c Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Sun, 24 Mar 2024 21:52:30 +0000
Subject: [PATCH 52/62] Adapt test to indra.

---
 deeplake/auto/tests/test_ingestion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deeplake/auto/tests/test_ingestion.py b/deeplake/auto/tests/test_ingestion.py
index f11c9a3c5d..ff0177ada2 100644
--- a/deeplake/auto/tests/test_ingestion.py
+++ b/deeplake/auto/tests/test_ingestion.py
@@ -342,7 +342,7 @@ def test_dataframe_array(memory_ds: Dataset):
     )
 
     np.testing.assert_array_equal(
-        ds[df_keys[2]][0:3].numpy().reshape(-1), df[df_keys[2]].values[0:3]
+        np.array(ds[df_keys[2]][0:3].numpy()).reshape(-1), df[df_keys[2]].values[0:3]
     )
 
     assert ds[df_keys[2]].dtype == df[df_keys[2]].dtype
From 8f79d8d06f35ea4927cf17678b5c53b340427a33 Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Mon, 25 Mar 2024 19:48:41 +0000
Subject: [PATCH 53/62] Bump libdeeplake version.
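
Presumably required so the ingest changes above can rely on the
matching indra-side support (api.dataset_writer / api.load_from_storage);
the pin is what the enterprise extra resolves to, mirroring setup.py:

    # Sketch; only the version literal changes in this commit.
    libdeeplake = "libdeeplake==0.0.110"
    extras_require["enterprise"] = [libdeeplake, "pyjwt"]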
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ea69cef28e..b7fcc269f7 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ def libdeeplake_available():
     extras_require["all"] = [req_map[r] for r in all_extras]
 
     if libdeeplake_available():
-        libdeeplake = "libdeeplake==0.0.109"
+        libdeeplake = "libdeeplake==0.0.110"
         extras_require["enterprise"] = [libdeeplake, "pyjwt"]
         extras_require["all"].append(libdeeplake)
         install_requires.append(libdeeplake)
From 42c8e6ff40aedfea8713ab5a2efea8ccdd009f23 Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Tue, 26 Mar 2024 19:14:07 +0400
Subject: [PATCH 54/62] Bump libdeeplake version.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b7fcc269f7..17b8dc3067 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ def libdeeplake_available():
     extras_require["all"] = [req_map[r] for r in all_extras]
 
     if libdeeplake_available():
-        libdeeplake = "libdeeplake==0.0.110"
+        libdeeplake = "libdeeplake==0.0.111"
         extras_require["enterprise"] = [libdeeplake, "pyjwt"]
         extras_require["all"].append(libdeeplake)
         install_requires.append(libdeeplake)
From f2a4a1a1852a1c82208b7e35bb71e37095aa6d59 Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Mon, 1 Apr 2024 15:21:14 +0000
Subject: [PATCH 55/62] Bump libdeeplake version.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 17b8dc3067..08b9c9cb83 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ def libdeeplake_available():
     extras_require["all"] = [req_map[r] for r in all_extras]
 
     if libdeeplake_available():
-        libdeeplake = "libdeeplake==0.0.111"
+        libdeeplake = "libdeeplake==0.0.114"
         extras_require["enterprise"] = [libdeeplake, "pyjwt"]
         extras_require["all"].append(libdeeplake)
         install_requires.append(libdeeplake)
From 2e0cc0eab19a659a8c8c3862416bfb7a6a807768 Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Tue, 2 Apr 2024 19:25:21 +0000
Subject: [PATCH 56/62] Set endpoint.

---
 conftest.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/conftest.py b/conftest.py
index 3cb3db25f3..25b2029e1e 100644
--- a/conftest.py
+++ b/conftest.py
@@ -19,6 +19,13 @@
 
 deeplake.client.config.USE_STAGING_ENVIRONMENT = True
 
+try:
+    from indra import api
+
+    api.backend.set_endpoint("https://app-staging.activeloop.dev")
+except ImportError:
+    pass
+
 from deeplake.constants import *
 from deeplake.tests.common import SESSION_ID
 
From 8568cb761fa9c1c5711be76cf3969dc53c4ab33e Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Tue, 2 Apr 2024 21:41:37 +0000
Subject: [PATCH 57/62] Bump libdeeplake version.

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 08b9c9cb83..1d0df6b226 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ def libdeeplake_available():
     extras_require["all"] = [req_map[r] for r in all_extras]
 
     if libdeeplake_available():
-        libdeeplake = "libdeeplake==0.0.114"
+        libdeeplake = "libdeeplake==0.0.117"
         extras_require["enterprise"] = [libdeeplake, "pyjwt"]
         extras_require["all"].append(libdeeplake)
         install_requires.append(libdeeplake)
From 69a00e63f9e944f3d14236d6afe1ff30538cc665 Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Tue, 2 Apr 2024 22:08:39 +0000
Subject: [PATCH 58/62] Fixed linter.
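
mypy cannot resolve the optional indra package, so the guarded import
in conftest.py is annotated with "# type: ignore". The resulting
pattern:

    try:
        from indra import api  # type: ignore

        api.backend.set_endpoint("https://app-staging.activeloop.dev")
    except ImportError:
        pass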
---
 conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conftest.py b/conftest.py
index 25b2029e1e..78880bfa3d 100644
--- a/conftest.py
+++ b/conftest.py
@@ -20,7 +20,7 @@
 deeplake.client.config.USE_STAGING_ENVIRONMENT = True
 
 try:
-    from indra import api
+    from indra import api  # type: ignore
 
     api.backend.set_endpoint("https://app-staging.activeloop.dev")
 except ImportError:
From 75220629a2761dfc302e779585698c5456341fe4 Mon Sep 17 00:00:00 2001
From: Sasun Hambardzumyan <xustup@gmail.com>
Date: Wed, 3 Apr 2024 08:41:20 +0000
Subject: [PATCH 59/62] Reset workflow.

---
 .github/workflows/test-push.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-push.yml b/.github/workflows/test-push.yml
index 1025de7985..1bf9b2438c 100644
--- a/.github/workflows/test-push.yml
+++ b/.github/workflows/test-push.yml
@@ -129,7 +129,7 @@ jobs:
   test:
     name: Test
     needs: setup
-    uses: activeloopai/shared-github-actions/.github/workflows/full_test.yml@v4
+    uses: activeloopai/shared-github-actions/.github/workflows/full_test.yml@main
     if: github.repository == 'activeloopai/deeplake'
     with:
       repo: ${{ github.repository }}
From 880dbbc2bc062bd9510dc8fb62fbed7bd59c1c0c Mon Sep 17 00:00:00 2001
From: Sasun Hambardzumyan <xustup@gmail.com>
Date: Wed, 3 Apr 2024 08:42:33 +0000
Subject: [PATCH 60/62] Restore shuffling.

---
 deeplake/enterprise/dataloader.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/deeplake/enterprise/dataloader.py b/deeplake/enterprise/dataloader.py
index 0a1f1f0791..a62844b79f 100644
--- a/deeplake/enterprise/dataloader.py
+++ b/deeplake/enterprise/dataloader.py
@@ -105,7 +105,6 @@ def __init__(
         _return_index=None,
         _primary_tensor_name=None,
         _buffer_size=None,
-        _orig_dataset=None,
         _decode_method=None,
         _persistent_workers=None,
         _dataloader=None,
@@ -117,7 +116,6 @@ def __init__(
     ):
         import_indra_loader()
         self.dataset = dataset
-        self._orig_dataset = _orig_dataset or dataset
         self._batch_size = _batch_size
         self._shuffle = _shuffle
         self._num_threads = _num_threads
@@ -279,9 +277,9 @@ def collate_fn(self):
 
     def __len__(self):
         len_ds = (
-            len(self._orig_dataset[self._tensors])
+            len(self.dataset[self._tensors])
             if self._tensors is not None
-            else len(self._orig_dataset)
+            else len(self.dataset)
         )
         round_fn = math.floor if self._drop_last else math.ceil
         return round_fn(len_ds / ((self.batch_size) * self._world_size))
@@ -805,7 +803,7 @@ def __get_indra_dataloader(
     def __iter__(self):
         if self._dataloader is None:
-            dataset = self._orig_dataset
+            dataset = self.dataset
 
             tensors = self._tensors or map_tensor_keys(dataset, None)
 
             jpeg_png_compressed_tensors, json_tensors, list_tensors = check_tensors(
@@ -857,7 +855,7 @@ def __iter__(self):
                 tensor_info_dict=tensor_info_dict,
             )
 
-        dataset_read(self._orig_dataset)
+        dataset_read(self.dataset)
 
         if self._iterator is not None:
             self._iterator = iter(self._dataloader)
From 6578d812710a4cfc99fdb9fb420a21cc8927c39d Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Thu, 4 Apr 2024 18:32:36 +0400
Subject: [PATCH 61/62] Bump libdeeplake version.
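
Presumably pairs with the dataloader cleanup in the previous commit:
with _orig_dataset gone, length and iteration go through self.dataset,
and batches per rank follow the usual formula. A standalone sketch of
that arithmetic (the helper name is illustrative, not deeplake API):

    import math

    def num_batches(num_samples, batch_size, world_size, drop_last):
        # Incomplete trailing batches are dropped when drop_last=True.
        round_fn = math.floor if drop_last else math.ceil
        return round_fn(num_samples / (batch_size * world_size))

    assert num_batches(103, 10, 1, drop_last=True) == 10
    assert num_batches(103, 10, 1, drop_last=False) == 11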
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 1d0df6b226..1b2e06755f 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@ def libdeeplake_available():
     extras_require["all"] = [req_map[r] for r in all_extras]
 
     if libdeeplake_available():
-        libdeeplake = "libdeeplake==0.0.117"
+        libdeeplake = "libdeeplake==0.0.118"
         extras_require["enterprise"] = [libdeeplake, "pyjwt"]
         extras_require["all"].append(libdeeplake)
         install_requires.append(libdeeplake)
From 132f78eb14bd182d68befd722c0bca8eacac90e4 Mon Sep 17 00:00:00 2001
From: khustup2 <sasun@activeloop.ai>
Date: Thu, 4 Apr 2024 18:53:51 +0000
Subject: [PATCH 62/62] Fixed sonar.

---
 deeplake/core/dataset/dataset.py            | 8 +++-----
 deeplake/core/dataset/indra_dataset_view.py | 1 -
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/deeplake/core/dataset/dataset.py b/deeplake/core/dataset/dataset.py
index dba36190ac..29bf358e09 100644
--- a/deeplake/core/dataset/dataset.py
+++ b/deeplake/core/dataset/dataset.py
@@ -295,6 +295,7 @@ def __init__(
         self._initial_autoflush: List[bool] = (
             []
         )  # This is a stack to support nested with contexts
+
         self._indexing_history: List[int] = []
 
         if not self.read_only:
@@ -4608,7 +4609,6 @@ def _optimize_and_copy_view(
         self,
         info,
         path: str,
-        new_path: str,
         tensors: Optional[List[str]] = None,
         external=False,
         unlink=True,
@@ -4637,7 +4637,7 @@ def _optimize_and_copy_view(
                 progressbar=progressbar,
             )
             optimized.info.update(vds.info.__getstate__())
-        return (vds, optimized)
+        return (vds, optimized, new_path)
 
     def _optimize_saved_view(
         self,
@@ -4665,11 +4665,9 @@ def _optimize_saved_view(
                 # Already optimized
                 return info
             path = info.get("path", info["id"])
-            new_path = path + "_OPTIMIZED"
-            old, new = self._optimize_and_copy_view(
+            old, new, new_path = self._optimize_and_copy_view(
                 info,
                 path,
-                new_path,
                 tensors=tensors,
                 unlink=unlink,
                 num_workers=num_workers,
diff --git a/deeplake/core/dataset/indra_dataset_view.py b/deeplake/core/dataset/indra_dataset_view.py
index 9cc12168a7..364d74d94c 100644
--- a/deeplake/core/dataset/indra_dataset_view.py
+++ b/deeplake/core/dataset/indra_dataset_view.py
@@ -116,7 +116,6 @@ def _get_tensor_from_root(self, fullpath):
         tensors = self.indra_ds.tensors
         for tensor in tensors:
             if tensor.name == fullpath:
-                deeplake_tensor = None
                 indra_tensor = tensor
 
         return IndraTensorView(indra_tensor)
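
Note on the dataset.py change: _optimize_and_copy_view now derives the
optimized view's path itself (presumably the "_OPTIMIZED" suffix moved
into the callee) and returns it, so callers unpack three values. A
standalone sketch of the reshaped contract (names illustrative, not the
deeplake API):

    def optimize_and_copy_view(path: str):
        # The callee owns the destination derivation and hands it back,
        # instead of receiving it as an argument.
        new_path = path + "_OPTIMIZED"
        vds, optimized = "vds:" + path, "optimized:" + new_path
        return vds, optimized, new_path

    old, new, new_path = optimize_and_copy_view(".queries/abc123")
    assert new_path.endswith("_OPTIMIZED")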