Merge remote-tracking branch 'origin' into managed-thin-client
dgaloop committed Jan 22, 2024
2 parents 254fd6c + ac1c471 commit d06ca2b
Showing 14 changed files with 45 additions and 33 deletions.
2 changes: 1 addition & 1 deletion deeplake/__init__.py
@@ -89,7 +89,7 @@
 ]


-__version__ = "3.8.14"
+__version__ = "3.8.17"
 warn_if_update_required(__version__)
 __encoded_version__ = np.array(__version__)
 config = {"s3": Config(max_pool_connections=50, connect_timeout=300, read_timeout=300)}
5 changes: 4 additions & 1 deletion deeplake/core/chunk/chunk_compressed_chunk.py
@@ -151,6 +151,9 @@ def extend_if_has_space_byte_compression_numpy(
         update_tensor_meta: bool = True,
     ):
         sample = incoming_samples[0]
+        shape = sample.shape
+        if not shape:
+            shape = (1,)
         chunk_dtype = self.dtype
         sample_dtype = sample.dtype
         if chunk_dtype == sample_dtype:
@@ -220,7 +223,7 @@ def extend_if_has_space_byte_compression_numpy(
         if num_samples:
             self.register_in_meta_and_headers(
                 sample_nbytes,
-                sample.shape,
+                shape,
                 update_tensor_meta=update_tensor_meta,
                 num_samples=num_samples,
             )
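The fallback above matters because a zero-dimensional numpy array reports an empty shape tuple, which is falsy and would otherwise be registered as a shapeless sample. A minimal standalone sketch of the behavior (not deeplake code):

import numpy as np

scalar = np.array(5)        # 0-d array, e.g. a single scalar sample
print(scalar.shape)         # () -- empty tuple, falsy
print(np.array([5]).shape)  # (1,)

# The guard in the hunk normalizes scalars so the chunk metadata
# always records at least one dimension:
shape = scalar.shape or (1,)
print(shape)                # (1,)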
2 changes: 1 addition & 1 deletion deeplake/core/storage/azure.py
@@ -180,7 +180,7 @@ def _all_keys(self, prefix: str = ""):
         self._check_update_creds()
         prefix = posixpath.join(self.root_folder, prefix)
         return {
-            relpath(blob.name, self.root_folder)
+            posixpath.relpath(blob.name, self.root_folder)
             for blob in self.container_client.list_blobs(
                 name_starts_with=prefix, include=["metadata"]
             )
2 changes: 1 addition & 1 deletion deeplake/core/storage/gcs.py
@@ -325,7 +325,7 @@ def _get_path_from_key(self, key):

     def _all_keys(self):
         self._blob_objects = self.client_bucket.list_blobs(prefix=self.path)
-        return {relpath(obj.name, self.path) for obj in self._blob_objects}
+        return {posixpath.relpath(obj.name, self.path) for obj in self._blob_objects}

     def _set_hub_creds_info(
         self,
2 changes: 1 addition & 1 deletion deeplake/core/storage/local.py
@@ -175,7 +175,7 @@ def _all_keys(self, refresh: bool = False) -> Set[str]:
         for root, dirs, files in os.walk(full_path):
             for file in files:
                 key_set.add(
-                    relpath(
+                    posixpath.relpath(
                         posixpath.join(pathlib.Path(root).as_posix(), file),
                         pathlib.Path(full_path).as_posix(),
                     )
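All three storage providers (azure, gcs, local) make the same fix: the bare relpath resolves to os.path.relpath, which is ntpath on Windows and returns backslash-separated paths, while storage keys must always use forward slashes. A small illustration of the difference; this runs on any platform, since both modules are always importable:

import ntpath      # what os.path is on Windows
import posixpath   # what os.path is on POSIX systems

blob = "root_folder/images/cat.png"

print(ntpath.relpath(blob, "root_folder"))     # images\cat.png -- backslashes
print(posixpath.relpath(blob, "root_folder"))  # images/cat.png -- stable key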
18 changes: 12 additions & 6 deletions deeplake/core/storage/s3.py
@@ -1,5 +1,6 @@
 import deeplake
 from math import ceil
+from io import BytesIO
 import time
 import boto3
 import botocore  # type: ignore
@@ -8,6 +9,7 @@
 from datetime import datetime, timezone
 from botocore.session import ComponentLocator
 from deeplake.client.client import DeepLakeBackendClient
+from deeplake.constants import GB
 from deeplake.core.storage.provider import StorageProvider
 from deeplake.util.exceptions import (
     S3GetAccessError,
@@ -153,12 +155,16 @@ def subdir(self, path: str, read_only: bool = False):
         return sd

     def _set(self, path, content):
-        self.client.put_object(
-            Bucket=self.bucket,
-            Body=content,
-            Key=path,
-            ContentType="application/octet-stream",  # signifies binary data
-        )
+        if len(content) >= 1 * GB:
+            stream = BytesIO(content)
+            self.client.upload_fileobj(stream, self.bucket, path)
+        else:
+            self.client.put_object(
+                Bucket=self.bucket,
+                Body=content,
+                Key=path,
+                ContentType="application/octet-stream",  # signifies binary data
+            )

     def __setitem__(self, path, content):
         """Sets the object present at the path with the value
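For payloads of 1 GB or more, _set now hands the bytes to upload_fileobj, boto3's managed transfer, which splits large bodies into multipart uploads and retries individual parts, instead of issuing one put_object PUT (a single PUT is capped at 5 GB and holds the whole body in one request). A hedged sketch of the same idea with an explicit transfer config; the bucket, key, and chunk sizes here are illustrative, not the values deeplake uses:

import boto3
from io import BytesIO
from boto3.s3.transfer import TransferConfig

s3 = boto3.client("s3")
config = TransferConfig(
    multipart_threshold=64 * 1024 * 1024,  # switch to multipart above 64 MB
    multipart_chunksize=64 * 1024 * 1024,  # upload in 64 MB parts
)

content = b"..."  # some large payload
s3.upload_fileobj(BytesIO(content), "my-bucket", "path/to/key", Config=config)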
2 changes: 1 addition & 1 deletion deeplake/core/tests/test_deeplake_indra_dataset.py
@@ -199,7 +199,7 @@ def test_metadata(local_auth_ds_generator):
     assert deeplake_indra_ds.label.sample_compression == None
     assert deeplake_indra_ds.image.htype == "image"
     assert deeplake_indra_ds.image.dtype == np.uint8
-    assert deeplake_indra_ds.image.sample_compression == "jpg"
+    assert deeplake_indra_ds.image.sample_compression == "jpeg"
     assert deeplake_indra_ds.sequence.htype == "sequence[class_label]"
     assert deeplake_indra_ds.sequence.dtype == np.uint8
     assert deeplake_indra_ds.sequence.sample_compression == None
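The test now expects the canonical compression name "jpeg" rather than the "jpg" alias. For reference, a hedged sketch of declaring a tensor with that compression, assuming the deeplake 3.x API (the dataset path and tensor name are illustrative):

import deeplake

ds = deeplake.dataset("mem://example")  # in-memory dataset for the sketch
ds.create_tensor("image", htype="image", sample_compression="jpeg")
assert ds.image.meta.sample_compression == "jpeg"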
5 changes: 4 additions & 1 deletion deeplake/core/transform/transform_dataset.py
@@ -237,7 +237,10 @@ def flush(self, clear_on_fail=True):
             name = self._get_engine_name(name)
             updated_tensors[name] = 0
             chunk_engine = all_chunk_engines[name]
-            callback = chunk_engine._transform_callback
+            if chunk_engine.tensor_meta.links:
+                callback = chunk_engine._transform_callback
+            else:
+                callback = None

             meta = chunk_engine.tensor_meta
             if meta.length == 0 and meta.dtype is None:
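The change wires up the chunk engine's transform callback only when the tensor meta actually has linked tensors to update; tensors without links flush with no callback. The new four-line block is equivalent to this conditional expression (a restatement for clarity, not a suggested change):

# Only tensors with links need the transform callback;
# everything else flushes with None.
callback = (
    chunk_engine._transform_callback if chunk_engine.tensor_meta.links else None
)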
5 changes: 2 additions & 3 deletions deeplake/core/vectorstore/deep_memory/deep_memory.py
@@ -11,8 +11,7 @@

 import deeplake
 from deeplake.util.exceptions import (
-    DeepMemoryWaitingListError,
-    DeepMemoryWaitingListError,
+    DeepMemoryAccessError,
     IncorrectRelevanceTypeError,
     IncorrectQueriesTypeError,
 )
@@ -43,7 +42,7 @@
 def access_control(func):
     def wrapper(self, *args, **kwargs):
         if self.client is None:
-            raise DeepMemoryWaitingListError()
+            raise DeepMemoryAccessError()
         return func(self, *args, **kwargs)

     return wrapper
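access_control is a plain gating decorator: every Deep Memory method it wraps raises DeepMemoryAccessError when no backend client is configured. A self-contained sketch of the same pattern; the demo class and method names are illustrative, not deeplake's:

import functools

class DeepMemoryAccessError(Exception):
    """Raised when Deep Memory is used without backend access."""

def access_control(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        if self.client is None:  # no backend client configured
            raise DeepMemoryAccessError()
        return func(self, *args, **kwargs)
    return wrapper

class DeepMemoryDemo:
    def __init__(self, client=None):
        self.client = client

    @access_control
    def train(self, queries, relevance):
        return "training started"

print(DeepMemoryDemo(client=object()).train([], []))  # training started
try:
    DeepMemoryDemo().train([], [])
except DeepMemoryAccessError:
    print("blocked: no client")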
12 changes: 6 additions & 6 deletions deeplake/core/vectorstore/deep_memory/test_deepmemory.py
@@ -13,11 +13,11 @@
 )
 from deeplake.tests.common import requires_libdeeplake
 from deeplake.util.exceptions import (
-    DeepMemoryWaitingListError,
+    DeepMemoryAccessError,
     IncorrectQueriesTypeError,
     IncorrectRelevanceTypeError,
 )
-from deeplake.util.exceptions import DeepMemoryWaitingListError
+from deeplake.util.exceptions import DeepMemoryAccessError


 logger = logging.getLogger(__name__)
@@ -593,19 +593,19 @@ def test_unsupported_deepmemory_users(local_ds):
         logger=logger,
         embedding_function=DummyEmbedder,
     )
-    with pytest.raises(DeepMemoryWaitingListError):
+    with pytest.raises(DeepMemoryAccessError):
         dm.train(
             queries=[],
             relevance=[],
         )

-    with pytest.raises(DeepMemoryWaitingListError):
+    with pytest.raises(DeepMemoryAccessError):
         dm.status(job_id="123")

-    with pytest.raises(DeepMemoryWaitingListError):
+    with pytest.raises(DeepMemoryAccessError):
         dm.list_jobs()

-    with pytest.raises(DeepMemoryWaitingListError):
+    with pytest.raises(DeepMemoryAccessError):
         dm.evaluate(
             queries=[],
             relevance=[],
13 changes: 7 additions & 6 deletions deeplake/core/vectorstore/deeplake_vectorstore.py
@@ -14,14 +14,14 @@
     TARGET_BYTE_SIZE,
 )
 from deeplake.util.bugout_reporter import feature_report_path
-from deeplake.util.exceptions import DeepMemoryWaitingListError
 from deeplake.util.path import convert_pathlib_to_string_if_needed
 from deeplake.core.vectorstore.dataset_handlers.managed_dataset_handler import (
     ManagedDH,
 )
 from deeplake.core.vectorstore.dataset_handlers.embedded_dataset_handler import (
     EmbeddedDH,
 )
+from deeplake.util.exceptions import DeepMemoryAccessError


 logger = logging.getLogger(__name__)
@@ -39,7 +39,6 @@ def __init__(
         read_only: Optional[bool] = None,
         ingestion_batch_size: int = 1000,
         index_params: Optional[Dict[str, Union[int, str]]] = None,
-        num_workers: int = 0,
         exec_option: str = "auto",
         token: Optional[str] = None,
         overwrite: bool = False,
@@ -83,7 +82,6 @@
             tensor_params (List[Dict[str, dict]], optional): List of dictionaries that contains information about tensors that user wants to create. See ``create_tensor`` in Deep Lake API docs for more information. Defaults to ``DEFAULT_VECTORSTORE_TENSORS``.
             embedding_function (Optional[Any], optional): Function or class that converts the embeddable data into embeddings. Input to `embedding_function` is a list of data and output is a list of embeddings. Defaults to None.
             read_only (bool, optional): Opens dataset in read-only mode if True. Defaults to False.
-            num_workers (int): Number of workers to use for parallel ingestion.
             ingestion_batch_size (int): Batch size to use for parallel ingestion.
             index_params (Dict[str, Union[int, str]]): Dictionary containing information about vector index that will be created. Defaults to None, which will utilize ``DEFAULT_VECTORSTORE_INDEX_PARAMS`` from ``deeplake.constants``. The specified key-values override the default ones.
                 - threshold: The threshold for the dataset size above which an index will be created for the embedding tensor. When the threshold value is set to -1, index creation is turned off.
@@ -116,6 +114,9 @@
         Danger:
             Setting ``overwrite`` to ``True`` will delete all of your data if the Vector Store exists! Be very careful when setting this parameter.
         """
+
+        kwargs.pop("num_workers", None)
+
         self.dataset_handler = get_dataset_handler(
             path=path,
             dataset=dataset,
@@ -124,7 +125,7 @@
             read_only=read_only,
             ingestion_batch_size=ingestion_batch_size,
             index_params=index_params,
-            num_workers=num_workers,
+            num_workers=0,
             exec_option=exec_option,
             token=token,
             overwrite=overwrite,
@@ -300,13 +301,13 @@ def search(
         Raises:
             ValueError: When invalid parameters are specified.
             ValueError: when deep_memory is True. Deep Memory is only available for datasets stored in the Deep Lake Managed Database for paid accounts.
-            DeepMemoryWaitingListError: if user is not waitlisted to use deep_memory.
+            DeepMemoryAccessError: if user does not have access to deep_memory.
         Returns:
             Dict: Dictionary where keys are tensor names and values are the results of the search
         """
         if deep_memory and not self.deep_memory:
-            raise DeepMemoryWaitingListError()
+            raise DeepMemoryAccessError()

         return self.dataset_handler.search(
             embedding_data=embedding_data,
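Dropping num_workers from the signature while popping it out of **kwargs keeps existing callers that still pass the argument working; the dataset handler is simply pinned to num_workers=0. A common variant of this pattern additionally warns the caller. A sketch under that assumption, not what deeplake does; _strip_num_workers is a hypothetical helper:

import warnings

def _strip_num_workers(kwargs: dict) -> dict:
    """Drop the legacy num_workers argument, warning the caller."""
    if "num_workers" in kwargs:
        warnings.warn(
            "num_workers is deprecated and ignored",
            DeprecationWarning,
            stacklevel=3,
        )
        kwargs.pop("num_workers")  # hypothetical: deeplake pops silently
    return kwargs

kwargs = _strip_num_workers({"num_workers": 4, "token": None})
print(kwargs)  # {'token': None}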
6 changes: 3 additions & 3 deletions deeplake/util/exceptions.py
@@ -1112,11 +1112,11 @@ def __init__(self, msg):
         super().__init__(msg)


-class DeepMemoryWaitingListError(Exception):
+class DeepMemoryAccessError(Exception):
     def __init__(self):
         msg = (
-            "Deep Memory is available only for waiting list users. "
-            "Please, follow the link and join the waiting list: https://www.deeplake.ai/deepmemory"
+            "Deep Memory is not available for organizations on the Community plan. "
+            "Please consider upgrading or starting a free trial at https://app.activeloop.ai/pricing."
         )
         super().__init__(msg)

2 changes: 1 addition & 1 deletion deeplake/util/transform.py
@@ -348,7 +348,7 @@ def store_data_slice_with_pbar(pg_callback, transform_input: Tuple) -> Dict:
     if isinstance(data_slice, deeplake.Dataset):
         data_slice = add_cache_to_dataset_slice(data_slice, tensors)

-    rel_tensors = [relpath(tensor, group_index) for tensor in visible_tensors]
+    rel_tensors = [posixpath.relpath(tensor, group_index) for tensor in visible_tensors]

     transform_dataset = TransformDataset(
         rel_tensors,
2 changes: 1 addition & 1 deletion setup.py
@@ -70,7 +70,7 @@ def libdeeplake_available():
 extras_require["all"] = [req_map[r] for r in all_extras]

 if libdeeplake_available():
-    libdeeplake = "libdeeplake==0.0.95"
+    libdeeplake = "libdeeplake==0.0.98"
     extras_require["enterprise"] = [libdeeplake, "pyjwt"]
     extras_require["all"].append(libdeeplake)
     install_requires.append(libdeeplake)
