From 52ec9b357daccf14162a77be782bed8c481db3d8 Mon Sep 17 00:00:00 2001 From: Tibor Reiss Date: Thu, 18 Jul 2024 20:57:29 +0200 Subject: [PATCH 1/7] Allow 1D numpy array --- weaviate/collections/batch/grpc_batch_objects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/weaviate/collections/batch/grpc_batch_objects.py b/weaviate/collections/batch/grpc_batch_objects.py index 84d7fddaa..76f61649b 100644 --- a/weaviate/collections/batch/grpc_batch_objects.py +++ b/weaviate/collections/batch/grpc_batch_objects.py @@ -57,7 +57,7 @@ def pack_vector(vector: Any) -> bytes: collection=obj.collection, vector_bytes=( pack_vector(obj.vector) - if obj.vector is not None and isinstance(obj.vector, list) + if obj.vector is not None else None ), uuid=str(obj.uuid) if obj.uuid is not None else str(uuid_package.uuid4()), From af1030bb4536313e3dfed7f4db6c949e7824efbc Mon Sep 17 00:00:00 2001 From: Tibor Reiss Date: Sun, 21 Jul 2024 08:26:20 +0200 Subject: [PATCH 2/7] Add tests --- integration/test_collection.py | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/integration/test_collection.py b/integration/test_collection.py index 561ae6c70..d7abf411b 100644 --- a/integration/test_collection.py +++ b/integration/test_collection.py @@ -1,10 +1,12 @@ import datetime import io import pathlib +import struct import time import uuid from typing import Any, Callable, Dict, List, Optional, Sequence, TypedDict, Union +import numpy as np import pytest from integration.conftest import CollectionFactory, CollectionFactoryGet, _sanitize_collection_name @@ -293,6 +295,54 @@ class TestInsertManyWithTypedDict(TypedDict): assert obj2.properties["name"] == "some other name" +@pytest.mark.parametrize( + "objects, should_error", + [ + ( + [ + DataObject(properties={"name": "some numpy one"}, vector=np.array([1, 2, 3])), + ], + False, + ), + ( + [ + DataObject(properties={"name": "some numpy one"}, vector=np.array([1, 2, 3])), + DataObject(properties={"name": "some numpy two"}, vector=np.array([11, 12, 13])), + ], + False, + ), + ( + [ + DataObject( + properties={"name": "some numpy 2d"}, vector=np.array([[1, 2, 3], [11, 12, 13]]) + ), + ], + True, + ), + ], +) +def test_insert_many_with_numpy( + collection_factory: CollectionFactory, + objects: Sequence[DataObject[WeaviateProperties, Any]], + should_error: bool, +) -> None: + collection = collection_factory( + properties=[Property(name="Name", data_type=DataType.TEXT)], + vectorizer_config=Configure.Vectorizer.none(), + ) + if not should_error: + ret = collection.data.insert_many(objects) + for idx, uuid_ in ret.uuids.items(): + obj1 = collection.query.fetch_object_by_id(uuid_, include_vector=True) + inserted = objects[idx] + assert inserted.properties["name"] == obj1.properties["name"] + assert inserted.vector.tolist() == obj1.vector["default"] # type: ignore[union-attr] + else: + with pytest.raises(struct.error) as e: + collection.data.insert_many(objects) + assert str(e.value) == "required argument is not a float" + + def test_insert_many_with_refs(collection_factory: CollectionFactory) -> None: ref_collection = collection_factory( name="target", vectorizer_config=Configure.Vectorizer.none() From fa9f776c33c2d3c6f6df0c9584d17777a1d833c9 Mon Sep 17 00:00:00 2001 From: Tibor Reiss Date: Thu, 25 Jul 2024 21:39:25 +0200 Subject: [PATCH 3/7] Fix tests --- integration/test_collection.py | 19 ++++++++++++++----- .../collections/batch/grpc_batch_objects.py | 2 +- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/integration/test_collection.py b/integration/test_collection.py index d7abf411b..6ebf81583 100644 --- a/integration/test_collection.py +++ b/integration/test_collection.py @@ -6,7 +6,6 @@ import uuid from typing import Any, Callable, Dict, List, Optional, Sequence, TypedDict, Union -import numpy as np import pytest from integration.conftest import CollectionFactory, CollectionFactoryGet, _sanitize_collection_name @@ -62,6 +61,14 @@ DATE3 = datetime.datetime.strptime("2019-06-10", "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc) +def get_numpy_vector(input_list: list) -> Any: + try: + import numpy as np + return np.array(input_list) + except ModuleNotFoundError: + return input_list + + def test_insert_with_typed_dict_generic( collection_factory: CollectionFactory, collection_factory_get: CollectionFactoryGet, @@ -300,21 +307,21 @@ class TestInsertManyWithTypedDict(TypedDict): [ ( [ - DataObject(properties={"name": "some numpy one"}, vector=np.array([1, 2, 3])), + DataObject(properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3])), ], False, ), ( [ - DataObject(properties={"name": "some numpy one"}, vector=np.array([1, 2, 3])), - DataObject(properties={"name": "some numpy two"}, vector=np.array([11, 12, 13])), + DataObject(properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3])), + DataObject(properties={"name": "some numpy two"}, vector=get_numpy_vector([11, 12, 13])), ], False, ), ( [ DataObject( - properties={"name": "some numpy 2d"}, vector=np.array([[1, 2, 3], [11, 12, 13]]) + properties={"name": "some numpy 2d"}, vector=get_numpy_vector([[1, 2, 3], [11, 12, 13]]) ), ], True, @@ -326,6 +333,8 @@ def test_insert_many_with_numpy( objects: Sequence[DataObject[WeaviateProperties, Any]], should_error: bool, ) -> None: + if isinstance(objects[0].vector, list): + pytest.skip("numpy not available") collection = collection_factory( properties=[Property(name="Name", data_type=DataType.TEXT)], vectorizer_config=Configure.Vectorizer.none(), diff --git a/weaviate/collections/batch/grpc_batch_objects.py b/weaviate/collections/batch/grpc_batch_objects.py index 76f61649b..e0450c539 100644 --- a/weaviate/collections/batch/grpc_batch_objects.py +++ b/weaviate/collections/batch/grpc_batch_objects.py @@ -57,7 +57,7 @@ def pack_vector(vector: Any) -> bytes: collection=obj.collection, vector_bytes=( pack_vector(obj.vector) - if obj.vector is not None + if obj.vector is not None and not isinstance(obj.vector, dict) else None ), uuid=str(obj.uuid) if obj.uuid is not None else str(uuid_package.uuid4()), From 1ec45b9d4d0276d43757f777c5fadb14f7580402 Mon Sep 17 00:00:00 2001 From: Tibor Reiss Date: Fri, 26 Jul 2024 20:24:05 +0200 Subject: [PATCH 4/7] Linting --- integration/test_collection.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/integration/test_collection.py b/integration/test_collection.py index 6ebf81583..fe736a7ae 100644 --- a/integration/test_collection.py +++ b/integration/test_collection.py @@ -64,6 +64,7 @@ def get_numpy_vector(input_list: list) -> Any: try: import numpy as np + return np.array(input_list) except ModuleNotFoundError: return input_list @@ -307,21 +308,28 @@ class TestInsertManyWithTypedDict(TypedDict): [ ( [ - DataObject(properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3])), + DataObject( + properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3]) + ), ], False, ), ( [ - DataObject(properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3])), - DataObject(properties={"name": "some numpy two"}, vector=get_numpy_vector([11, 12, 13])), + DataObject( + properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3]) + ), + DataObject( + properties={"name": "some numpy two"}, vector=get_numpy_vector([11, 12, 13]) + ), ], False, ), ( [ DataObject( - properties={"name": "some numpy 2d"}, vector=get_numpy_vector([[1, 2, 3], [11, 12, 13]]) + properties={"name": "some numpy 2d"}, + vector=get_numpy_vector([[1, 2, 3], [11, 12, 13]]), ), ], True, From 6eb0ad9c35032dbd80872b1aa5eac94be91c7a70 Mon Sep 17 00:00:00 2001 From: Tibor Reiss Date: Fri, 13 Sep 2024 19:47:44 +0200 Subject: [PATCH 5/7] Simplify numpy usage --- integration/test_collection.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/integration/test_collection.py b/integration/test_collection.py index fe736a7ae..00d4554bb 100644 --- a/integration/test_collection.py +++ b/integration/test_collection.py @@ -6,6 +6,7 @@ import uuid from typing import Any, Callable, Dict, List, Optional, Sequence, TypedDict, Union +import numpy as np import pytest from integration.conftest import CollectionFactory, CollectionFactoryGet, _sanitize_collection_name @@ -61,15 +62,6 @@ DATE3 = datetime.datetime.strptime("2019-06-10", "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc) -def get_numpy_vector(input_list: list) -> Any: - try: - import numpy as np - - return np.array(input_list) - except ModuleNotFoundError: - return input_list - - def test_insert_with_typed_dict_generic( collection_factory: CollectionFactory, collection_factory_get: CollectionFactoryGet, @@ -309,7 +301,7 @@ class TestInsertManyWithTypedDict(TypedDict): ( [ DataObject( - properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3]) + properties={"name": "some numpy one"}, vector=np.array([1, 2, 3]) ), ], False, @@ -317,10 +309,10 @@ class TestInsertManyWithTypedDict(TypedDict): ( [ DataObject( - properties={"name": "some numpy one"}, vector=get_numpy_vector([1, 2, 3]) + properties={"name": "some numpy one"}, vector=np.array([1, 2, 3]) ), DataObject( - properties={"name": "some numpy two"}, vector=get_numpy_vector([11, 12, 13]) + properties={"name": "some numpy two"}, vector=np.array([11, 12, 13]) ), ], False, @@ -329,7 +321,7 @@ class TestInsertManyWithTypedDict(TypedDict): [ DataObject( properties={"name": "some numpy 2d"}, - vector=get_numpy_vector([[1, 2, 3], [11, 12, 13]]), + vector=np.array([[1, 2, 3], [11, 12, 13]]), ), ], True, From b1599a80ce1af32c5413e513336f39dd82165927 Mon Sep 17 00:00:00 2001 From: Tibor Reiss Date: Thu, 26 Sep 2024 20:22:35 +0200 Subject: [PATCH 6/7] Refactor --- weaviate/collections/batch/grpc_batch_objects.py | 5 ++--- weaviate/collections/data/data.py | 7 ++++++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/weaviate/collections/batch/grpc_batch_objects.py b/weaviate/collections/batch/grpc_batch_objects.py index e0450c539..4d40750ae 100644 --- a/weaviate/collections/batch/grpc_batch_objects.py +++ b/weaviate/collections/batch/grpc_batch_objects.py @@ -49,15 +49,14 @@ def __init__(self, connection: ConnectionV4, consistency_level: Optional[Consist def __grpc_objects(self, objects: List[_BatchObject]) -> List[batch_pb2.BatchObject]: def pack_vector(vector: Any) -> bytes: - vector_list = _get_vector_v4(vector) - return struct.pack("{}f".format(len(vector_list)), *vector_list) + return struct.pack("{}f".format(len(vector)), *vector) return [ batch_pb2.BatchObject( collection=obj.collection, vector_bytes=( pack_vector(obj.vector) - if obj.vector is not None and not isinstance(obj.vector, dict) + if obj.vector is not None and isinstance(obj.vector, list) else None ), uuid=str(obj.uuid) if obj.uuid is not None else str(uuid_package.uuid4()), diff --git a/weaviate/collections/data/data.py b/weaviate/collections/data/data.py index b86d0ac94..2e64dd84a 100644 --- a/weaviate/collections/data/data.py +++ b/weaviate/collections/data/data.py @@ -363,7 +363,12 @@ async def insert_many( ( _BatchObject( collection=self.name, - vector=obj.vector, + vector=( + obj.vector + if obj.vector is None + or isinstance(obj.vector, dict) + else _get_vector_v4(obj.vector) + ), uuid=str(obj.uuid if obj.uuid is not None else uuid_package.uuid4()), properties=cast(dict, obj.properties), tenant=self._tenant, From 27422acb683743f908f32fb819bc217319ed8b6a Mon Sep 17 00:00:00 2001 From: Tibor Reiss Date: Thu, 26 Sep 2024 20:29:52 +0200 Subject: [PATCH 7/7] Reorg --- .../collections/batch/grpc_batch_objects.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/weaviate/collections/batch/grpc_batch_objects.py b/weaviate/collections/batch/grpc_batch_objects.py index 4d40750ae..a743c9086 100644 --- a/weaviate/collections/batch/grpc_batch_objects.py +++ b/weaviate/collections/batch/grpc_batch_objects.py @@ -27,11 +27,15 @@ from weaviate.util import _datetime_to_string, _get_vector_v4 +def _pack_vector(vector: Any) -> bytes: + return struct.pack("{}f".format(len(vector)), *vector) + + def _pack_named_vectors(vectors: Dict[str, List[float]]) -> List[base_pb2.Vectors]: return [ base_pb2.Vectors( name=name, - vector_bytes=struct.pack("{}f".format(len(vector)), *vector), + vector_bytes=_pack_vector(vector), ) for name, vector in vectors.items() ] @@ -48,17 +52,9 @@ def __init__(self, connection: ConnectionV4, consistency_level: Optional[Consist super().__init__(connection, consistency_level) def __grpc_objects(self, objects: List[_BatchObject]) -> List[batch_pb2.BatchObject]: - def pack_vector(vector: Any) -> bytes: - return struct.pack("{}f".format(len(vector)), *vector) - return [ batch_pb2.BatchObject( collection=obj.collection, - vector_bytes=( - pack_vector(obj.vector) - if obj.vector is not None and isinstance(obj.vector, list) - else None - ), uuid=str(obj.uuid) if obj.uuid is not None else str(uuid_package.uuid4()), properties=( self.__translate_properties_from_python_to_grpc( @@ -69,6 +65,11 @@ def pack_vector(vector: Any) -> bytes: else None ), tenant=obj.tenant, + vector_bytes=( + _pack_vector(obj.vector) + if obj.vector is not None and isinstance(obj.vector, list) + else None + ), vectors=( _pack_named_vectors(obj.vector) if obj.vector is not None and isinstance(obj.vector, dict)