Skip to content

Commit df2b3b5

Browse files
Merge pull request pinecone-io#134 from pinecone-io/add_dict_support
Index.upsert() now accepts List[dict]
2 parents 42ed389 + bce979c commit df2b3b5

File tree

3 files changed

+140
-20
lines changed

3 files changed

+140
-20
lines changed

pinecone/core/grpc/index_grpc.py

+68-10
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
# Copyright (c) 2020-2021 Pinecone Systems Inc. All right reserved.
33
#
44
import logging
5+
import numbers
56
from abc import ABC, abstractmethod
67
from functools import wraps
78
from typing import NamedTuple, Optional, Dict, Iterable, Union, List, Tuple, Any
9+
from collections.abc import Mapping
810

911
import certifi
1012
import grpc
@@ -24,7 +26,8 @@
2426
from pinecone.core.grpc.protos.vector_service_pb2_grpc import VectorServiceStub
2527
from pinecone.core.grpc.retry import RetryOnRpcErrorClientInterceptor, RetryConfig
2628
from pinecone.core.utils import _generate_request_id, dict_to_proto_struct, fix_tuple_length
27-
from pinecone.core.utils.constants import MAX_MSG_SIZE, REQUEST_ID, CLIENT_VERSION
29+
from pinecone.core.utils.constants import MAX_MSG_SIZE, REQUEST_ID, CLIENT_VERSION, REQUIRED_VECTOR_FIELDS, \
30+
OPTIONAL_VECTOR_FIELDS
2831
from pinecone.exceptions import PineconeException
2932

3033
__all__ = ["GRPCIndex", "GRPCVector", "GRPCQueryVector", "GRPCSparseValues"]
@@ -263,7 +266,7 @@ def stub_class(self):
263266
return VectorServiceStub
264267

265268
def upsert(self,
266-
vectors: Union[List[GRPCVector], List[Tuple]],
269+
vectors: Union[List[GRPCVector], List[tuple], List[dict]],
267270
async_req: bool = False,
268271
namespace: Optional[str] = None,
269272
batch_size: Optional[int] = None,
@@ -274,18 +277,25 @@ def upsert(self,
274277
If a new value is upserted for an existing vector id, it will overwrite the previous value.
275278
276279
Examples:
277-
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])],
278-
>>> namespace='ns1', async_req=True)
280+
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}),
281+
('id2', [1.0, 2.0, 3.0])
282+
],
283+
namespace='ns1', async_req=True)
284+
>>> index.upsert([{'id': 'id1', 'values': [1.0, 2.0, 3.0], 'metadata': {'key': 'value'}},
285+
{'id': 'id2',
286+
'values': [1.0, 2.0, 3.0],
287+
'sparse_values': {'indices': [1, 8], 'values': [0.2, 0.4]}},
288+
])
279289
>>> index.upsert([GRPCVector(id='id1', values=[1.0, 2.0, 3.0], metadata={'key': 'value'}),
280-
>>> GRPCVector(id='id2', values=[1.0, 2.0, 3.0]),
281-
>>> GRPCVector(id='id3',
282-
>>> values=[1.0, 2.0, 3.0],
283-
>>> sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))])
290+
GRPCVector(id='id2', values=[1.0, 2.0, 3.0]),
291+
GRPCVector(id='id3',
292+
values=[1.0, 2.0, 3.0],
293+
sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))])
284294
285295
Args:
286296
vectors (Union[List[GRPCVector], List[tuple], List[dict]]): A list of vectors to upsert.
287297
288-
A vector can be represented by a 1) GRPCVector object or a 2) tuple.
298+
A vector can be represented by 1) a tuple, 2) a GRPCVector object or 3) a dictionary.
289299
1) if a tuple is used, it must be of the form (id, values, metadata) or (id, values).
290300
where id is a string, vector is a list of floats, and metadata is a dict.
291301
Examples: ('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])
@@ -299,6 +309,10 @@ def upsert(self,
299309
values=[1.0, 2.0, 3.0],
300310
sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))
301311
312+
3) if a dictionary is used, it must be in the form
313+
{'id': str, 'values': List[float], 'sparse_values': {'indices': List[int], 'values': List[float]},
314+
'metadata': dict}
315+
302316
Note: the dimension of each vector must match the dimension of the index.
303317
async_req (bool): If True, the upsert operation will be performed asynchronously.
304318
Cannot be used with batch_size.
@@ -317,12 +331,56 @@ def upsert(self,
317331
'To upsert in parallel, please follow: '
318332
'https://docs.pinecone.io/docs/performance-tuning')
319333

334+
def _dict_to_grpc_vector(item):
    """Build a ``GRPCVector`` from a dictionary describing a single vector.

    Args:
        item: Mapping that contains every key in ``REQUIRED_VECTOR_FIELDS`` and,
            optionally, keys from ``OPTIONAL_VECTOR_FIELDS``. When present,
            ``sparse_values`` must itself be a mapping; its ``indices`` and
            ``values`` entries are handed to ``GRPCSparseValues``.

    Returns:
        The ``GRPCVector`` constructed from ``item``.

    Raises:
        ValueError: If required keys are missing, unknown keys are present, or
            ``sparse_values`` is not a mapping or is rejected by
            ``GRPCSparseValues``.
        TypeError: If ``metadata`` is neither ``None`` nor a mapping, or if
            ``GRPCVector`` rejects the supplied field types.
    """
    item_keys = set(item.keys())
    missing_keys = REQUIRED_VECTOR_FIELDS - item_keys
    if missing_keys:
        raise ValueError(
            f"Vector dictionary is missing required fields: {list(missing_keys)}")

    allowed_keys = REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS
    excessive_keys = item_keys - allowed_keys
    if excessive_keys:
        raise ValueError(f"Found excess keys in the vector dictionary: {list(excessive_keys)}. "
                         f"The allowed keys are: {list(allowed_keys)}")

    sparse_values = None
    if 'sparse_values' in item:
        raw_sparse = item['sparse_values']
        if not isinstance(raw_sparse, Mapping):
            raise ValueError(f"Column `sparse_values` is expected to be a dictionary, found {type(raw_sparse)}")
        try:
            sparse_values = GRPCSparseValues(indices=raw_sparse.get('indices', None),
                                             values=raw_sparse.get('values', None))
        except TypeError as e:
            raise ValueError("Found unexpected data in column `sparse_values`. "
                             "Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`."
                             ) from e

    metadata = item.get('metadata', None)
    if metadata is not None and not isinstance(metadata, Mapping):
        raise TypeError(f"Column `metadata` is expected to be a dictionary, found {type(metadata)}")

    try:
        return GRPCVector(id=item['id'], values=item['values'], sparse_values=sparse_values,
                          metadata=dict_to_proto_struct(metadata))

    except TypeError as e:
        # No need to raise a dedicated error for `id` - protobuf's error message is clear enough
        dense_values = item['values']
        # Inspect the elements by iteration rather than `dense_values[0]`: indexing raises
        # IndexError on an empty list (or TypeError/KeyError on non-sequence iterables) and
        # would hide the original error instead of explaining it.
        if not isinstance(dense_values, Iterable) or \
                not all(isinstance(value, numbers.Real) for value in dense_values):
            raise TypeError("Column `values` is expected to be a list of floats") from e
        raise
371+
320372
def _vector_transform(item):
    """Convert one user-supplied vector into a ``GRPCVector``.

    Accepts a ``GRPCVector`` (returned as-is), a tuple of at most three
    elements, or a mapping (delegated to ``_dict_to_grpc_vector``).

    Raises:
        ValueError: If a tuple has more than three elements, or if ``item`` is
            none of the supported types.
    """
    # Already in the target representation: nothing to convert.
    if isinstance(item, GRPCVector):
        return item

    if isinstance(item, tuple):
        tuple_len = len(item)
        if tuple_len > 3:
            raise ValueError(f"Found a tuple of length {tuple_len} which is not supported. "
                             f"Vectors can be represented as tuples either the form (id, values, metadata) or (id, values). "
                             f"To pass sparse values please use either dicts or GRPCVector objects as inputs.")
        # presumably fix_tuple_length pads a shorter tuple up to 3 entries - confirm against the helper
        vec_id, vec_values, vec_metadata = fix_tuple_length(item, 3)
        return GRPCVector(id=vec_id, values=vec_values, metadata=dict_to_proto_struct(vec_metadata) or {})

    if not isinstance(item, Mapping):
        raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)}")
    return _dict_to_grpc_vector(item)
327385

328386
timeout = kwargs.pop('timeout', None)

pinecone/core/utils/constants.py

+3
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,6 @@ class NodeType(str, enum.Enum):
3737
TCP_KEEPINTVL = 60 # Sec
3838
TCP_KEEPIDLE = 300 # Sec
3939
TCP_KEEPCNT = 4
40+
41+
# Keys accepted in a vector dictionary. Stored as frozensets so these shared
# module-level constants cannot be mutated by accident by an importing module.
REQUIRED_VECTOR_FIELDS = frozenset({'id', 'values'})
OPTIONAL_VECTOR_FIELDS = frozenset({'sparse_values', 'metadata'})

pinecone/index.py

+69-10
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
#
22
# Copyright (c) 2020-2021 Pinecone Systems Inc. All right reserved.
33
#
4+
import numbers
5+
46
from tqdm import tqdm
5-
from collections.abc import Iterable
7+
from collections.abc import Iterable, Mapping
68
from typing import Union, List, Tuple, Optional, Dict, Any
79

810
from .core.client.model.sparse_values import SparseValues
@@ -21,6 +23,7 @@
2123
"UpdateRequest", "Vector", "DeleteRequest", "UpdateRequest", "DescribeIndexStatsRequest", "SparseValues"
2224
]
2325

26+
from .core.utils.constants import REQUIRED_VECTOR_FIELDS, OPTIONAL_VECTOR_FIELDS
2427
from .core.utils.error_handling import validate_and_convert_errors
2528

2629
_OPENAPI_ENDPOINT_PARAMS = (
@@ -63,7 +66,7 @@ def __init__(self, index_name: str, pool_threads=1):
6366

6467
@validate_and_convert_errors
6568
def upsert(self,
66-
vectors: Union[List[Vector], List[Tuple]],
69+
vectors: Union[List[Vector], List[tuple], List[dict]],
6770
namespace: Optional[str] = None,
6871
batch_size: Optional[int] = None,
6972
show_progress: bool = True,
@@ -77,18 +80,25 @@ def upsert(self,
7780
To upsert in parallel follow: https://docs.pinecone.io/docs/insert-data#sending-upserts-in-parallel
7881
7982
Examples:
80-
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])])
83+
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}),
84+
('id2', [1.0, 2.0, 3.0]),
85+
])
86+
>>> index.upsert([{'id': 'id1', 'values': [1.0, 2.0, 3.0], 'metadata': {'key': 'value'}},
87+
{'id': 'id2',
88+
'values': [1.0, 2.0, 3.0],
89+
'sparse_values': {'indices': [1, 8], 'values': [0.2, 0.4]}},
90+
])
8191
>>> index.upsert([Vector(id='id1',
82-
>>> values=[1.0, 2.0, 3.0],
83-
>>> metadata={'key': 'value'}),
84-
>>> Vector(id='id2',
85-
>>> values=[1.0, 2.0, 3.0],
86-
>>> sparse_values=SparseValues(indices=[1, 2], values=[0.2, 0.4]))])
92+
values=[1.0, 2.0, 3.0],
93+
metadata={'key': 'value'}),
94+
Vector(id='id2',
95+
values=[1.0, 2.0, 3.0],
96+
sparse_values=SparseValues(indices=[1, 2], values=[0.2, 0.4]))])
8797
8898
Args:
8999
vectors (Union[List[Vector], List[tuple], List[dict]]): A list of vectors to upsert.
90100
91-
A vector can be represented by a 1) Vector object or a 2) tuple.
101+
A vector can be represented by 1) a tuple, 2) a Vector object or 3) a dictionary.
92102
1) if a tuple is used, it must be of the form (id, values, metadata) or (id, values).
93103
where id is a string, vector is a list of floats, metadata is a dict,
94104
and sparse_values is a dict of the form {'indices': List[int], 'values': List[float]}.
@@ -110,6 +120,10 @@ def upsert(self,
110120
111121
Note: the dimension of each vector must match the dimension of the index.
112122
123+
3) if a dictionary is used, it must be in the form
124+
{'id': str, 'values': List[float], 'sparse_values': {'indices': List[int], 'values': List[float]},
125+
'metadata': dict}
126+
113127
namespace (str): The namespace to write to. If not specified, the default namespace is used. [optional]
114128
batch_size (int): The number of vectors to upsert in each batch.
115129
If not specified, all vectors will be upserted in a single batch. [optional]
@@ -151,12 +165,57 @@ def _upsert_batch(self,
151165

152166
args_dict = self._parse_non_empty_args([('namespace', namespace)])
153167

168+
def _dict_to_vector(item):
    """Build a ``Vector`` from a dictionary describing a single vector.

    Args:
        item: Mapping that contains every key in ``REQUIRED_VECTOR_FIELDS`` and,
            optionally, keys from ``OPTIONAL_VECTOR_FIELDS``. When present,
            ``sparse_values`` must itself be a mapping; its ``indices`` and
            ``values`` entries are handed to ``SparseValues``.

    Returns:
        The ``Vector`` constructed from ``item``. A missing or falsy
        ``metadata`` entry is replaced by an empty dict.

    Raises:
        ValueError: If required keys are missing, unknown keys are present, or
            ``sparse_values`` is not a mapping or is rejected by
            ``SparseValues``.
        TypeError: If ``metadata`` is not a mapping, or if ``Vector`` rejects
            the supplied field types.
    """
    item_keys = set(item.keys())
    missing_keys = REQUIRED_VECTOR_FIELDS - item_keys
    if missing_keys:
        raise ValueError(
            f"Vector dictionary is missing required fields: {list(missing_keys)}")

    allowed_keys = REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS
    excessive_keys = item_keys - allowed_keys
    if excessive_keys:
        raise ValueError(f"Found excess keys in the vector dictionary: {list(excessive_keys)}. "
                         f"The allowed keys are: {list(allowed_keys)}")

    sparse_values = None
    if 'sparse_values' in item:
        raw_sparse = item['sparse_values']
        if not isinstance(raw_sparse, Mapping):
            raise ValueError(
                f"Column `sparse_values` is expected to be a dictionary, found {type(raw_sparse)}")
        try:
            sparse_values = SparseValues(indices=raw_sparse.get('indices', None),
                                         values=raw_sparse.get('values', None))
        except TypeError as e:
            raise ValueError("Found unexpected data in column `sparse_values`. "
                             "Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`."
                             ) from e

    metadata = item.get('metadata') or {}
    if not isinstance(metadata, Mapping):
        raise TypeError(f"Column `metadata` is expected to be a dictionary, found {type(metadata)}")

    try:
        # NOTE(review): the tuple branch of `_vector_transform` forwards `_check_type` to
        # Vector(...), this path does not - confirm whether that difference is intended.
        return Vector(id=item['id'], values=item['values'], sparse_values=sparse_values, metadata=metadata)

    except TypeError as e:
        dense_values = item['values']
        # Inspect the elements by iteration rather than `dense_values[0]`: indexing raises
        # IndexError on an empty list (or TypeError/KeyError on non-sequence iterables) and
        # would hide the original error instead of explaining it.
        if not isinstance(dense_values, Iterable) or \
                not all(isinstance(value, numbers.Real) for value in dense_values):
            raise TypeError("Column `values` is expected to be a list of floats") from e
        raise
206+
154207
def _vector_transform(item: Union[Vector, Tuple]):
    """Convert one user-supplied vector into a ``Vector``.

    Accepts a ``Vector`` (returned as-is), a tuple of at most three elements,
    or a mapping (delegated to ``_dict_to_vector``). ``_check_type`` is a free
    variable taken from the enclosing scope and is forwarded only on the
    tuple path.

    Raises:
        ValueError: If a tuple has more than three elements, or if ``item`` is
            none of the supported types.
    """
    # Already in the target representation: nothing to convert.
    if isinstance(item, Vector):
        return item

    if isinstance(item, tuple):
        tuple_len = len(item)
        if tuple_len > 3:
            raise ValueError(f"Found a tuple of length {tuple_len} which is not supported. "
                             f"Vectors can be represented as tuples either the form (id, values, metadata) or (id, values). "
                             f"To pass sparse values please use either dicts or a Vector objects as inputs.")
        # presumably fix_tuple_length pads a shorter tuple up to 3 entries - confirm against the helper
        vec_id, vec_values, vec_metadata = fix_tuple_length(item, 3)
        return Vector(id=vec_id, values=vec_values, metadata=vec_metadata or {}, _check_type=_check_type)

    if not isinstance(item, Mapping):
        raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)}")
    return _dict_to_vector(item)
161220

162221
return self._vector_api.upsert(

0 commit comments

Comments
 (0)