2
2
# Copyright (c) 2020-2021 Pinecone Systems Inc. All rights reserved.
3
3
#
4
4
import logging
5
+ import numbers
5
6
from abc import ABC , abstractmethod
6
7
from functools import wraps
7
8
from typing import NamedTuple , Optional , Dict , Iterable , Union , List , Tuple , Any
9
+ from collections .abc import Mapping
8
10
9
11
import certifi
10
12
import grpc
24
26
from pinecone .core .grpc .protos .vector_service_pb2_grpc import VectorServiceStub
25
27
from pinecone .core .grpc .retry import RetryOnRpcErrorClientInterceptor , RetryConfig
26
28
from pinecone .core .utils import _generate_request_id , dict_to_proto_struct , fix_tuple_length
27
- from pinecone .core .utils .constants import MAX_MSG_SIZE , REQUEST_ID , CLIENT_VERSION
29
+ from pinecone .core .utils .constants import MAX_MSG_SIZE , REQUEST_ID , CLIENT_VERSION , REQUIRED_VECTOR_FIELDS , \
30
+ OPTIONAL_VECTOR_FIELDS
28
31
from pinecone .exceptions import PineconeException
29
32
30
33
__all__ = ["GRPCIndex" , "GRPCVector" , "GRPCQueryVector" , "GRPCSparseValues" ]
@@ -263,7 +266,7 @@ def stub_class(self):
263
266
return VectorServiceStub
264
267
265
268
def upsert (self ,
266
- vectors : Union [List [GRPCVector ], List [Tuple ]],
269
+ vectors : Union [List [GRPCVector ], List [tuple ], List [ dict ]],
267
270
async_req : bool = False ,
268
271
namespace : Optional [str ] = None ,
269
272
batch_size : Optional [int ] = None ,
@@ -274,18 +277,25 @@ def upsert(self,
274
277
If a new value is upserted for an existing vector id, it will overwrite the previous value.
275
278
276
279
Examples:
277
- >>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])],
278
- >>> namespace='ns1', async_req=True)
280
+ >>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}),
281
+ ('id2', [1.0, 2.0, 3.0])
282
+ ],
283
+ namespace='ns1', async_req=True)
284
+ >>> index.upsert([{'id': 'id1', 'values': [1.0, 2.0, 3.0], 'metadata': {'key': 'value'}},
285
+ {'id': 'id2',
286
+ 'values': [1.0, 2.0, 3.0],
287
+ 'sparse_values': {'indices': [1, 8], 'values': [0.2, 0.4]},
288
+ }])
279
289
>>> index.upsert([GRPCVector(id='id1', values=[1.0, 2.0, 3.0], metadata={'key': 'value'}),
280
- >>> GRPCVector(id='id2', values=[1.0, 2.0, 3.0]),
281
- >>> GRPCVector(id='id3',
282
- >>> values=[1.0, 2.0, 3.0],
283
- >>> sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))])
290
+ GRPCVector(id='id2', values=[1.0, 2.0, 3.0]),
291
+ GRPCVector(id='id3',
292
+ values=[1.0, 2.0, 3.0],
293
+ sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))])
284
294
285
295
Args:
286
296
vectors (Union[List[GRPCVector], List[Tuple], List[dict]]): A list of vectors to upsert.
287
297
288
- A vector can be represented by a 1) GRPCVector object or a 2) tuple.
298
+ A vector can be represented by 1) a tuple, 2) a GRPCVector object or 3) a dictionary.
289
299
1) if a tuple is used, it must be of the form (id, values, metadata) or (id, values).
290
300
where id is a string, values is a list of floats, and metadata is a dict.
291
301
Examples: ('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])
@@ -299,6 +309,10 @@ def upsert(self,
299
309
values=[1.0, 2.0, 3.0],
300
310
sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))
301
311
312
+ 3) if a dictionary is used, it must be in the form
313
+ {'id': str, 'values': List[float], 'sparse_values': {'indices': List[int], 'values': List[float]},
314
+ 'metadata': dict}
315
+
302
316
Note: the dimension of each vector must match the dimension of the index.
303
317
async_req (bool): If True, the upsert operation will be performed asynchronously.
304
318
Cannot be used with batch_size.
@@ -317,12 +331,56 @@ def upsert(self,
317
331
'To upsert in parallel, please follow: '
318
332
'https://docs.pinecone.io/docs/performance-tuning' )
319
333
334
def _dict_to_grpc_vector(item):
    """Convert a vector given as a mapping into a ``GRPCVector``.

    Args:
        item: Mapping describing one vector. It must contain every key in
            ``REQUIRED_VECTOR_FIELDS`` and may additionally contain keys from
            ``OPTIONAL_VECTOR_FIELDS``. This function reads the keys ``id``,
            ``values``, ``sparse_values`` and ``metadata``.

    Returns:
        The ``GRPCVector`` built from the mapping.

    Raises:
        ValueError: if required keys are missing, unknown keys are present,
            ``sparse_values`` is not a mapping, or ``GRPCSparseValues`` rejects
            its ``indices`` / ``values`` with a ``TypeError``.
        TypeError: if ``metadata`` is not a mapping, or ``GRPCVector`` rejects
            one of the fields.
    """
    item_keys = set(item.keys())
    if not item_keys.issuperset(REQUIRED_VECTOR_FIELDS):
        raise ValueError(
            f"Vector dictionary is missing required fields: {list(REQUIRED_VECTOR_FIELDS - item_keys)} ")

    allowed_keys = REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS
    excessive_keys = item_keys - allowed_keys
    if excessive_keys:
        raise ValueError(f"Found excess keys in the vector dictionary: {list(excessive_keys)} . "
                         f"The allowed keys are: {list(allowed_keys)} ")

    sparse_values = None
    if 'sparse_values' in item:
        raw_sparse = item['sparse_values']
        if not isinstance(raw_sparse, Mapping):
            raise ValueError(f"Column `sparse_values` is expected to be a dictionary, found {type(raw_sparse)} ")
        indices = raw_sparse.get('indices', None)
        values = raw_sparse.get('values', None)
        try:
            sparse_values = GRPCSparseValues(indices=indices, values=values)
        except TypeError as e:
            raise ValueError("Found unexpected data in column `sparse_values`. "
                             "Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`."
                             ) from e

    metadata = item.get('metadata', None)
    if metadata is not None and not isinstance(metadata, Mapping):
        raise TypeError(f"Column `metadata` is expected to be a dictionary, found {type(metadata)} ")

    try:
        return GRPCVector(id=item['id'], values=item['values'], sparse_values=sparse_values,
                          metadata=dict_to_proto_struct(metadata))

    except TypeError as e:
        # No need to raise a dedicated error for `id` - protobuf's error message is clear enough.
        # Inspect `values` by iteration rather than by `[0]`: indexing would raise IndexError on an
        # empty list (or TypeError on a non-subscriptable iterable) and hide the original failure.
        dense_values = item['values']
        if not isinstance(dense_values, Iterable) or \
                not all(isinstance(v, numbers.Real) for v in dense_values):
            raise TypeError("Column `values` is expected to be a list of floats") from e
        raise
371
+
320
372
def _vector_transform(item):
    """Normalise one user-supplied vector into a ``GRPCVector``.

    ``GRPCVector`` instances are returned unchanged. Tuples are read as
    ``(id, values)`` or ``(id, values, metadata)``. Mappings are delegated to
    ``_dict_to_grpc_vector``.

    Raises:
        ValueError: for tuples with more than three items and for any input
            that is not a ``GRPCVector``, tuple or mapping.
    """
    if isinstance(item, GRPCVector):
        return item

    if isinstance(item, tuple):
        tuple_size = len(item)
        if tuple_size > 3:
            raise ValueError(f"Found a tuple of length {tuple_size} which is not supported. "
                             f"Vectors can be represented as tuples either the form (id, values, metadata) or (id, values). "
                             f"To pass sparse values please use either dicts or GRPCVector objects as inputs.")
        # Pad the tuple to three slots so the two-item form unpacks as well.
        vector_id, vector_values, vector_metadata = fix_tuple_length(item, 3)
        proto_metadata = dict_to_proto_struct(vector_metadata) or {}
        return GRPCVector(id=vector_id, values=vector_values, metadata=proto_metadata)

    if isinstance(item, Mapping):
        return _dict_to_grpc_vector(item)

    raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)} ")
327
385
328
386
timeout = kwargs .pop ('timeout' , None )
0 commit comments