Skip to content

Commit 8377f43

Browse files
Index.upsert() now accepts List[dict]
Upsert will now accept vectors represented as dicts. This provides a nice way to pass vectors with sparse values, rather than instantiating `Vector()` objects. Also, this will allow us to represent entire datasets in the future as dataframes, where each vector is a dictionary (one row of the dataframe).
1 parent 42ed389 commit 8377f43

File tree

2 files changed

+65
-19
lines changed

2 files changed

+65
-19
lines changed

pinecone/core/grpc/index_grpc.py

+32-9
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from abc import ABC, abstractmethod
66
from functools import wraps
77
from typing import NamedTuple, Optional, Dict, Iterable, Union, List, Tuple, Any
8+
from collections.abc import Mapping
89

910
import certifi
1011
import grpc
@@ -263,7 +264,7 @@ def stub_class(self):
263264
return VectorServiceStub
264265

265266
def upsert(self,
266-
vectors: Union[List[GRPCVector], List[Tuple]],
267+
vectors: Union[List[GRPCVector], List[tuple], List[dict]],
267268
async_req: bool = False,
268269
namespace: Optional[str] = None,
269270
batch_size: Optional[int] = None,
@@ -274,18 +275,25 @@ def upsert(self,
274275
If a new value is upserted for an existing vector id, it will overwrite the previous value.
275276
276277
Examples:
277-
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])],
278-
>>> namespace='ns1', async_req=True)
278+
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}),
279+
('id2', [1.0, 2.0, 3.0])
280+
],
281+
namespace='ns1', async_req=True)
282+
>>> index.upsert([{'id': 'id1', 'values': [1.0, 2.0, 3.0], 'metadata': {'key': 'value'}},
283+
{'id': 'id2',
284+
'values': [1.0, 2.0, 3.0],
285+
'sparse_values': {'indices': [1, 8], 'values': [0.2, 0.4]},
286+
}])
279287
>>> index.upsert([GRPCVector(id='id1', values=[1.0, 2.0, 3.0], metadata={'key': 'value'}),
280-
>>> GRPCVector(id='id2', values=[1.0, 2.0, 3.0]),
281-
>>> GRPCVector(id='id3',
282-
>>> values=[1.0, 2.0, 3.0],
283-
>>> sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))])
288+
GRPCVector(id='id2', values=[1.0, 2.0, 3.0]),
289+
GRPCVector(id='id3',
290+
values=[1.0, 2.0, 3.0],
291+
sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))])
284292
285293
Args:
286294
vectors (Union[List[GRPCVector], List[tuple], List[dict]]): A list of vectors to upsert.
287295
288-
A vector can be represented by a 1) GRPCVector object or a 2) tuple.
296+
A vector can be represented by a 1) GRPCVector object, a 2) tuple or 3) a dictionary
289297
1) if a tuple is used, it must be of the form (id, values, metadata) or (id, values).
290298
where id is a string, vector is a list of floats, and metadata is a dict.
291299
Examples: ('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])
@@ -299,6 +307,10 @@ def upsert(self,
299307
values=[1.0, 2.0, 3.0],
300308
sparse_values=GRPCSparseValues(indices=[1, 2], values=[0.2, 0.4]))
301309
310+
3) if a dictionary is used, it must be in the form
311+
{'id': str, 'values': List[float], 'sparse_values': {'indices': List[int], 'values': List[float]},
312+
'metadata': dict}
313+
302314
Note: the dimension of each vector must match the dimension of the index.
303315
async_req (bool): If True, the upsert operation will be performed asynchronously.
304316
Cannot be used with batch_size.
@@ -320,9 +332,20 @@ def upsert(self,
320332
def _vector_transform(item):
321333
if isinstance(item, GRPCVector):
322334
return item
323-
if isinstance(item, tuple):
335+
elif isinstance(item, tuple):
336+
if len(item) > 3:
337+
raise ValueError(f"Found a tuple of length {len(item)} which is not supported. "
338+
f"Vectors can be represented as tuples either the form (id, values, metadata) or (id, values). "
339+
f"To pass sparse values please use either dicts or a GRPCVector objects as inputs.")
324340
id, values, metadata = fix_tuple_length(item, 3)
325341
return GRPCVector(id=id, values=values, metadata=dict_to_proto_struct(metadata) or {})
342+
elif isinstance(item, Mapping):
343+
sparse_values = None
344+
if 'sparse_values' in item:
345+
indices = item['sparse_values'].get('indices', None)
346+
values = item['sparse_values'].get('values', None)
347+
sparse_values = GRPCSparseValues(indices=indices, values=values)
348+
return GRPCVector(id=item['id'], values=item['values'], sparse_values=sparse_values, metadata=dict_to_proto_struct(item.get('metadata', None)))
326349
raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)}")
327350

328351
timeout = kwargs.pop('timeout', None)

pinecone/index.py

+33-10
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# Copyright (c) 2020-2021 Pinecone Systems Inc. All right reserved.
33
#
44
from tqdm import tqdm
5-
from collections.abc import Iterable
5+
from collections.abc import Iterable, Mapping
66
from typing import Union, List, Tuple, Optional, Dict, Any
77

88
from .core.client.model.sparse_values import SparseValues
@@ -63,7 +63,7 @@ def __init__(self, index_name: str, pool_threads=1):
6363

6464
@validate_and_convert_errors
6565
def upsert(self,
66-
vectors: Union[List[Vector], List[Tuple]],
66+
vectors: Union[List[Vector], List[tuple], List[dict]],
6767
namespace: Optional[str] = None,
6868
batch_size: Optional[int] = None,
6969
show_progress: bool = True,
@@ -77,18 +77,25 @@ def upsert(self,
7777
To upsert in parallel follow: https://docs.pinecone.io/docs/insert-data#sending-upserts-in-parallel
7878
7979
Examples:
80-
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}), ('id2', [1.0, 2.0, 3.0])])
80+
>>> index.upsert([('id1', [1.0, 2.0, 3.0], {'key': 'value'}),
81+
('id2', [1.0, 2.0, 3.0]),
82+
])
83+
>>> index.upsert([{'id': 'id1', 'values': [1.0, 2.0, 3.0], 'metadata': {'key': 'value'}},
84+
{'id': 'id2',
85+
'values': [1.0, 2.0, 3.0],
86+
'sparse_values': {'indices': [1, 8], 'values': [0.2, 0.4]},
87+
}])
8188
>>> index.upsert([Vector(id='id1',
82-
>>> values=[1.0, 2.0, 3.0],
83-
>>> metadata={'key': 'value'}),
84-
>>> Vector(id='id2',
85-
>>> values=[1.0, 2.0, 3.0],
86-
>>> sparse_values=SparseValues(indices=[1, 2], values=[0.2, 0.4]))])
89+
values=[1.0, 2.0, 3.0],
90+
metadata={'key': 'value'}),
91+
Vector(id='id2',
92+
values=[1.0, 2.0, 3.0],
93+
sparse_values=SparseValues(indices=[1, 2], values=[0.2, 0.4]))])
8794
8895
Args:
8996
vectors (Union[List[Vector], List[tuple], List[dict]]): A list of vectors to upsert.
9097
91-
A vector can be represented by a 1) Vector object or a 2) tuple.
98+
A vector can be represented by a 1) Vector object, a 2) tuple or 3) a dictionary
9299
1) if a tuple is used, it must be of the form (id, values, metadata) or (id, values).
93100
where id is a string, vector is a list of floats, metadata is a dict,
94101
and sparse_values is a dict of the form {'indices': List[int], 'values': List[float]}.
@@ -110,6 +117,10 @@ def upsert(self,
110117
111118
Note: the dimension of each vector must match the dimension of the index.
112119
120+
3) if a dictionary is used, it must be in the form
121+
{'id': str, 'values': List[float], 'sparse_values': {'indices': List[int], 'values': List[float]},
122+
'metadata': dict}
123+
113124
namespace (str): The namespace to write to. If not specified, the default namespace is used. [optional]
114125
batch_size (int): The number of vectors to upsert in each batch.
115126
If not specified, all vectors will be upserted in a single batch. [optional]
@@ -154,9 +165,21 @@ def _upsert_batch(self,
154165
def _vector_transform(item: Union[Vector, Tuple]):
155166
if isinstance(item, Vector):
156167
return item
157-
if isinstance(item, tuple):
168+
elif isinstance(item, tuple):
169+
if len(item) > 3:
170+
raise ValueError(f"Found a tuple of length {len(item)} which is not supported. "
171+
f"Vectors can be represented as tuples either the form (id, values, metadata) or (id, values). "
172+
f"To pass sparse values please use either dicts or a Vector objects as inputs.")
158173
id, values, metadata = fix_tuple_length(item, 3)
159174
return Vector(id=id, values=values, metadata=metadata or {}, _check_type=_check_type)
175+
elif isinstance(item, Mapping):
176+
sparse_values = None
177+
if 'sparse_values' in item:
178+
indices = item['sparse_values'].get('indices', [])
179+
values = item['sparse_values'].get('values', [])
180+
sparse_values = SparseValues(indices=indices, values=values)
181+
return Vector(id=item['id'], values=item['values'], sparse_values=sparse_values, metadata=item.get('metadata', {}), _check_type=_check_type)
182+
160183
raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)}")
161184

162185
return self._vector_api.upsert(

0 commit comments

Comments
 (0)