
Commit 5520462

Upsert - added many validations to upsert with dicts
Since upsert([dicts]) is going to be the basis for upsert_from_dataframe(df), we need to give much more explicit errors when the user passes a malformed DataFrame.
1 parent 8377f43 commit 5520462
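
For context, a minimal sketch of what the dict-based upsert looks like from the caller's side. The index name, environment and data below are made up, and upsert_from_dataframe(df) is only mentioned as a future plan in the commit message, not part of this commit:

import pinecone

pinecone.init(api_key="YOUR_API_KEY", environment="us-west1-gcp")  # placeholder credentials (assumed)
index = pinecone.Index("example-index")  # hypothetical index

# The dict format this commit validates: 'id' and 'values' are required,
# 'sparse_values' and 'metadata' are optional.
index.upsert(vectors=[
    {"id": "vec1", "values": [0.1, 0.2, 0.3], "metadata": {"genre": "drama"}},
    {"id": "vec2", "values": [0.4, 0.5, 0.6],
     "sparse_values": {"indices": [10, 45], "values": [0.5, 0.5]}},
])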

File tree: 3 files changed (+90 −14 lines)

pinecone/core/grpc/index_grpc.py
pinecone/core/utils/constants.py
pinecone/index.py

pinecone/core/grpc/index_grpc.py  (+43 −7)

@@ -2,6 +2,8 @@
 # Copyright (c) 2020-2021 Pinecone Systems Inc. All right reserved.
 #
 import logging
+import numbers
+import warnings
 from abc import ABC, abstractmethod
 from functools import wraps
 from typing import NamedTuple, Optional, Dict, Iterable, Union, List, Tuple, Any
@@ -25,7 +27,8 @@
 from pinecone.core.grpc.protos.vector_service_pb2_grpc import VectorServiceStub
 from pinecone.core.grpc.retry import RetryOnRpcErrorClientInterceptor, RetryConfig
 from pinecone.core.utils import _generate_request_id, dict_to_proto_struct, fix_tuple_length
-from pinecone.core.utils.constants import MAX_MSG_SIZE, REQUEST_ID, CLIENT_VERSION
+from pinecone.core.utils.constants import MAX_MSG_SIZE, REQUEST_ID, CLIENT_VERSION, REQUIRED_VECTOR_FIELDS, \
+    OPTIONAL_VECTOR_FIELDS
 from pinecone.exceptions import PineconeException

 __all__ = ["GRPCIndex", "GRPCVector", "GRPCQueryVector", "GRPCSparseValues"]
@@ -329,6 +332,44 @@ def upsert(self,
                              'To upsert in parallel, please follow: '
                              'https://docs.pinecone.io/docs/performance-tuning')

+        def _dict_to_grpc_vector(item):
+            item_keys = set(item.keys())
+            if not item_keys.issuperset(REQUIRED_VECTOR_FIELDS):
+                raise ValueError(
+                    f"Vector dictionary is missing required fields: {list(REQUIRED_VECTOR_FIELDS - item_keys)}")
+
+            excessive_keys = item_keys - (REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)
+            if len(excessive_keys) > 0:
+                warnings.warn(f"Found excessive keys in the vector dictionary: {list(excessive_keys)}. "
+                              f"These keys will be ignored. The allowed keys are: {list(REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)}")
+
+            sparse_values = None
+            if 'sparse_values' in item:
+                if not isinstance(item['sparse_values'], Mapping):
+                    raise ValueError(f"Column `sparse_values` is expected to be a dictionary, found {type(item['sparse_values'])}")
+                indices = item['sparse_values'].get('indices', None)
+                values = item['sparse_values'].get('values', None)
+                try:
+                    sparse_values = GRPCSparseValues(indices=indices, values=values)
+                except TypeError as e:
+                    raise ValueError("Found unexpected data in column `sparse_values`. "
+                                     "Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`."
+                                     ) from e
+
+            metadata = item.get('metadata', None)
+            if metadata is not None and not isinstance(metadata, Mapping):
+                raise TypeError(f"Column `metadata` is expected to be a dictionary, found {type(metadata)}")
+
+            try:
+                return GRPCVector(id=item['id'], values=item['values'], sparse_values=sparse_values,
+                                  metadata=dict_to_proto_struct(metadata))
+
+            except TypeError as e:
+                # No need to raise a dedicated error for `id` - protobuf's error message is clear enough
+                if not isinstance(item['values'], Iterable) or not isinstance(item['values'][0], numbers.Real):
+                    raise TypeError(f"Column `values` is expected to be a list of floats")
+                raise
+
         def _vector_transform(item):
             if isinstance(item, GRPCVector):
                 return item
@@ -340,12 +381,7 @@ def _vector_transform(item):
                 id, values, metadata = fix_tuple_length(item, 3)
                 return GRPCVector(id=id, values=values, metadata=dict_to_proto_struct(metadata) or {})
             elif isinstance(item, Mapping):
-                sparse_values = None
-                if 'sparse_values' in item:
-                    indices = item['sparse_values'].get('indices', None)
-                    values = item['sparse_values'].get('values', None)
-                    sparse_values = GRPCSparseValues(indices=indices, values=values)
-                return GRPCVector(id=item['id'], values=item['values'], sparse_values=sparse_values, metadata=dict_to_proto_struct(item.get('metadata', None)))
+                return _dict_to_grpc_vector(item)
             raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)}")

         timeout = kwargs.pop('timeout', None)
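
To make the new checks concrete, here is a hedged sketch of dict shapes and the behaviour each one would now trigger in _dict_to_grpc_vector. The values are illustrative only, and `index` stands for an existing GRPCIndex handle:

ok = {'id': 'vec1', 'values': [0.1, 0.2],
      'sparse_values': {'indices': [1, 5], 'values': [0.5, 0.5]},
      'metadata': {'genre': 'drama'}}                                  # passes all checks

missing_values = {'id': 'vec2'}                                        # ValueError: missing required fields ['values']
unknown_key = {'id': 'vec3', 'values': [0.1], 'vals': [0.2]}           # UserWarning: excessive key 'vals' is ignored
bad_sparse = {'id': 'vec4', 'values': [0.1], 'sparse_values': [1, 2]}  # ValueError: sparse_values must be a dictionary
bad_metadata = {'id': 'vec5', 'values': [0.1], 'metadata': 'drama'}    # TypeError: metadata must be a dictionary
bad_values = {'id': 'vec6', 'values': 'oops'}                          # TypeError: values must be a list of floats

# index.upsert([ok])  # only `ok` would upsert cleanly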

pinecone/core/utils/constants.py  (+3)

@@ -37,3 +37,6 @@ class NodeType(str, enum.Enum):
 TCP_KEEPINTVL = 60  # Sec
 TCP_KEEPIDLE = 300  # Sec
 TCP_KEEPCNT = 4
+
+REQUIRED_VECTOR_FIELDS = {'id', 'values'}
+OPTIONAL_VECTOR_FIELDS = {'sparse_values', 'metadata'}

pinecone/index.py  (+44 −7)

@@ -1,6 +1,9 @@
 #
 # Copyright (c) 2020-2021 Pinecone Systems Inc. All right reserved.
 #
+import numbers
+import warnings
+
 from tqdm import tqdm
 from collections.abc import Iterable, Mapping
 from typing import Union, List, Tuple, Optional, Dict, Any
@@ -21,6 +24,7 @@
     "UpdateRequest", "Vector", "DeleteRequest", "UpdateRequest", "DescribeIndexStatsRequest", "SparseValues"
 ]

+from .core.utils.constants import REQUIRED_VECTOR_FIELDS, OPTIONAL_VECTOR_FIELDS
 from .core.utils.error_handling import validate_and_convert_errors

 _OPENAPI_ENDPOINT_PARAMS = (
@@ -162,6 +166,45 @@ def _upsert_batch(self,

         args_dict = self._parse_non_empty_args([('namespace', namespace)])

+        def _dict_to_vector(item):
+            item_keys = set(item.keys())
+            if not item_keys.issuperset(REQUIRED_VECTOR_FIELDS):
+                raise ValueError(
+                    f"Vector dictionary is missing required fields: {list(REQUIRED_VECTOR_FIELDS - item_keys)}")
+
+            excessive_keys = item_keys - (REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)
+            if len(excessive_keys) > 0:
+                warnings.warn(f"Found excessive keys in the vector dictionary: {list(excessive_keys)}. "
+                              f"These keys will be ignored. The allowed keys are: {list(REQUIRED_VECTOR_FIELDS | OPTIONAL_VECTOR_FIELDS)}")
+
+            sparse_values = None
+            if 'sparse_values' in item:
+                if not isinstance(item['sparse_values'], Mapping):
+                    raise ValueError(
+                        f"Column `sparse_values` is expected to be a dictionary, found {type(item['sparse_values'])}")
+                indices = item['sparse_values'].get('indices', None)
+                values = item['sparse_values'].get('values', None)
+                try:
+                    sparse_values = SparseValues(indices=indices, values=values)
+                except TypeError as e:
+                    raise ValueError("Found unexpected data in column `sparse_values`. "
+                                     "Expected format is `'sparse_values': {'indices': List[int], 'values': List[float]}`."
+                                     ) from e
+
+            metadata = item.get('metadata') or {}
+            if not isinstance(metadata, Mapping):
+                raise TypeError(f"Column `metadata` is expected to be a dictionary, found {type(metadata)}")
+
+            try:
+                return Vector(id=item['id'], values=item['values'], sparse_values=sparse_values, metadata=metadata)
+
+            except TypeError as e:
+                # if not isinstance(item['values'], Iterable) or not isinstance(item['values'][0], numbers.Real):
+                #     raise TypeError(f"Column `values` is expected to be a list of floats")
+                if not isinstance(item['values'], Iterable) or not isinstance(item['values'][0], numbers.Real):
+                    raise TypeError(f"Column `values` is expected to be a list of floats")
+                raise
+
         def _vector_transform(item: Union[Vector, Tuple]):
             if isinstance(item, Vector):
                 return item
@@ -173,13 +216,7 @@ def _vector_transform(item: Union[Vector, Tuple]):
                 id, values, metadata = fix_tuple_length(item, 3)
                 return Vector(id=id, values=values, metadata=metadata or {}, _check_type=_check_type)
             elif isinstance(item, Mapping):
-                sparse_values = None
-                if 'sparse_values' in item:
-                    indices = item['sparse_values'].get('indices', [])
-                    values = item['sparse_values'].get('values', [])
-                    sparse_values = SparseValues(indices=indices, values=values)
-                return Vector(id=item['id'], values=item['values'], sparse_values=sparse_values, metadata=item.get('metadata', {}), _check_type=_check_type)
-
+                return _dict_to_vector(item)
             raise ValueError(f"Invalid vector value passed: cannot interpret type {type(item)}")

         return self._vector_api.upsert(
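
A minimal pytest-style sketch of how the new error paths could be exercised end to end. This assumes an `index` fixture returning an Index bound to a test index; these tests are not part of this commit:

import pytest

def test_upsert_dict_missing_values(index):
    # 'values' is in REQUIRED_VECTOR_FIELDS, so omitting it should fail fast
    with pytest.raises(ValueError):
        index.upsert([{'id': 'v1'}])

def test_upsert_dict_sparse_values_not_a_dict(index):
    # sparse_values must be a mapping with 'indices' and 'values'
    with pytest.raises(ValueError):
        index.upsert([{'id': 'v1', 'values': [0.1], 'sparse_values': [1, 2]}])

def test_upsert_dict_metadata_not_a_dict(index):
    # metadata must be a mapping
    with pytest.raises(TypeError):
        index.upsert([{'id': 'v1', 'values': [0.1], 'metadata': 'not-a-dict'}])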
