import numbers
from abc import ABC, abstractmethod
from functools import wraps
+from importlib.util import find_spec
from typing import NamedTuple, Optional, Dict, Iterable, Union, List, Tuple, Any
from collections.abc import Mapping

import certifi
import grpc
from google.protobuf import json_format
from grpc._channel import _InactiveRpcError, _MultiThreadedRendezvous
-from tqdm import tqdm
+from tqdm.autonotebook import tqdm
+import json

from pinecone import FetchResponse, QueryResponse, ScoredVector, SingleQueryResults, DescribeIndexStatsResponse
from pinecone.config import Config
@@ -84,9 +86,38 @@ def __init__(self, index_name: str, channel=None, grpc_config: GRPCClientConfig
            "client-version": CLIENT_VERSION
        }
        self._endpoint_override = _endpoint_override
+
+        self.method_config = json.dumps(
+            {
+                "methodConfig": [
+                    {
+                        "name": [{"service": "VectorService.Upsert"}],
+                        "retryPolicy": {
+                            "maxAttempts": 5,
+                            "initialBackoff": "0.1s",
+                            "maxBackoff": "1s",
+                            "backoffMultiplier": 2,
+                            "retryableStatusCodes": ["UNAVAILABLE"],
+                        },
+                    },
+                    {
+                        "name": [{"service": "VectorService"}],
+                        "retryPolicy": {
+                            "maxAttempts": 5,
+                            "initialBackoff": "0.1s",
+                            "maxBackoff": "1s",
+                            "backoffMultiplier": 2,
+                            "retryableStatusCodes": ["UNAVAILABLE"],
+                        },
+                    }
+                ]
+            }
+        )
+
        self._channel = channel or self._gen_channel()
        self.stub = self.stub_class(self._channel)

+
    @property
    @abstractmethod
    def stub_class(self):
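For readers unfamiliar with gRPC's built-in retry support, here is a minimal sketch (not part of this diff) of how a service config with this shape takes effect once it is passed as a channel option alongside grpc.enable_retries; the target address is a placeholder.

import json
import grpc

# Same retry shape as the methodConfig above: at most 5 attempts, exponential
# backoff from 0.1s up to 1s, retrying only on UNAVAILABLE.
service_config = json.dumps({
    "methodConfig": [
        {
            "name": [{"service": "VectorService.Upsert"}],
            "retryPolicy": {
                "maxAttempts": 5,
                "initialBackoff": "0.1s",
                "maxBackoff": "1s",
                "backoffMultiplier": 2,
                "retryableStatusCodes": ["UNAVAILABLE"],
            },
        }
    ]
})

# Attach the config as channel options; "localhost:50051" is a placeholder target.
channel = grpc.insecure_channel(
    "localhost:50051",
    options=[
        ("grpc.service_config", service_config),
        ("grpc.enable_retries", 1),
    ],
)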
@@ -100,7 +131,9 @@ def _gen_channel(self, options=None):
        target = self._endpoint()
        default_options = {
            "grpc.max_send_message_length": MAX_MSG_SIZE,
-            "grpc.max_receive_message_length": MAX_MSG_SIZE
+            "grpc.max_receive_message_length": MAX_MSG_SIZE,
+            "grpc.service_config": self.method_config,
+            "grpc.enable_retries": True
        }
        if self.grpc_client_config.secure:
            default_options['grpc.ssl_target_name_override'] = target.split(':')[0]
@@ -114,8 +147,8 @@ def _gen_channel(self, options=None):
            root_cas = open(certifi.where(), "rb").read()
            tls = grpc.ssl_channel_credentials(root_certificates=root_cas)
            channel = grpc.secure_channel(target, tls, options=_options)
-        interceptor = RetryOnRpcErrorClientInterceptor(self.retry_config)
-        return grpc.intercept_channel(channel, interceptor)
+
+        return channel

    @property
    def channel(self):
@@ -246,7 +279,7 @@ def add_done_callback(self, fun):

    def result(self, timeout=None):
        try:
-            self._delegate.result(timeout=timeout)
+            return self._delegate.result(timeout=timeout)
        except _MultiThreadedRendezvous as e:
            raise PineconeException(e._state.debug_error_string) from e
@@ -417,6 +450,52 @@ def _upsert_batch(self,
        request = UpsertRequest(vectors=vectors, **args_dict)
        return self._wrap_grpc_call(self.stub.Upsert, request, timeout=timeout, **kwargs)

+    def upsert_from_dataframe(self,
+                              df,
+                              namespace: str = None,
+                              batch_size: int = 500,
+                              use_async_requests: bool = True,
+                              show_progress: bool = True) -> UpsertResponse:
+        """Upserts a dataframe into the index.
+
+        Args:
+            df: A pandas dataframe with the following columns: id, vector, and metadata.
+            namespace: The namespace to upsert into.
+            batch_size: The number of rows to upsert in a single batch.
+            use_async_requests: Whether to upsert multiple requests at the same time using asynchronous request mechanism.
+                Set to `False` to send requests sequentially.
+            show_progress: Whether to show a progress bar.
+        """
+        try:
+            import pandas as pd
+        except ImportError:
+            raise RuntimeError("The `pandas` package is not installed. Please install pandas to use `upsert_from_dataframe()`")
+
+        if not isinstance(df, pd.DataFrame):
+            raise ValueError(f"Only pandas dataframes are supported. Found: {type(df)}")
+
+        pbar = tqdm(total=len(df), disable=not show_progress, desc="sending upsert requests")
+        results = []
+        for chunk in self._iter_dataframe(df, batch_size=batch_size):
+            res = self.upsert(vectors=chunk, namespace=namespace, async_req=use_async_requests)
+            pbar.update(len(chunk))
+            results.append(res)
+
+        if use_async_requests:
+            results = [async_result.result() for async_result in tqdm(results, desc="collecting async responses")]
+
+        upserted_count = 0
+        for res in results:
+            upserted_count += res.upserted_count
+
+        return UpsertResponse(upserted_count=upserted_count)
+
+    @staticmethod
+    def _iter_dataframe(df, batch_size):
+        for i in range(0, len(df), batch_size):
+            batch = df.iloc[i:i + batch_size].to_dict(orient="records")
+            yield batch
+
    def delete(self,
               ids: Optional[List[str]] = None,
               delete_all: Optional[bool] = None,
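A possible usage sketch for the new upsert_from_dataframe helper, assuming the client is initialized in the usual way for this library version; the API key, environment, index name, and column values are placeholders, and the dictionary records are assumed to use the id/values/metadata keys accepted by upsert.

import pandas as pd
import pinecone

pinecone.init(api_key="YOUR_API_KEY", environment="us-west1-gcp")  # placeholder credentials
index = pinecone.GRPCIndex("example-index")  # hypothetical index name

# Each row becomes one record passed to upsert(); the column names here assume
# the dict format accepted by upsert (id, values, optional metadata).
df = pd.DataFrame({
    "id": ["vec-1", "vec-2"],
    "values": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
    "metadata": [{"genre": "drama"}, {"genre": "comedy"}],
})

response = index.upsert_from_dataframe(df, namespace="example-namespace", batch_size=500)
print(response.upserted_count)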