|
5 | 5 | import numbers
|
6 | 6 | from abc import ABC, abstractmethod
|
7 | 7 | from functools import wraps
|
| 8 | +from importlib.util import find_spec |
8 | 9 | from typing import NamedTuple, Optional, Dict, Iterable, Union, List, Tuple, Any
|
9 | 10 | from collections.abc import Mapping
|
10 | 11 |
|
11 | 12 | import certifi
|
12 | 13 | import grpc
|
13 | 14 | from google.protobuf import json_format
|
14 | 15 | from grpc._channel import _InactiveRpcError, _MultiThreadedRendezvous
|
15 |
| -from tqdm import tqdm |
| 16 | +from tqdm.autonotebook import tqdm |
16 | 17 | import json
|
17 | 18 |
|
18 | 19 | from pinecone import FetchResponse, QueryResponse, ScoredVector, SingleQueryResults, DescribeIndexStatsResponse
|
@@ -449,6 +450,52 @@ def _upsert_batch(self,
|
449 | 450 | request = UpsertRequest(vectors=vectors, **args_dict)
|
450 | 451 | return self._wrap_grpc_call(self.stub.Upsert, request, timeout=timeout, **kwargs)
|
451 | 452 |
|
| 453 | + def upsert_from_dataframe(self, |
| 454 | + df, |
| 455 | + namespace: str = None, |
| 456 | + batch_size: int = 500, |
| 457 | + use_async_requests: bool = True, |
| 458 | + show_progress: bool = True) -> None: |
| 459 | + """Upserts a dataframe into the index. |
| 460 | +
|
| 461 | + Args: |
| 462 | + df: A pandas dataframe with the following columns: id, vector, and metadata. |
| 463 | + namespace: The namespace to upsert into. |
| 464 | + batch_size: The number of rows to upsert in a single batch. |
| 465 | + use_async_requests: Whether to upsert multiple requests at the same time using asynchronous request mechanism. |
| 466 | + Set to `False` |
| 467 | + show_progress: Whether to show a progress bar. |
| 468 | + """ |
| 469 | + try: |
| 470 | + import pandas as pd |
| 471 | + except ImportError: |
| 472 | + raise RuntimeError("The `pandas` package is not installed. Please install pandas to use `upsert_from_dataframe()`") |
| 473 | + |
| 474 | + if not isinstance(df, pd.DataFrame): |
| 475 | + raise ValueError(f"Only pandas dataframes are supported. Found: {type(df)}") |
| 476 | + |
| 477 | + pbar = tqdm(total=len(df), disable=not show_progress, desc="sending upsert requests") |
| 478 | + results = [] |
| 479 | + for chunk in self._iter_dataframe(df, batch_size=batch_size): |
| 480 | + res = self.upsert(vectors=chunk, namespace=namespace, async_req=use_async_requests) |
| 481 | + pbar.update(len(chunk)) |
| 482 | + results.append(res) |
| 483 | + |
| 484 | + if use_async_requests: |
| 485 | + results = [async_result.result() for async_result in tqdm(results, desc="collecting async responses")] |
| 486 | + |
| 487 | + upserted_count = 0 |
| 488 | + for res in results: |
| 489 | + upserted_count += res.upserted_count |
| 490 | + |
| 491 | + return UpsertResponse(upserted_count=upserted_count) |
| 492 | + |
| 493 | + @staticmethod |
| 494 | + def _iter_dataframe(df, batch_size): |
| 495 | + for i in range(0, len(df), batch_size): |
| 496 | + batch = df.iloc[i:i + batch_size].to_dict(orient="records") |
| 497 | + yield batch |
| 498 | + |
452 | 499 | def delete(self,
|
453 | 500 | ids: Optional[List[str]] = None,
|
454 | 501 | delete_all: Optional[bool] = None,
|
|
0 commit comments