Skip to content

Commit

Permalink
community[minor]: Add support for non-file-based Document Loaders in …
Browse files Browse the repository at this point in the history
…PebbloSafeLoader (langchain-ai#19574)

**Description:**
PebbloSafeLoader: Add support for non-file-based Document Loaders

This pull request enhances PebbloSafeLoader by introducing support for
several non-file-based Document Loaders. With this update,
PebbloSafeLoader now seamlessly integrates with the following loaders:
- GoogleDriveLoader
- SlackDirectoryLoader
- UnstructuredEmailLoader

**Issue:** NA
**Dependencies:** None
**Twitter handle:** @Raj__725

---------

Co-authored-by: Rahul Tripathi <[email protected]>
  • Loading branch information
Raj725 and Rahul Tripathi authored Mar 27, 2024
1 parent 9954c6a commit 0019d8a
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ def _send_loader_doc(self, loading_end: bool = False) -> None:
doc_content = [doc.dict() for doc in self.docs]
docs = []
for doc in doc_content:
doc_source_path = get_full_path(doc.get("metadata", {}).get("source"))
doc_source_path = get_full_path(
doc.get("metadata", {}).get("source", self.source_path)
)
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
doc_source_path
)
Expand Down
36 changes: 32 additions & 4 deletions libs/community/langchain_community/utilities/pebblo.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,28 @@
"AmazonTextractPDFLoader",
"CSVLoader",
"UnstructuredExcelLoader",
"UnstructuredEmailLoader",
]
dir_loader = ["DirectoryLoader", "S3DirLoader", "PyPDFDirectoryLoader"]
dir_loader = [
"DirectoryLoader",
"S3DirLoader",
"SlackDirectoryLoader",
"PyPDFDirectoryLoader",
"NotionDirectoryLoader",
]

in_memory = ["DataFrameLoader"]
remote_db = [
"NotionDBLoader",
"GoogleDriveLoader",
]

LOADER_TYPE_MAPPING = {"file": file_loader, "dir": dir_loader, "in-memory": in_memory}
LOADER_TYPE_MAPPING = {
"file": file_loader,
"dir": dir_loader,
"in-memory": in_memory,
"remote_db": remote_db,
}

SUPPORTED_LOADERS = (*file_loader, *dir_loader, *in_memory)

Expand Down Expand Up @@ -159,7 +176,7 @@ def get_loader_type(loader: str) -> str:
for loader_type, loaders in LOADER_TYPE_MAPPING.items():
if loader in loaders:
return loader_type
return "unknown"
return "unsupported"


def get_loader_full_path(loader: BaseLoader) -> str:
Expand All @@ -172,6 +189,7 @@ def get_loader_full_path(loader: BaseLoader) -> str:
from langchain_community.document_loaders import (
DataFrameLoader,
GCSFileLoader,
NotionDBLoader,
S3FileLoader,
)

Expand All @@ -188,15 +206,25 @@ def get_loader_full_path(loader: BaseLoader) -> str:
location = f"gc://{loader.bucket}/{loader.blob}"
elif isinstance(loader, S3FileLoader):
location = f"s3://{loader.bucket}/{loader.key}"
elif "source" in loader_dict:
location = loader_dict["source"]
if location and "channel" in loader_dict:
channel = loader_dict["channel"]
if channel:
location = f"{location}/{channel}"
elif "path" in loader_dict:
location = loader_dict["path"]
elif "file_path" in loader_dict:
location = loader_dict["file_path"]
elif "web_paths" in loader_dict:
location = loader_dict["web_paths"][0]
web_paths = loader_dict["web_paths"]
if web_paths and isinstance(web_paths, list) and len(web_paths) > 0:
location = web_paths[0]
# For in-memory types:
elif isinstance(loader, DataFrameLoader):
location = "in-memory"
elif isinstance(loader, NotionDBLoader):
location = f"notiondb://{loader.database_id}"
except Exception:
pass
return get_full_path(str(location))
Expand Down

0 comments on commit 0019d8a

Please sign in to comment.