Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add document for Docset readers and writers. #196

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/APIs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,6 @@ API References

data/data.rst
transforms/transforms.rst
readers/readers.rst
writers/writers.rst
functions/functions.rst
7 changes: 7 additions & 0 deletions docs/source/APIs/readers/readers.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.. _Ref-Readers:

Readers
===========

.. automodule:: sycamore.scans.reader
:members:
7 changes: 7 additions & 0 deletions docs/source/APIs/writers/writers.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.. _Ref-Writers:

Writers
===========

.. automodule:: sycamore.writers.writer
:members:
2 changes: 1 addition & 1 deletion sycamore/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def __init__(self, ray_args: Optional[dict[str, Any]] = None):

@property
def read(self):
from sycamore.reader import DocSetReader
from sycamore.scans.reader import DocSetReader

return DocSetReader(self)

Expand Down
2 changes: 1 addition & 1 deletion sycamore/docset.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from sycamore.transforms.summarize import Summarizer
from sycamore.transforms.extract_table import TableExtractor
from sycamore.transforms.merge_elements import ElementMerger
from sycamore.writer import DocSetWriter
from sycamore.writers.writer import DocSetWriter

logger = logging.getLogger(__name__)

Expand Down
5 changes: 2 additions & 3 deletions sycamore/scans/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from sycamore.scans.file_scan import BinaryScan, FileScan, JsonScan
from sycamore.scans.materialized_scan import ArrowScan, DocScan, MaterializedScan, PandasScan
from sycamore.scans.reader import DocSetReader

__all__ = ["ArrowScan", "BinaryScan", "DocScan", "FileScan", "JsonScan", "MaterializedScan", "PandasScan"]
__all__ = ["DocSetReader"]
15 changes: 7 additions & 8 deletions sycamore/scans/file_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ def _parse_s3_path(s3_path: str) -> Tuple[str, str]:


class FileScan(Scan):
"""A base scan class for file based data"""
"""
The base scan class for file-based data.

This class should not be used directly, and should instead be subclassed
and tailored to particular file formats.
"""

def __init__(
self,
Expand All @@ -87,13 +92,7 @@ def _is_s3_scheme(self) -> bool:


class BinaryScan(FileScan):
"""Scan data file into raw bytes

For each file, BinaryScan creates one Document in the form of
{"doc_id": uuid,
"content": {"binary": xxx, "text": None},
"properties": {"path": xxx}}.

"""
Note: if you specify filter_paths_by_extension = False, you need to make sure
all the files that are scanned can be processed by the pipeline. Many pipelines
include file-type specific steps.
Expand Down
5 changes: 2 additions & 3 deletions sycamore/scans/materialized_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@


class MaterializedScan(Scan):
"""A base scan class for materialized data
e.g. arrow table, pandas dataframe, python dict list or even spark
dataset
"""
A base scan class for materialized data
"""

def __init__(self, **resource_args):
Expand Down
62 changes: 60 additions & 2 deletions sycamore/reader.py → sycamore/scans/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,17 @@

from sycamore import Context, DocSet
from sycamore.data import Document
from sycamore.scans import ArrowScan, BinaryScan, DocScan, PandasScan, JsonScan
from sycamore.scans.file_scan import FileMetadataProvider
from sycamore.scans.materialized_scan import ArrowScan, DocScan, PandasScan
from sycamore.scans.file_scan import BinaryScan, FileMetadataProvider, JsonScan


class DocSetReader:
"""
Read data from different kinds of sources into a DocSet.

DocSetReader is exposed through the Sycamore context read API.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sycamore should be capitalized

"""

def __init__(self, context: Context):
self._context = context

Expand All @@ -23,6 +29,18 @@ def binary(
metadata_provider: Optional[FileMetadataProvider] = None,
**resource_args
) -> DocSet:
"""
Scan data files into raw bytes.

For each file, BinaryScan creates one Document. Use BinaryScan to process
unstructured data formats such as PDF or HTML.

Examples:
>>> import sycamore
>>> context = sycamore.init()
>>> docset = context.read.binary("s3://bucket/prefix", "pdf")
"""
scan = BinaryScan(
paths,
binary_format=binary_format,
Expand Down Expand Up @@ -61,6 +79,16 @@ def json(
document_body_field: Optional[str] = None,
**resource_args
) -> DocSet:
"""
Scan JSON or JSONL data files into a DocSet.

Each JSON file is currently handled by reading it as binary and then parsing it into Document objects.

Examples:
>>> import sycamore
>>> context = sycamore.init()
>>> docset = context.read.json("s3://bucket/prefix")
"""
json_scan = JsonScan(
paths,
properties=properties,
Expand All @@ -71,13 +99,43 @@ def json(
return DocSet(self._context, json_scan)

def arrow(self, tables: Union[Table, bytes, list[Union[Table, bytes]]]) -> DocSet:
"""
Scan Arrow data into a DocSet
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Arrow should be capitalized


Examples:
>>> import sycamore
>>> import pyarrow as pa
>>> context = sycamore.init()
>>> table = pa.table({"x": [1]})
>>> docset = context.read.arrow(table)
"""
scan = ArrowScan(tables)
return DocSet(self._context, scan)

def document(self, docs: list[Document]) -> DocSet:
"""
Scan a list of Documents into a DocSet

Examples:
>>> import sycamore
>>> from sycamore.data import Document
>>> context = sycamore.init()
>>> documents = [Document()]
>>> docset = context.read.document(documents)
"""
scan = DocScan(docs)
return DocSet(self._context, scan)

def pandas(self, dfs: Union[DataFrame, list[DataFrame]]) -> DocSet:
"""
Scan a pandas DataFrame or a list of DataFrames into a DocSet

Examples:
>>> import sycamore
>>> from pandas import DataFrame
>>> context = sycamore.init()
>>> df = DataFrame({"doc_id": [1], "type": ["hello, world!"]})
>>> docset = context.read.pandas(df)
"""
scan = PandasScan(dfs)
return DocSet(self._context, scan)
2 changes: 1 addition & 1 deletion sycamore/tests/unit/scans/test_file_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from sycamore.data import Document
from sycamore.scans.file_scan import JsonManifestMetadataProvider
from sycamore.scans import BinaryScan, JsonScan
from sycamore.scans.file_scan import BinaryScan, JsonScan
from sycamore.tests.config import TEST_DIR


Expand Down
2 changes: 1 addition & 1 deletion sycamore/tests/unit/scans/test_materialized_scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from pandas import DataFrame
from pyarrow import Table

from sycamore.scans import ArrowScan, DocScan, PandasScan
from sycamore.scans.materialized_scan import ArrowScan, DocScan, PandasScan
from sycamore.data import Document


Expand Down
2 changes: 1 addition & 1 deletion sycamore/tests/unit/test_docset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sycamore import DocSet, Context
from sycamore.data import Document
from sycamore.plan_nodes import Node
from sycamore.scans import BinaryScan
from sycamore.scans.file_scan import BinaryScan
from sycamore.transforms import (
Embedder,
Embed,
Expand Down
4 changes: 2 additions & 2 deletions sycamore/tests/unit/test_rewriter.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from sycamore.rules import EnforceResourceUsage
from sycamore.scans import BinaryScan
from sycamore.scans.file_scan import BinaryScan
from sycamore.transforms import Partition, Explode
from sycamore.transforms.partition import UnstructuredPdfPartitioner
from sycamore.writers import OpenSearchWriter
from sycamore.writers.opensearch import OpenSearchWriter


class TestRewriter:
Expand Down
2 changes: 1 addition & 1 deletion sycamore/tests/unit/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sycamore
from sycamore.data import Document, Element
from sycamore.plan_nodes import Node
from sycamore.writers import OpenSearchWriter
from sycamore.writers.opensearch import OpenSearchWriter

import json
from pathlib import Path
Expand Down
2 changes: 1 addition & 1 deletion sycamore/tests/unit/transforms/test_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
UnstructuredPdfPartitioner,
UnstructuredPPTXPartitioner,
)
from sycamore.scans import BinaryScan
from sycamore.scans.file_scan import BinaryScan
from sycamore.tests.config import TEST_DIR


Expand Down
4 changes: 2 additions & 2 deletions sycamore/writers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from sycamore.writers.opensearch import OpenSearchWriter
from sycamore.writers.writer import DocSetWriter

__all__ = ["OpenSearchWriter"]
__all__ = ["DocSetWriter"]
19 changes: 18 additions & 1 deletion sycamore/writer.py → sycamore/writers/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,25 @@


class DocSetWriter:
"""
Write a DocSet to different targets.
"""

def __init__(self, context: Context, plan: Node):
self.context = context
self.plan = plan

def opensearch(
self, *, os_client_args: dict, index_name: str, index_settings: Optional[dict] = None, **resource_args
) -> None:
from sycamore.writers import OpenSearchWriter
"""Write a DocSet into OpenSearch
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DocSet caps? OpenSearch caps

Args:
os_client_args: OpenSearch client arguments, such as host address and port number.
index_name: OpenSearch index name
index_settings: index settings, such as the number of shards and the index mapping
resource_args: Additional resource-related arguments that can be passed
"""
from sycamore.writers.opensearch import OpenSearchWriter

os = OpenSearchWriter(
self.plan, index_name, os_client_args=os_client_args, index_settings=index_settings, **resource_args
Expand All @@ -42,6 +53,12 @@ def files(
Defaults to using text_representation if available, or binary_representation
if not.
resource_args: Arguments to pass to the underlying execution environment.

Examples:
>>> import sycamore
>>> context = sycamore.init()
>>> docset = context.read.json("s3://bucket/prefix1")
>>> docset.write.files("s3://bucket/prefix2")
"""
file_writer = FileWriter(
self.plan,
Expand Down
Loading