From 5797b140ef4cffdd1b7323b8d558f2e46eb7d6d9 Mon Sep 17 00:00:00 2001 From: Bohou Li Date: Mon, 18 Dec 2023 13:36:22 -0800 Subject: [PATCH] Add document for Docset readers and writers. --- docs/source/APIs/index.rst | 2 + docs/source/APIs/readers/readers.rst | 7 +++ docs/source/APIs/writers/writers.rst | 7 +++ sycamore/context.py | 2 +- sycamore/docset.py | 2 +- sycamore/scans/__init__.py | 5 +- sycamore/scans/file_scan.py | 15 +++-- sycamore/scans/materialized_scan.py | 5 +- sycamore/{ => scans}/reader.py | 62 ++++++++++++++++++- sycamore/tests/unit/scans/test_file_scan.py | 2 +- .../unit/scans/test_materialized_scan.py | 2 +- sycamore/tests/unit/test_docset.py | 2 +- sycamore/tests/unit/test_rewriter.py | 4 +- sycamore/tests/unit/test_writer.py | 2 +- .../tests/unit/transforms/test_partition.py | 2 +- sycamore/writers/__init__.py | 4 +- sycamore/{ => writers}/writer.py | 19 +++++- 17 files changed, 116 insertions(+), 28 deletions(-) create mode 100644 docs/source/APIs/readers/readers.rst create mode 100644 docs/source/APIs/writers/writers.rst rename sycamore/{ => scans}/reader.py (56%) rename sycamore/{ => writers}/writer.py (73%) diff --git a/docs/source/APIs/index.rst b/docs/source/APIs/index.rst index ffd417c99..75db156fd 100644 --- a/docs/source/APIs/index.rst +++ b/docs/source/APIs/index.rst @@ -6,4 +6,6 @@ API References data/data.rst transforms/transforms.rst + readers/readers.rst + writers/writers.rst functions/functions.rst diff --git a/docs/source/APIs/readers/readers.rst b/docs/source/APIs/readers/readers.rst new file mode 100644 index 000000000..d2e9c09d3 --- /dev/null +++ b/docs/source/APIs/readers/readers.rst @@ -0,0 +1,7 @@ +.. _Ref-Readers: + +Readers +=========== + +.. automodule:: sycamore.scans.reader + :members: diff --git a/docs/source/APIs/writers/writers.rst b/docs/source/APIs/writers/writers.rst new file mode 100644 index 000000000..9c4460825 --- /dev/null +++ b/docs/source/APIs/writers/writers.rst @@ -0,0 +1,7 @@ +.. 
_Ref-Writers: + +Writers +=========== + +.. automodule:: sycamore.writers.writer + :members: diff --git a/sycamore/context.py b/sycamore/context.py index 0093bd0f8..c962ef8e4 100644 --- a/sycamore/context.py +++ b/sycamore/context.py @@ -22,7 +22,7 @@ def __init__(self, ray_args: Optional[dict[str, Any]] = None): @property def read(self): - from sycamore.reader import DocSetReader + from sycamore.scans.reader import DocSetReader return DocSetReader(self) diff --git a/sycamore/docset.py b/sycamore/docset.py index a1db28efe..1bbd9499d 100644 --- a/sycamore/docset.py +++ b/sycamore/docset.py @@ -14,7 +14,7 @@ from sycamore.transforms.summarize import Summarizer from sycamore.transforms.extract_table import TableExtractor from sycamore.transforms.merge_elements import ElementMerger -from sycamore.writer import DocSetWriter +from sycamore.writers.writer import DocSetWriter logger = logging.getLogger(__name__) diff --git a/sycamore/scans/__init__.py b/sycamore/scans/__init__.py index 810787d08..f84a7ed1b 100644 --- a/sycamore/scans/__init__.py +++ b/sycamore/scans/__init__.py @@ -1,4 +1,3 @@ -from sycamore.scans.file_scan import BinaryScan, FileScan, JsonScan -from sycamore.scans.materialized_scan import ArrowScan, DocScan, MaterializedScan, PandasScan +from sycamore.scans.reader import DocSetReader -__all__ = ["ArrowScan", "BinaryScan", "DocScan", "FileScan", "JsonScan", "MaterializedScan", "PandasScan"] +__all__ = ["DocSetReader"] diff --git a/sycamore/scans/file_scan.py b/sycamore/scans/file_scan.py index ae17166b9..b4d629ed5 100644 --- a/sycamore/scans/file_scan.py +++ b/sycamore/scans/file_scan.py @@ -64,7 +64,12 @@ def _parse_s3_path(s3_path: str) -> Tuple[str, str]: class FileScan(Scan): - """A base scan class for file based data""" + """ + The base scan class for file based data + + This class should not be used directly, and should instead be subclassed + and tailored to particular file formats. 
+ """ def __init__( self, @@ -87,13 +92,7 @@ def _is_s3_scheme(self) -> bool: class BinaryScan(FileScan): - """Scan data file into raw bytes - - For each file, BinaryScan creates one Document in the form of - {"doc_id": uuid, - "content": {"binary": xxx, "text": None}, - "properties": {"path": xxx}}. - + """ Note: if you specify filter_paths_by_extension = False, you need to make sure all the files that are scanned can be processed by the pipeline. Many pipelines include file-type specific steps. diff --git a/sycamore/scans/materialized_scan.py b/sycamore/scans/materialized_scan.py index b318500b8..a2db81f82 100644 --- a/sycamore/scans/materialized_scan.py +++ b/sycamore/scans/materialized_scan.py @@ -9,9 +9,8 @@ class MaterializedScan(Scan): - """A base scan class for materialized data - e.g. arrow table, pandas dataframe, python dict list or even spark - dataset + """ + A base scan class for materialized data """ def __init__(self, **resource_args): diff --git a/sycamore/reader.py b/sycamore/scans/reader.py similarity index 56% rename from sycamore/reader.py rename to sycamore/scans/reader.py index 9e803762e..3e41c02be 100644 --- a/sycamore/reader.py +++ b/sycamore/scans/reader.py @@ -6,11 +6,17 @@ from sycamore import Context, DocSet from sycamore.data import Document -from sycamore.scans import ArrowScan, BinaryScan, DocScan, PandasScan, JsonScan -from sycamore.scans.file_scan import FileMetadataProvider +from sycamore.scans.materialized_scan import ArrowScan, DocScan, PandasScan +from sycamore.scans.file_scan import BinaryScan, FileMetadataProvider, JsonScan class DocSetReader: + """ + Read data from different kinds of sources into DocSet. + + DocSetReader is exposed through sycamore context read API. 
+ """ + def __init__(self, context: Context): self._context = context @@ -23,6 +29,18 @@ def binary( metadata_provider: Optional[FileMetadataProvider] = None, **resource_args ) -> DocSet: + """ + Scan data file into raw bytes + + For each file, BinaryScan creates one Document. Use BinaryScan to process + unstructured data formats such as PDF or HTML. + + Examples: + >>> import sycamore + >>> import pyarrow as pa + >>> context = sycamore.init() + >>> docset = context.read.binary("s3://bucket/prefix", "pdf") + """ scan = BinaryScan( paths, binary_format=binary_format, @@ -61,6 +79,16 @@ def json( document_body_field: Optional[str] = None, **resource_args ) -> DocSet: + """ + Scan JSON or JSONL data file into DocSet + + We currently handle each JSON file by reading binary and then parsing it into Document. + Examples: + >>> import sycamore + >>> import pyarrow as pa + >>> context = sycamore.init() + >>> docset = context.read.json("s3://bucket/prefix") + """ json_scan = JsonScan( paths, properties=properties, @@ -71,13 +99,43 @@ def json( return DocSet(self._context, json_scan) def arrow(self, tables: Union[Table, bytes, list[Union[Table, bytes]]]) -> DocSet: + """ + Scan arrow data into a DocSet + + Examples: + >>> import sycamore + >>> import pyarrow as pa + >>> context = sycamore.init() + >>> table = pa.table({"x": [1]}) + >>> docset = context.read.arrow(table) + """ scan = ArrowScan(tables) return DocSet(self._context, scan) def document(self, docs: list[Document]) -> DocSet: + """ + Scan a list of Documents into a DocSet + + Examples: + >>> import sycamore + >>> from sycamore.data import Document + >>> context = sycamore.init() + >>> documents = [Document()] + >>> docset = context.read.document(documents) + """ scan = DocScan(docs) return DocSet(self._context, scan) def pandas(self, dfs: Union[DataFrame, list[DataFrame]]) -> DocSet: + """ + Scan one or more pandas DataFrames into a DocSet + + Examples: + >>> import sycamore + >>> from pandas import DataFrame + >>> context = 
sycamore.init() + >>> df = DataFrame({"doc_id": [1], "type": ["hello, world!"]}) + >>> docset = context.read.pandas(df) + """ scan = PandasScan(dfs) return DocSet(self._context, scan) diff --git a/sycamore/tests/unit/scans/test_file_scan.py b/sycamore/tests/unit/scans/test_file_scan.py index 1da95408b..bf6a47256 100644 --- a/sycamore/tests/unit/scans/test_file_scan.py +++ b/sycamore/tests/unit/scans/test_file_scan.py @@ -3,7 +3,7 @@ from sycamore.data import Document from sycamore.scans.file_scan import JsonManifestMetadataProvider -from sycamore.scans import BinaryScan, JsonScan +from sycamore.scans.file_scan import BinaryScan, JsonScan from sycamore.tests.config import TEST_DIR diff --git a/sycamore/tests/unit/scans/test_materialized_scan.py b/sycamore/tests/unit/scans/test_materialized_scan.py index 7a9b9c4df..f6e30064f 100644 --- a/sycamore/tests/unit/scans/test_materialized_scan.py +++ b/sycamore/tests/unit/scans/test_materialized_scan.py @@ -2,7 +2,7 @@ from pandas import DataFrame from pyarrow import Table -from sycamore.scans import ArrowScan, DocScan, PandasScan +from sycamore.scans.materialized_scan import ArrowScan, DocScan, PandasScan from sycamore.data import Document diff --git a/sycamore/tests/unit/test_docset.py b/sycamore/tests/unit/test_docset.py index 9a3194cd3..05eac522b 100644 --- a/sycamore/tests/unit/test_docset.py +++ b/sycamore/tests/unit/test_docset.py @@ -6,7 +6,7 @@ from sycamore import DocSet, Context from sycamore.data import Document from sycamore.plan_nodes import Node -from sycamore.scans import BinaryScan +from sycamore.scans.file_scan import BinaryScan from sycamore.transforms import ( Embedder, Embed, diff --git a/sycamore/tests/unit/test_rewriter.py b/sycamore/tests/unit/test_rewriter.py index bef6deb81..f447210a8 100644 --- a/sycamore/tests/unit/test_rewriter.py +++ b/sycamore/tests/unit/test_rewriter.py @@ -1,8 +1,8 @@ from sycamore.rules import EnforceResourceUsage -from sycamore.scans import BinaryScan +from 
sycamore.scans.file_scan import BinaryScan from sycamore.transforms import Partition, Explode from sycamore.transforms.partition import UnstructuredPdfPartitioner -from sycamore.writers import OpenSearchWriter +from sycamore.writers.opensearch import OpenSearchWriter class TestRewriter: diff --git a/sycamore/tests/unit/test_writer.py b/sycamore/tests/unit/test_writer.py index e45358885..01a3ebc1b 100644 --- a/sycamore/tests/unit/test_writer.py +++ b/sycamore/tests/unit/test_writer.py @@ -2,7 +2,7 @@ import sycamore from sycamore.data import Document, Element from sycamore.plan_nodes import Node -from sycamore.writers import OpenSearchWriter +from sycamore.writers.opensearch import OpenSearchWriter import json from pathlib import Path diff --git a/sycamore/tests/unit/transforms/test_partition.py b/sycamore/tests/unit/transforms/test_partition.py index c2040b350..e32f845fa 100644 --- a/sycamore/tests/unit/transforms/test_partition.py +++ b/sycamore/tests/unit/transforms/test_partition.py @@ -11,7 +11,7 @@ UnstructuredPdfPartitioner, UnstructuredPPTXPartitioner, ) -from sycamore.scans import BinaryScan +from sycamore.scans.file_scan import BinaryScan from sycamore.tests.config import TEST_DIR diff --git a/sycamore/writers/__init__.py b/sycamore/writers/__init__.py index 746d32577..964efd811 100644 --- a/sycamore/writers/__init__.py +++ b/sycamore/writers/__init__.py @@ -1,3 +1,3 @@ -from sycamore.writers.opensearch import OpenSearchWriter +from sycamore.writers.writer import DocSetWriter -__all__ = ["OpenSearchWriter"] +__all__ = ["DocSetWriter"] diff --git a/sycamore/writer.py b/sycamore/writers/writer.py similarity index 73% rename from sycamore/writer.py rename to sycamore/writers/writer.py index 041bb3886..21fbc7223 100644 --- a/sycamore/writer.py +++ b/sycamore/writers/writer.py @@ -9,6 +9,10 @@ class DocSetWriter: + """ + Write DocSet into different targets. 
+ """ + def __init__(self, context: Context, plan: Node): self.context = context self.plan = plan @@ -16,7 +20,14 @@ def __init__(self, context: Context, plan: Node): def opensearch( self, *, os_client_args: dict, index_name: str, index_settings: Optional[dict] = None, **resource_args ) -> None: - from sycamore.writers import OpenSearchWriter + """Write a docset into opensearch + Args: + os_client_args: opensearch client arguments like host address, port number etc. + index_name: opensearch index name + index_settings: index setting such as number of shards, index mapping + resource_args: Additional resource-related arguments passed through to OpenSearchWriter + """ + from sycamore.writers.opensearch import OpenSearchWriter os = OpenSearchWriter( self.plan, index_name, os_client_args=os_client_args, index_settings=index_settings, **resource_args @@ -42,6 +53,12 @@ def files( Defaults to using text_representation if available, or binary_representation if not. resource_args: Arguments to pass to the underlying execution environment. + + Example: + >>> import sycamore + >>> context = sycamore.init() + >>> docset = context.read.json("s3://bucket/prefix1") + >>> docset.write.files("s3://bucket/prefix2") """ file_writer = FileWriter( self.plan,