Adding CSV Parser (#1996)

* Create csvparser.py I am adding the CSV parser Python code. It works with basic CSV files. * Update prepdocs.py Updating the CSV parser code and importing the CsvParser class * Create test_csvparser.py Added CSV test file * Update test_csvparser.py Formatted the file * Update csvparser.py Formatted the file * Update prepdocs.py * Update prepdocs.py * Update csvparser.py * Update prepdocs.py * Fix prepdocs and tests to match main --------- Co-authored-by: Pamela Fox <[email protected]> Co-authored-by: Pamela Fox <[email protected]>
Azure-Samples · Oct 2, 2024 · 2dd7ba9 · 2dd7ba9
1 parent a127523
commit 2dd7ba9
Show file tree

Hide file tree

Showing 4 changed files with 94 additions and 4 deletions.
diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py
@@ -10,6 +10,7 @@
 
 from load_azd_env import load_azd_env
 from prepdocslib.blobmanager import BlobManager
+from prepdocslib.csvparser import CsvParser
 from prepdocslib.embeddings import (
     AzureOpenAIEmbeddingService,
     ImageEmbeddings,
@@ -190,6 +191,7 @@ def setup_file_processors(
         ".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
         ".md": FileProcessor(TextParser(), sentence_text_splitter),
         ".txt": FileProcessor(TextParser(), sentence_text_splitter),
+        ".csv": FileProcessor(CsvParser(), sentence_text_splitter),
     }
     # These require either a Python package or Document Intelligence
     if pdf_parser is not None:

diff --git a/app/backend/prepdocslib/csvparser.py b/app/backend/prepdocslib/csvparser.py
@@ -0,0 +1,31 @@
+import csv
+from typing import IO, AsyncGenerator
+
+from .page import Page
+from .parser import Parser
+
+
+class CsvParser(Parser):
+    """
+    Concrete parser that parses CSV into Page objects: one Page per data row, header row skipped.
+    """
+
+    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
+        """Yield one Page per data row; accepts bytes, str, or a binary/text stream (UTF-8)."""
+        raw = content.read() if hasattr(content, "read") else content
+        if isinstance(raw, (bytes, bytearray)):
+            content_str = raw.decode("utf-8")
+        elif isinstance(raw, str):  # Text-mode streams are already decoded
+            content_str = raw
+        else:  # Fail clearly instead of hitting an unbound local below
+            raise TypeError(f"Unsupported CSV content type: {type(raw).__name__}")
+
+        # Create a CSV reader from the text content and skip the header row
+        reader = csv.reader(content_str.splitlines())
+        next(reader, None)
+
+        offset = 0
+        for i, row in enumerate(reader):
+            page_text = ",".join(row)
+            yield Page(i, offset, page_text)
+            offset += len(page_text) + 1  # Account for newline character
diff --git a/tests/test_app_config.py b/tests/test_app_config.py
@@ -63,7 +63,7 @@ async def test_app_user_upload_processors(monkeypatch, minimal_env):
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 5
+        assert len(ingester.file_processors.keys()) == 6
 
 
 @pytest.mark.asyncio
@@ -77,7 +77,7 @@ async def test_app_user_upload_processors_docint(monkeypatch, minimal_env):
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
 
 
 @pytest.mark.asyncio
@@ -92,7 +92,7 @@ async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_e
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
         assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"]
 
 
@@ -108,7 +108,7 @@ async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_
     async with quart_app.test_app():
         ingester = quart_app.config[app.CONFIG_INGESTER]
         assert ingester is not None
-        assert len(ingester.file_processors.keys()) == 14
+        assert len(ingester.file_processors.keys()) == 15
         assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]
 
 

diff --git a/tests/test_csvparser.py b/tests/test_csvparser.py
@@ -0,0 +1,57 @@
+import io
+
+import pytest
+
+from prepdocslib.csvparser import CsvParser  # Adjust import to the correct module
+
+
+@pytest.mark.asyncio
+async def test_csvparser_single_row():
+    # Header line plus exactly one data row, supplied as a binary stream
+    stream = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
+    stream.name = "test.csv"
+    parser = CsvParser()
+
+    # Drain the async generator into a list
+    parsed = [page async for page in parser.parse(stream)]
+
+    # Only the data row comes back, as page 0 at offset 0
+    assert len(parsed) == 1
+    only_page = parsed[0]
+    assert (only_page.page_num, only_page.offset) == (0, 0)
+    assert only_page.text == "value1,value2,value3"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_multiple_rows():
+    # Header line followed by two data rows, supplied as a binary stream
+    stream = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
+    stream.name = "test.csv"
+    parser = CsvParser()
+
+    # Drain the async generator into a list
+    parsed = [page async for page in parser.parse(stream)]
+
+    # The header is skipped, so exactly the two data rows remain
+    assert len(parsed) == 2
+    first, second = parsed
+    assert (first.page_num, first.offset) == (0, 0)
+    assert first.text == "value1,value2,value3"
+
+    # The second offset is the first row's length plus one for the newline
+    assert (second.page_num, second.offset) == (1, len(first.text) + 1)
+    assert second.text == "value4,value5,value6"
+
+
+@pytest.mark.asyncio
+async def test_csvparser_empty_file():
+    # Zero-byte binary stream standing in for an empty CSV file
+    stream = io.BytesIO(b"")
+    stream.name = "test.csv"
+    parser = CsvParser()
+
+    # Drain the async generator into a list
+    parsed = [page async for page in parser.parse(stream)]
+
+    # With no header and no rows, the parser yields nothing
+    assert parsed == []