Skip to content

Commit 2dd7ba9

Browse files
Adding CSV Parser (#1996)
* Create csvparser.py: Adding the CSV parser Python code. It works with basic CSV files. * Update prepdocs.py: Updating the CSV parser code and importing the CsvParser class. * Create test_csvparser.py: Added CSV test file. * Update test_csvparser.py: Formatted the file. * Update csvparser.py: Formatted the file. * Update prepdocs.py * Update prepdocs.py * Update csvparser.py * Update prepdocs.py * Fix prepdocs and tests to match main --------- Co-authored-by: Pamela Fox <[email protected]> Co-authored-by: Pamela Fox <[email protected]>
1 parent a127523 commit 2dd7ba9

File tree

4 files changed

+94
-4
lines changed

4 files changed

+94
-4
lines changed

app/backend/prepdocs.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
from load_azd_env import load_azd_env
1212
from prepdocslib.blobmanager import BlobManager
13+
from prepdocslib.csvparser import CsvParser
1314
from prepdocslib.embeddings import (
1415
AzureOpenAIEmbeddingService,
1516
ImageEmbeddings,
@@ -190,6 +191,7 @@ def setup_file_processors(
190191
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
191192
".md": FileProcessor(TextParser(), sentence_text_splitter),
192193
".txt": FileProcessor(TextParser(), sentence_text_splitter),
194+
".csv": FileProcessor(CsvParser(), sentence_text_splitter),
193195
}
194196
# These require either a Python package or Document Intelligence
195197
if pdf_parser is not None:

app/backend/prepdocslib/csvparser.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import csv
2+
from typing import IO, AsyncGenerator
3+
4+
from .page import Page
5+
from .parser import Parser
6+
7+
8+
class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects. Each data row becomes a Page object.

    The first row is treated as a header and is skipped. The text of each page
    is the row's fields re-joined with commas; quoting is not re-applied.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """
        Yield one Page per CSV data row.

        Args:
            content: A readable stream (binary or text mode), or the raw CSV
                data itself as bytes, bytearray or str. Binary data is decoded
                as UTF-8.

        Yields:
            ``Page(i, offset, page_text)`` where ``i`` is the zero-based index
            of the data row (the header is not counted) and ``offset`` is the
            running total of ``len(page_text) + 1`` over the preceding rows.

        Raises:
            TypeError: If the content is neither a stream nor bytes/str data.
            UnicodeDecodeError: If binary content is not valid UTF-8.
        """
        # Function-scope import so this block does not depend on a file-level change.
        import io

        # Normalise every supported input to a single str. Streams may be
        # opened in binary or text mode, so inspect what read() returned
        # instead of assuming bytes.
        raw = content.read() if hasattr(content, "read") else content
        content_str: str
        if isinstance(raw, (bytes, bytearray)):
            content_str = raw.decode("utf-8")
        elif isinstance(raw, str):
            content_str = raw
        else:
            raise TypeError(f"Unsupported CSV content type: {type(raw).__name__}")

        # newline="" hands line endings to the csv module untouched, so quoted
        # fields containing line breaks are parsed as one field rather than
        # being glued together or split into separate rows.
        reader = csv.reader(io.StringIO(content_str, newline=""))
        offset = 0

        # Skip the header row (a no-op for empty input).
        next(reader, None)

        for i, row in enumerate(reader):
            page_text = ",".join(row)
            yield Page(i, offset, page_text)
            offset += len(page_text) + 1  # Account for newline character

tests/test_app_config.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ async def test_app_user_upload_processors(monkeypatch, minimal_env):
6363
async with quart_app.test_app():
6464
ingester = quart_app.config[app.CONFIG_INGESTER]
6565
assert ingester is not None
66-
assert len(ingester.file_processors.keys()) == 5
66+
assert len(ingester.file_processors.keys()) == 6
6767

6868

6969
@pytest.mark.asyncio
@@ -77,7 +77,7 @@ async def test_app_user_upload_processors_docint(monkeypatch, minimal_env):
7777
async with quart_app.test_app():
7878
ingester = quart_app.config[app.CONFIG_INGESTER]
7979
assert ingester is not None
80-
assert len(ingester.file_processors.keys()) == 14
80+
assert len(ingester.file_processors.keys()) == 15
8181

8282

8383
@pytest.mark.asyncio
@@ -92,7 +92,7 @@ async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_e
9292
async with quart_app.test_app():
9393
ingester = quart_app.config[app.CONFIG_INGESTER]
9494
assert ingester is not None
95-
assert len(ingester.file_processors.keys()) == 14
95+
assert len(ingester.file_processors.keys()) == 15
9696
assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"]
9797

9898

@@ -108,7 +108,7 @@ async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_
108108
async with quart_app.test_app():
109109
ingester = quart_app.config[app.CONFIG_INGESTER]
110110
assert ingester is not None
111-
assert len(ingester.file_processors.keys()) == 14
111+
assert len(ingester.file_processors.keys()) == 15
112112
assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]
113113

114114

tests/test_csvparser.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import io
2+
3+
import pytest
4+
5+
from prepdocslib.csvparser import CsvParser # Adjust import to the correct module
6+
7+
8+
@pytest.mark.asyncio
async def test_csvparser_single_row():
    """A header followed by one data row produces exactly one page for that row."""
    # Binary CSV payload: header line plus a single data line.
    stream = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
    stream.name = "test.csv"

    # Drain the async generator into a list.
    parsed = []
    async for page in CsvParser().parse(stream):
        parsed.append(page)

    # Only the data row is emitted, numbered from zero at offset zero.
    assert len(parsed) == 1
    only_page = parsed[0]
    assert only_page.page_num == 0
    assert only_page.offset == 0
    assert only_page.text == "value1,value2,value3"
23+
24+
25+
@pytest.mark.asyncio
async def test_csvparser_multiple_rows():
    """Each data row after the header becomes its own page with a running offset."""
    # Binary CSV payload: header line plus two data lines.
    stream = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
    stream.name = "test.csv"

    # Drain the async generator into a list.
    parsed = []
    async for page in CsvParser().parse(stream):
        parsed.append(page)

    # The header is skipped, so only the two data rows remain.
    assert len(parsed) == 2
    first_page, second_page = parsed

    assert first_page.page_num == 0
    assert first_page.offset == 0
    assert first_page.text == "value1,value2,value3"

    assert second_page.page_num == 1
    # The second offset is the first row's length plus one for the newline.
    assert second_page.offset == len(first_page.text) + 1
    assert second_page.text == "value4,value5,value6"
44+
45+
46+
@pytest.mark.asyncio
async def test_csvparser_empty_file():
    """An empty file produces no pages at all."""
    # Zero-length binary payload.
    stream = io.BytesIO(b"")
    stream.name = "test.csv"

    # Drain the async generator into a list.
    parsed = []
    async for page in CsvParser().parse(stream):
        parsed.append(page)

    # With no header and no rows there is nothing to emit.
    assert len(parsed) == 0

0 commit comments

Comments
 (0)