-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Create csvparser.py: I am adding the CSV parser Python code. It works with basic CSV files. * Update prepdocs.py: updating the CSV parser code and importing the CsvParser class * Create test_csvparser.py: added CSV test file * Update test_csvparser.py: formatted the file * Update csvparser.py: formatted the file * Update prepdocs.py * Update prepdocs.py * Update csvparser.py * Update prepdocs.py * Fix prepdocs and tests to match main --------- Co-authored-by: Pamela Fox <[email protected]> Co-authored-by: Pamela Fox <[email protected]>
- Loading branch information
1 parent
a127523
commit 2dd7ba9
Showing
4 changed files
with
94 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import csv
import io
from typing import IO, AsyncGenerator

from .page import Page
from .parser import Parser
|
||
|
||
class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects.

    The first row is treated as a header and skipped. Every following row
    becomes one Page whose text is the row's fields re-joined with commas.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """
        Yield one Page per data row of the CSV held in ``content``.

        Args:
            content: A file-like object exposing ``read()`` (binary or text
                mode), or raw ``bytes`` / ``bytearray`` / ``str`` CSV data.
                Binary data is decoded as UTF-8; a leading byte order mark
                is ignored.

        Yields:
            ``Page(row_index, offset, text)`` where ``row_index`` counts data
            rows from 0 and ``offset`` is the running sum of ``len(text) + 1``
            over the pages yielded before it.

        Raises:
            TypeError: If ``content`` is neither readable nor bytes/str.
            UnicodeDecodeError: If binary content is not valid UTF-8.
        """
        # Accept both file-like objects and in-memory data.
        raw = content.read() if hasattr(content, "read") else content

        content_str: str
        if isinstance(raw, (bytes, bytearray)):
            # "utf-8-sig" decodes plain UTF-8 identically but drops a BOM if present.
            content_str = raw.decode("utf-8-sig")
        elif isinstance(raw, str):
            content_str = raw
        else:
            raise TypeError(f"CsvParser cannot read CSV data from {type(raw).__name__}")

        # newline="" hands line endings to the csv module untouched, so quoted
        # fields keep their embedded newlines and rows are only split on
        # \n, \r and \r\n (str.splitlines() would also split on characters
        # such as U+2028 that can legitimately occur inside a field).
        reader = csv.reader(io.StringIO(content_str, newline=""))
        offset = 0

        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)

        for i, row in enumerate(reader):
            page_text = ",".join(row)
            yield Page(i, offset, page_text)
            offset += len(page_text) + 1  # Account for newline character
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import io | ||
|
||
import pytest | ||
|
||
from prepdocslib.csvparser import CsvParser # Adjust import to the correct module | ||
|
||
|
||
@pytest.mark.asyncio
async def test_csvparser_single_row():
    """A header followed by one data row produces exactly one page."""
    # Binary stream holding the header line and a single record
    stream = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
    stream.name = "test.csv"
    parser = CsvParser()

    # Drain the async generator into a list
    collected = []
    async for page in parser.parse(stream):
        collected.append(page)

    # Only the data row is returned; the header is skipped
    assert len(collected) == 1
    first = collected[0]
    assert first.page_num == 0
    assert first.offset == 0
    assert first.text == "value1,value2,value3"
|
||
|
||
@pytest.mark.asyncio
async def test_csvparser_multiple_rows():
    """Each data row after the header becomes its own page with a running offset."""
    # Binary stream holding the header line and two records
    stream = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
    stream.name = "test.csv"
    parser = CsvParser()

    # Drain the async generator into a list
    collected = []
    async for page in parser.parse(stream):
        collected.append(page)

    # The header is skipped, leaving the two data rows
    assert len(collected) == 2
    first = collected[0]
    assert first.page_num == 0
    assert first.offset == 0
    assert first.text == "value1,value2,value3"

    second = collected[1]
    assert second.page_num == 1
    # Offset advances by the previous row's length plus one for the newline
    assert second.offset == len(first.text) + 1
    assert second.text == "value4,value5,value6"
|
||
|
||
@pytest.mark.asyncio
async def test_csvparser_empty_file():
    """An empty CSV stream produces no pages at all."""
    # Zero-length binary stream
    stream = io.BytesIO(b"")
    stream.name = "test.csv"
    parser = CsvParser()

    # Drain the async generator into a list
    collected = []
    async for page in parser.parse(stream):
        collected.append(page)

    # Nothing to parse, so nothing is yielded
    assert len(collected) == 0