Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changed test cases and yielding INgestedTokens at end #163

Merged
merged 1 commit into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion querent/collectors/fs/fs_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ async def walk_files(self, root: Path) -> AsyncGenerator[Path, None]:
item_split = set(str(item).split("/"))
item_split.remove("")
if item_split.intersection(self.items_to_ignore):
print(item_split, "\n\n", self.items_to_ignore)
continue
if item.is_file():
yield item
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/audio/audio_ingestors.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[text], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -92,6 +97,8 @@ async def ingest(
):
yield IngestedTokens(file=current_file, data=[text], error=None)

yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_audio(
self, collected_bytes: CollectedBytes
) -> AsyncGenerator[IngestedTokens, None]:
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/code/code_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)

collected_bytes += chunk_bytes.data
if current_file:
Expand All @@ -91,6 +96,7 @@ async def ingest(
data=[line],
error=None,
)
yield IngestedTokens(file=current_file, data=None, error=None)
except Exception as exc:
print(exc)
raise Exception from exc
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/csv/csv_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[row], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -60,6 +65,8 @@ async def ingest(
):
yield IngestedTokens(file=current_file, data=[row], error=None)

yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_csv(
self, collected_bytes: CollectedBytes
) -> AsyncGenerator[str, None]:
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/doc/doc_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -64,6 +69,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[paragraph], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_doc(
self, collected_bytes: CollectedBytes
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/github/github_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedCode(
file=current_file,
data=None,
error=None,
)

collected_bytes += chunk_bytes.data

Expand All @@ -50,6 +55,7 @@ async def ingest(
data=[line], # Wrap line in a list
error=None,
)
yield IngestedCode(file=current_file, data=None, error=None)
except Exception as e:
yield IngestedCode(file=current_file, data=None, error=f"Exception: {e}")

Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/html/html_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -59,6 +64,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[element], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_html(
self, collected_bytes: CollectedBytes
Expand Down
3 changes: 2 additions & 1 deletion querent/ingestors/images/image_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
)
yield IngestedTokens(file=current_file, data=[text], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -88,7 +89,7 @@ async def extract_text_from_image(self, collected_bytes: CollectedBytes) -> str:
) from exc

text = pytesseract.image_to_string(image)
return text
return str(text).encode("utf-8").decode("unicode_escape")

async def process_data(self, text: str) -> str:
processed_data = text
Expand Down
26 changes: 16 additions & 10 deletions querent/ingestors/json/json_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,24 +49,30 @@ async def ingest(
file=current_file, data=[json_object], error=None
)
collected_bytes = b""
if current_file:
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
current_file = chunk_bytes.file

collected_bytes += chunk_bytes.data

if current_file:
json_objects = await self.extract_and_process_json(
CollectedBytes(file=current_file, data=collected_bytes)
)
for json_object in json_objects:
processed_json_object = await self.process_data(json_object)
yield IngestedTokens(
file=current_file, data=[processed_json_object], error=None
)

except json.JSONDecodeError:
yield IngestedTokens(
file=current_file, data=None, error="JSON Decode Error"
)
finally:
json_objects = await self.extract_and_process_json(
CollectedBytes(file=current_file, data=collected_bytes)
)
for json_object in json_objects:
processed_json_object = await self.process_data(json_object)
yield IngestedTokens(
file=current_file, data=[processed_json_object], error=None
)
yield IngestedTokens(file=current_file, data=None, error=None)

except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/pdfs/pdf_ingestor_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ async def ingest(
yield page_text
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
# at the queue level, we can sample out the error
Expand All @@ -63,6 +68,8 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield page_text

yield IngestedTokens(file=current_file, data=None, error=None)
except Exception as exc:
yield None

Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/ppt/ppt_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -62,6 +67,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[slide_text], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_ppt(
self, collected_bytes: CollectedBytes
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/texts/text_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)

collected_bytes += chunk_bytes.data

Expand All @@ -71,6 +76,7 @@ async def ingest(
data=[line], # Wrap line in a list
error=None,
)
yield IngestedTokens(file=current_file, data=None, error=None)
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")

Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/video/video_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[text], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -59,6 +64,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[text], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_video(
self, collected_bytes: CollectedBytes
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/xlsx/xlsx_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -69,6 +74,7 @@ async def ingest(
), # Convert DataFrame to a list of dictionaries
error=None,
)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_xlsx(
self, collected_bytes: CollectedBytes
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/xml/xml_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[text], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -63,6 +68,8 @@ async def ingest(
):
yield IngestedTokens(file=current_file, data=[text], error=None)

yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_xml(
self, collected_bytes: CollectedBytes
) -> AsyncGenerator[str, None]:
Expand Down
6 changes: 2 additions & 4 deletions tests/test_audio_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,12 @@ async def test_collect_and_ingest_audio():
async def poll_and_print():
counter = 0
async for ingested in ingested_call:
assert ingested is not None
assert ingested.error is None
assert ingested.file is not None
assert ingested.data is not None
assert len(ingested.data) > 0
counter += 1

assert counter == 1
# counter is 2 because at the end of each file there is an empty IngestedTokens being yielded
assert counter == 2

await poll_and_print()

Expand Down
5 changes: 3 additions & 2 deletions tests/test_code_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ async def poll_and_print():
counter = 0
async for ingested in ingested_call:
assert ingested is not None
if ingested is not "" or ingested is not None:
if ingested.data != "" or ingested is not None:
counter += 1
assert counter == 2
# counter is 2 though files are 4, that is because we are yielding an empty IngestedCode at the end of each file
assert counter == 4

await poll_and_print()

Expand Down
5 changes: 2 additions & 3 deletions tests/test_csv_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ async def poll_and_print():
assert ingested is not None
assert ingested.error is None
assert ingested.file is not None
assert ingested.data is not None
assert len(ingested.data) > 0
counter += 1
assert counter == 7
# 2 extra IngestedTokens are representing end of file
assert counter == 9

await poll_and_print()

Expand Down
5 changes: 2 additions & 3 deletions tests/test_doc_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@ async def poll_and_print():
assert ingested is not None
assert ingested.error is None
assert ingested.file is not None
assert ingested.data is not None
assert len(ingested.data) > 0
counter += 1
assert counter == 2
# 2 extra IngestedTokens are repreenting end of files
assert counter == 4

await poll_and_print()

Expand Down
2 changes: 0 additions & 2 deletions tests/test_generic_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ async def test_collect_and_ingest_generic_bytes():
# Set up the collector
collector_factory = SlackCollectorFactory()
uri = Uri("slack://")
print(os.getenv("SLACK_ACCESS_KEY"))
config = get_collector_config()
collector = collector_factory.resolve(uri, config)

Expand All @@ -49,7 +48,6 @@ async def poll_and_print():
async for ingested in ingested_call:
assert ingested is not None
if ingested is not "" or ingested is not None:
# print(ingested)
counter += 1
assert counter == 23

Expand Down
3 changes: 2 additions & 1 deletion tests/test_github_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ async def poll_and_print():
if ingested is not "" or ingested is not None:
counter += 1

assert counter == 6
# 6 extra IngestedTokens signifying end of file
assert counter == 12

await poll_and_print() # Notice the use of await here

Expand Down
Loading