Skip to content

Commit

Permalink
Changed test cases and yielding INgestedTokens at end (#163)
Browse files Browse the repository at this point in the history
  • Loading branch information
Ansh5461 authored Nov 27, 2023
1 parent ee22460 commit b7c3323
Show file tree
Hide file tree
Showing 29 changed files with 123 additions and 41 deletions.
1 change: 0 additions & 1 deletion querent/collectors/fs/fs_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ async def walk_files(self, root: Path) -> AsyncGenerator[Path, None]:
item_split = set(str(item).split("/"))
item_split.remove("")
if item_split.intersection(self.items_to_ignore):
print(item_split, "\n\n", self.items_to_ignore)
continue
if item.is_file():
yield item
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/audio/audio_ingestors.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[text], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -92,6 +97,8 @@ async def ingest(
):
yield IngestedTokens(file=current_file, data=[text], error=None)

yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_audio(
self, collected_bytes: CollectedBytes
) -> AsyncGenerator[IngestedTokens, None]:
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/code/code_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)

collected_bytes += chunk_bytes.data
if current_file:
Expand All @@ -91,6 +96,7 @@ async def ingest(
data=[line],
error=None,
)
yield IngestedTokens(file=current_file, data=None, error=None)
except Exception as exc:
print(exc)
raise Exception from exc
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/csv/csv_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[row], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -60,6 +65,8 @@ async def ingest(
):
yield IngestedTokens(file=current_file, data=[row], error=None)

yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_csv(
self, collected_bytes: CollectedBytes
) -> AsyncGenerator[str, None]:
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/doc/doc_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -64,6 +69,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[paragraph], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_doc(
self, collected_bytes: CollectedBytes
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/github/github_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedCode(
file=current_file,
data=None,
error=None,
)

collected_bytes += chunk_bytes.data

Expand All @@ -50,6 +55,7 @@ async def ingest(
data=[line], # Wrap line in a list
error=None,
)
yield IngestedCode(file=current_file, data=None, error=None)
except Exception as e:
yield IngestedCode(file=current_file, data=None, error=f"Exception: {e}")

Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/html/html_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -59,6 +64,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[element], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_html(
self, collected_bytes: CollectedBytes
Expand Down
3 changes: 2 additions & 1 deletion querent/ingestors/images/image_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
)
yield IngestedTokens(file=current_file, data=[text], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -88,7 +89,7 @@ async def extract_text_from_image(self, collected_bytes: CollectedBytes) -> str:
) from exc

text = pytesseract.image_to_string(image)
return text
return str(text).encode("utf-8").decode("unicode_escape")

async def process_data(self, text: str) -> str:
processed_data = text
Expand Down
26 changes: 16 additions & 10 deletions querent/ingestors/json/json_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,24 +49,30 @@ async def ingest(
file=current_file, data=[json_object], error=None
)
collected_bytes = b""
if current_file:
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
current_file = chunk_bytes.file

collected_bytes += chunk_bytes.data

if current_file:
json_objects = await self.extract_and_process_json(
CollectedBytes(file=current_file, data=collected_bytes)
)
for json_object in json_objects:
processed_json_object = await self.process_data(json_object)
yield IngestedTokens(
file=current_file, data=[processed_json_object], error=None
)

except json.JSONDecodeError:
yield IngestedTokens(
file=current_file, data=None, error="JSON Decode Error"
)
finally:
json_objects = await self.extract_and_process_json(
CollectedBytes(file=current_file, data=collected_bytes)
)
for json_object in json_objects:
processed_json_object = await self.process_data(json_object)
yield IngestedTokens(
file=current_file, data=[processed_json_object], error=None
)
yield IngestedTokens(file=current_file, data=None, error=None)

except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/pdfs/pdf_ingestor_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ async def ingest(
yield page_text
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
# at the queue level, we can sample out the error
Expand All @@ -63,6 +68,8 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield page_text

yield IngestedTokens(file=current_file, data=None, error=None)
except Exception as exc:
yield None

Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/ppt/ppt_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -62,6 +67,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[slide_text], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_ppt(
self, collected_bytes: CollectedBytes
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/texts/text_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)

collected_bytes += chunk_bytes.data

Expand All @@ -71,6 +76,7 @@ async def ingest(
data=[line], # Wrap line in a list
error=None,
)
yield IngestedTokens(file=current_file, data=None, error=None)
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")

Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/video/video_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[text], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -59,6 +64,7 @@ async def ingest(
CollectedBytes(file=current_file, data=collected_bytes)
):
yield IngestedTokens(file=current_file, data=[text], error=None)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_video(
self, collected_bytes: CollectedBytes
Expand Down
6 changes: 6 additions & 0 deletions querent/ingestors/xlsx/xlsx_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ async def ingest(
)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -69,6 +74,7 @@ async def ingest(
), # Convert DataFrame to a list of dictionaries
error=None,
)
yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_xlsx(
self, collected_bytes: CollectedBytes
Expand Down
7 changes: 7 additions & 0 deletions querent/ingestors/xml/xml_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ async def ingest(
yield IngestedTokens(file=current_file, data=[text], error=None)
collected_bytes = b""
current_file = chunk_bytes.file
yield IngestedTokens(
file=current_file,
data=None,
error=None,
)
collected_bytes += chunk_bytes.data
except Exception as e:
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
Expand All @@ -63,6 +68,8 @@ async def ingest(
):
yield IngestedTokens(file=current_file, data=[text], error=None)

yield IngestedTokens(file=current_file, data=None, error=None)

async def extract_and_process_xml(
self, collected_bytes: CollectedBytes
) -> AsyncGenerator[str, None]:
Expand Down
6 changes: 2 additions & 4 deletions tests/test_audio_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,12 @@ async def test_collect_and_ingest_audio():
async def poll_and_print():
counter = 0
async for ingested in ingested_call:
assert ingested is not None
assert ingested.error is None
assert ingested.file is not None
assert ingested.data is not None
assert len(ingested.data) > 0
counter += 1

assert counter == 1
# counter is 2 because at the end of each file there is an empty IngestedTokens being yielded
assert counter == 2

await poll_and_print()

Expand Down
5 changes: 3 additions & 2 deletions tests/test_code_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ async def poll_and_print():
counter = 0
async for ingested in ingested_call:
assert ingested is not None
if ingested is not "" or ingested is not None:
if ingested.data != "" or ingested is not None:
counter += 1
assert counter == 2
# counter is 2 though files are 4, that is because we are yielding an empty IngestedCode at the end of each file
assert counter == 4

await poll_and_print()

Expand Down
5 changes: 2 additions & 3 deletions tests/test_csv_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@ async def poll_and_print():
assert ingested is not None
assert ingested.error is None
assert ingested.file is not None
assert ingested.data is not None
assert len(ingested.data) > 0
counter += 1
assert counter == 7
# 2 extra IngestedTokens are representing end of file
assert counter == 9

await poll_and_print()

Expand Down
5 changes: 2 additions & 3 deletions tests/test_doc_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,9 @@ async def poll_and_print():
assert ingested is not None
assert ingested.error is None
assert ingested.file is not None
assert ingested.data is not None
assert len(ingested.data) > 0
counter += 1
assert counter == 2
# 2 extra IngestedTokens are repreenting end of files
assert counter == 4

await poll_and_print()

Expand Down
2 changes: 0 additions & 2 deletions tests/test_generic_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ async def test_collect_and_ingest_generic_bytes():
# Set up the collector
collector_factory = SlackCollectorFactory()
uri = Uri("slack://")
print(os.getenv("SLACK_ACCESS_KEY"))
config = get_collector_config()
collector = collector_factory.resolve(uri, config)

Expand All @@ -49,7 +48,6 @@ async def poll_and_print():
async for ingested in ingested_call:
assert ingested is not None
if ingested is not "" or ingested is not None:
# print(ingested)
counter += 1
assert counter == 23

Expand Down
3 changes: 2 additions & 1 deletion tests/test_github_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ async def poll_and_print():
if ingested is not "" or ingested is not None:
counter += 1

assert counter == 6
# 6 extra IngestedTokens signifying end of file
assert counter == 12

await poll_and_print() # Notice the use of await here

Expand Down
Loading

0 comments on commit b7c3323

Please sign in to comment.