Skip to content

Commit b7c3323

Browse files
authored
Changed test cases and yielding INgestedTokens at end (#163)
1 parent ee22460 commit b7c3323

29 files changed

+123
-41
lines changed

querent/collectors/fs/fs_collector.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ async def walk_files(self, root: Path) -> AsyncGenerator[Path, None]:
6161
item_split = set(str(item).split("/"))
6262
item_split.remove("")
6363
if item_split.intersection(self.items_to_ignore):
64-
print(item_split, "\n\n", self.items_to_ignore)
6564
continue
6665
if item.is_file():
6766
yield item

querent/ingestors/audio/audio_ingestors.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,11 @@ async def ingest(
8282
yield IngestedTokens(file=current_file, data=[text], error=None)
8383
collected_bytes = b""
8484
current_file = chunk_bytes.file
85+
yield IngestedTokens(
86+
file=current_file,
87+
data=None,
88+
error=None,
89+
)
8590
collected_bytes += chunk_bytes.data
8691
except Exception as e:
8792
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -92,6 +97,8 @@ async def ingest(
9297
):
9398
yield IngestedTokens(file=current_file, data=[text], error=None)
9499

100+
yield IngestedTokens(file=current_file, data=None, error=None)
101+
95102
async def extract_and_process_audio(
96103
self, collected_bytes: CollectedBytes
97104
) -> AsyncGenerator[IngestedTokens, None]:

querent/ingestors/code/code_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,11 @@ async def ingest(
8080
)
8181
collected_bytes = b""
8282
current_file = chunk_bytes.file
83+
yield IngestedTokens(
84+
file=current_file,
85+
data=None,
86+
error=None,
87+
)
8388

8489
collected_bytes += chunk_bytes.data
8590
if current_file:
@@ -91,6 +96,7 @@ async def ingest(
9196
data=[line],
9297
error=None,
9398
)
99+
yield IngestedTokens(file=current_file, data=None, error=None)
94100
except Exception as exc:
95101
print(exc)
96102
raise Exception from exc

querent/ingestors/csv/csv_ingestor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ async def ingest(
5050
yield IngestedTokens(file=current_file, data=[row], error=None)
5151
collected_bytes = b""
5252
current_file = chunk_bytes.file
53+
yield IngestedTokens(
54+
file=current_file,
55+
data=None,
56+
error=None,
57+
)
5358
collected_bytes += chunk_bytes.data
5459
except Exception as e:
5560
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -60,6 +65,8 @@ async def ingest(
6065
):
6166
yield IngestedTokens(file=current_file, data=[row], error=None)
6267

68+
yield IngestedTokens(file=current_file, data=None, error=None)
69+
6370
async def extract_and_process_csv(
6471
self, collected_bytes: CollectedBytes
6572
) -> AsyncGenerator[str, None]:

querent/ingestors/doc/doc_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,11 @@ async def ingest(
5555
)
5656
collected_bytes = b""
5757
current_file = chunk_bytes.file
58+
yield IngestedTokens(
59+
file=current_file,
60+
data=None,
61+
error=None,
62+
)
5863
collected_bytes += chunk_bytes.data
5964
except Exception as e:
6065
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -64,6 +69,7 @@ async def ingest(
6469
CollectedBytes(file=current_file, data=collected_bytes)
6570
):
6671
yield IngestedTokens(file=current_file, data=[paragraph], error=None)
72+
yield IngestedTokens(file=current_file, data=None, error=None)
6773

6874
async def extract_and_process_doc(
6975
self, collected_bytes: CollectedBytes

querent/ingestors/github/github_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ async def ingest(
3838
)
3939
collected_bytes = b""
4040
current_file = chunk_bytes.file
41+
yield IngestedCode(
42+
file=current_file,
43+
data=None,
44+
error=None,
45+
)
4146

4247
collected_bytes += chunk_bytes.data
4348

@@ -50,6 +55,7 @@ async def ingest(
5055
data=[line], # Wrap line in a list
5156
error=None,
5257
)
58+
yield IngestedCode(file=current_file, data=None, error=None)
5359
except Exception as e:
5460
yield IngestedCode(file=current_file, data=None, error=f"Exception: {e}")
5561

querent/ingestors/html/html_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ async def ingest(
5050
)
5151
collected_bytes = b""
5252
current_file = chunk_bytes.file
53+
yield IngestedTokens(
54+
file=current_file,
55+
data=None,
56+
error=None,
57+
)
5358
collected_bytes += chunk_bytes.data
5459
except Exception as e:
5560
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -59,6 +64,7 @@ async def ingest(
5964
CollectedBytes(file=current_file, data=collected_bytes)
6065
):
6166
yield IngestedTokens(file=current_file, data=[element], error=None)
67+
yield IngestedTokens(file=current_file, data=None, error=None)
6268

6369
async def extract_and_process_html(
6470
self, collected_bytes: CollectedBytes

querent/ingestors/images/image_ingestor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ async def ingest(
6262
CollectedBytes(file=current_file, data=collected_bytes)
6363
)
6464
yield IngestedTokens(file=current_file, data=[text], error=None)
65+
yield IngestedTokens(file=current_file, data=None, error=None)
6566

6667
except Exception as e:
6768
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -88,7 +89,7 @@ async def extract_text_from_image(self, collected_bytes: CollectedBytes) -> str:
8889
) from exc
8990

9091
text = pytesseract.image_to_string(image)
91-
return text
92+
return str(text).encode("utf-8").decode("unicode_escape")
9293

9394
async def process_data(self, text: str) -> str:
9495
processed_data = text

querent/ingestors/json/json_ingestor.py

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,24 +49,30 @@ async def ingest(
4949
file=current_file, data=[json_object], error=None
5050
)
5151
collected_bytes = b""
52+
if current_file:
53+
yield IngestedTokens(
54+
file=current_file,
55+
data=None,
56+
error=None,
57+
)
5258
current_file = chunk_bytes.file
5359

5460
collected_bytes += chunk_bytes.data
5561

56-
if current_file:
57-
json_objects = await self.extract_and_process_json(
58-
CollectedBytes(file=current_file, data=collected_bytes)
59-
)
60-
for json_object in json_objects:
61-
processed_json_object = await self.process_data(json_object)
62-
yield IngestedTokens(
63-
file=current_file, data=[processed_json_object], error=None
64-
)
65-
6662
except json.JSONDecodeError:
6763
yield IngestedTokens(
6864
file=current_file, data=None, error="JSON Decode Error"
6965
)
66+
finally:
67+
json_objects = await self.extract_and_process_json(
68+
CollectedBytes(file=current_file, data=collected_bytes)
69+
)
70+
for json_object in json_objects:
71+
processed_json_object = await self.process_data(json_object)
72+
yield IngestedTokens(
73+
file=current_file, data=[processed_json_object], error=None
74+
)
75+
yield IngestedTokens(file=current_file, data=None, error=None)
7076

7177
except Exception as e:
7278
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")

querent/ingestors/pdfs/pdf_ingestor_v1.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,11 @@ async def ingest(
5252
yield page_text
5353
collected_bytes = b""
5454
current_file = chunk_bytes.file
55+
yield IngestedTokens(
56+
file=current_file,
57+
data=None,
58+
error=None,
59+
)
5560
collected_bytes += chunk_bytes.data
5661
except Exception as e:
5762
# at the queue level, we can sample out the error
@@ -63,6 +68,8 @@ async def ingest(
6368
CollectedBytes(file=current_file, data=collected_bytes)
6469
):
6570
yield page_text
71+
72+
yield IngestedTokens(file=current_file, data=None, error=None)
6673
except Exception as exc:
6774
yield None
6875

querent/ingestors/ppt/ppt_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ async def ingest(
5454
)
5555
collected_bytes = b""
5656
current_file = chunk_bytes.file
57+
yield IngestedTokens(
58+
file=current_file,
59+
data=None,
60+
error=None,
61+
)
5762
collected_bytes += chunk_bytes.data
5863
except Exception as e:
5964
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -62,6 +67,7 @@ async def ingest(
6267
CollectedBytes(file=current_file, data=collected_bytes)
6368
):
6469
yield IngestedTokens(file=current_file, data=[slide_text], error=None)
70+
yield IngestedTokens(file=current_file, data=None, error=None)
6571

6672
async def extract_and_process_ppt(
6773
self, collected_bytes: CollectedBytes

querent/ingestors/texts/text_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ async def ingest(
5959
)
6060
collected_bytes = b""
6161
current_file = chunk_bytes.file
62+
yield IngestedTokens(
63+
file=current_file,
64+
data=None,
65+
error=None,
66+
)
6267

6368
collected_bytes += chunk_bytes.data
6469

@@ -71,6 +76,7 @@ async def ingest(
7176
data=[line], # Wrap line in a list
7277
error=None,
7378
)
79+
yield IngestedTokens(file=current_file, data=None, error=None)
7480
except Exception as e:
7581
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
7682

querent/ingestors/video/video_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,11 @@ async def ingest(
5050
yield IngestedTokens(file=current_file, data=[text], error=None)
5151
collected_bytes = b""
5252
current_file = chunk_bytes.file
53+
yield IngestedTokens(
54+
file=current_file,
55+
data=None,
56+
error=None,
57+
)
5358
collected_bytes += chunk_bytes.data
5459
except Exception as e:
5560
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -59,6 +64,7 @@ async def ingest(
5964
CollectedBytes(file=current_file, data=collected_bytes)
6065
):
6166
yield IngestedTokens(file=current_file, data=[text], error=None)
67+
yield IngestedTokens(file=current_file, data=None, error=None)
6268

6369
async def extract_and_process_video(
6470
self, collected_bytes: CollectedBytes

querent/ingestors/xlsx/xlsx_ingestor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,11 @@ async def ingest(
5555
)
5656
collected_bytes = b""
5757
current_file = chunk_bytes.file
58+
yield IngestedTokens(
59+
file=current_file,
60+
data=None,
61+
error=None,
62+
)
5863
collected_bytes += chunk_bytes.data
5964
except Exception as e:
6065
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -69,6 +74,7 @@ async def ingest(
6974
), # Convert DataFrame to a list of dictionaries
7075
error=None,
7176
)
77+
yield IngestedTokens(file=current_file, data=None, error=None)
7278

7379
async def extract_and_process_xlsx(
7480
self, collected_bytes: CollectedBytes

querent/ingestors/xml/xml_ingestor.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ async def ingest(
5454
yield IngestedTokens(file=current_file, data=[text], error=None)
5555
collected_bytes = b""
5656
current_file = chunk_bytes.file
57+
yield IngestedTokens(
58+
file=current_file,
59+
data=None,
60+
error=None,
61+
)
5762
collected_bytes += chunk_bytes.data
5863
except Exception as e:
5964
yield IngestedTokens(file=current_file, data=None, error=f"Exception: {e}")
@@ -63,6 +68,8 @@ async def ingest(
6368
):
6469
yield IngestedTokens(file=current_file, data=[text], error=None)
6570

71+
yield IngestedTokens(file=current_file, data=None, error=None)
72+
6673
async def extract_and_process_xml(
6774
self, collected_bytes: CollectedBytes
6875
) -> AsyncGenerator[str, None]:

tests/test_audio_ingestor.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,12 @@ async def test_collect_and_ingest_audio():
2929
async def poll_and_print():
3030
counter = 0
3131
async for ingested in ingested_call:
32-
assert ingested is not None
3332
assert ingested.error is None
3433
assert ingested.file is not None
35-
assert ingested.data is not None
36-
assert len(ingested.data) > 0
3734
counter += 1
3835

39-
assert counter == 1
36+
# counter is 2 because at the end of each file there is an empty IngestedTokens being yielded
37+
assert counter == 2
4038

4139
await poll_and_print()
4240

tests/test_code_ingestor.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,10 @@ async def poll_and_print():
2929
counter = 0
3030
async for ingested in ingested_call:
3131
assert ingested is not None
32-
if ingested is not "" or ingested is not None:
32+
if ingested.data != "" or ingested is not None:
3333
counter += 1
34-
assert counter == 2
34+
# counter is 2 though files are 4, that is because we are yielding an empty IngestedCode at the end of each file
35+
assert counter == 4
3536

3637
await poll_and_print()
3738

tests/test_csv_ingestor.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,9 @@ async def poll_and_print():
2929
assert ingested is not None
3030
assert ingested.error is None
3131
assert ingested.file is not None
32-
assert ingested.data is not None
33-
assert len(ingested.data) > 0
3432
counter += 1
35-
assert counter == 7
33+
# 2 extra IngestedTokens are representing end of file
34+
assert counter == 9
3635

3736
await poll_and_print()
3837

tests/test_doc_ingestor.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,10 +32,9 @@ async def poll_and_print():
3232
assert ingested is not None
3333
assert ingested.error is None
3434
assert ingested.file is not None
35-
assert ingested.data is not None
36-
assert len(ingested.data) > 0
3735
counter += 1
38-
assert counter == 2
36+
# 2 extra IngestedTokens are repreenting end of files
37+
assert counter == 4
3938

4039
await poll_and_print()
4140

tests/test_generic_ingestor.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ async def test_collect_and_ingest_generic_bytes():
3131
# Set up the collector
3232
collector_factory = SlackCollectorFactory()
3333
uri = Uri("slack://")
34-
print(os.getenv("SLACK_ACCESS_KEY"))
3534
config = get_collector_config()
3635
collector = collector_factory.resolve(uri, config)
3736

@@ -49,7 +48,6 @@ async def poll_and_print():
4948
async for ingested in ingested_call:
5049
assert ingested is not None
5150
if ingested is not "" or ingested is not None:
52-
# print(ingested)
5351
counter += 1
5452
assert counter == 23
5553

tests/test_github_ingestor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ async def poll_and_print():
4040
if ingested is not "" or ingested is not None:
4141
counter += 1
4242

43-
assert counter == 6
43+
# 6 extra IngestedTokens signifying end of file
44+
assert counter == 12
4445

4546
await poll_and_print() # Notice the use of await here
4647

0 commit comments

Comments
 (0)