diff --git a/querent/core/transformers/bert_ner_opensourcellm.py b/querent/core/transformers/bert_ner_opensourcellm.py
index 235fb470..25b9d997 100644
--- a/querent/core/transformers/bert_ner_opensourcellm.py
+++ b/querent/core/transformers/bert_ner_opensourcellm.py
@@ -209,14 +209,17 @@ async def process_tokens(self, data: IngestedTokens):
                     if self.sample_relationships:
                         embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
                     for triple in embedding_triples:
-                        graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
-                        if graph_json:
-                            current_state = EventState(EventType.Graph,1.0, graph_json, file)
-                            await self.set_state(new_state=current_state)
-                        vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
-                        if vector_json:
-                            current_state = EventState(EventType.Vector,1.0, vector_json, file)
-                            await self.set_state(new_state=current_state)
+                        if not self.termination_event.is_set():
+                            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
+                            if graph_json:
+                                current_state = EventState(EventType.Graph,1.0, graph_json, file)
+                                await self.set_state(new_state=current_state)
+                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
+                            if vector_json:
+                                current_state = EventState(EventType.Vector,1.0, vector_json, file)
+                                await self.set_state(new_state=current_state)
+                        else:
+                            return
                 else:
                     return
         else:
diff --git a/querent/core/transformers/fixed_entities_set_opensourcellm.py b/querent/core/transformers/fixed_entities_set_opensourcellm.py
index 204a355c..43c5b7b4 100644
--- a/querent/core/transformers/fixed_entities_set_opensourcellm.py
+++ b/querent/core/transformers/fixed_entities_set_opensourcellm.py
@@ -173,14 +173,17 @@ async def process_tokens(self, data: IngestedTokens):
                     if self.sample_relationships:
                         embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
                     for triple in embedding_triples:
-                        graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
-                        if graph_json:
-                            current_state = EventState(EventType.Graph,1.0, graph_json, file)
-                            await self.set_state(new_state=current_state)
-                        vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
-                        if vector_json:
-                            current_state = EventState(EventType.Vector,1.0, vector_json, file)
-                            await self.set_state(new_state=current_state)
+                        if not self.termination_event.is_set():
+                            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
+                            if graph_json:
+                                current_state = EventState(EventType.Graph,1.0, graph_json, file)
+                                await self.set_state(new_state=current_state)
+                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
+                            if vector_json:
+                                current_state = EventState(EventType.Vector,1.0, vector_json, file)
+                                await self.set_state(new_state=current_state)
+                        else:
+                            return
                 else:
                     return
         else:
diff --git a/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py b/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py
index 47bfd898..ca9885f5 100644
--- a/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py
+++ b/querent/core/transformers/gpt_llm_bert_ner_or_fixed_entities_set_ner.py
@@ -277,14 +277,17 @@ async def process_tokens(self, data: IngestedTokens):
                     if self.sample_relationships:
                         embedding_triples = self.predicate_context_extractor.process_predicate_types(embedding_triples)
                     for triple in embedding_triples:
-                        graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
-                        if graph_json:
-                            current_state = EventState(EventType.Graph,1.0, graph_json, file)
-                            await self.set_state(new_state=current_state)
-                        vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
-                        if vector_json:
-                            current_state = EventState(EventType.Vector,1.0, vector_json, file)
-                            await self.set_state(new_state=current_state)
+                        if not self.termination_event.is_set():
+                            graph_json = json.dumps(TripleToJsonConverter.convert_graphjson(triple))
+                            if graph_json:
+                                current_state = EventState(EventType.Graph,1.0, graph_json, file)
+                                await self.set_state(new_state=current_state)
+                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
+                            if vector_json:
+                                current_state = EventState(EventType.Vector,1.0, vector_json, file)
+                                await self.set_state(new_state=current_state)
+                        else:
+                            return
         except Exception as e:
             self.logger.error(f"Invalid {self.__class__.__name__} configuration. Unable to extract predicates using GPT. {e}")
             raise Exception(f"An error occurred while extracting predicates using GPT: {e}")
diff --git a/querent/core/transformers/gpt_llm_gpt_ner.py b/querent/core/transformers/gpt_llm_gpt_ner.py
index 6466d59c..5e4e5667 100644
--- a/querent/core/transformers/gpt_llm_gpt_ner.py
+++ b/querent/core/transformers/gpt_llm_gpt_ner.py
@@ -243,17 +243,20 @@ async def process_tokens(self, data: IngestedTokens):
                 final_triples = self.remove_duplicate_triplets(final_triples)
                 if len(final_triples) > 0:
                     for triple in final_triples:
-                        graph_json = json.dumps(triple)
-                        if graph_json:
-                            current_state = EventState(EventType.Graph,1.0, graph_json, file)
-                            await self.set_state(new_state=current_state)
-                        context_embeddings = self.create_emb.get_embeddings([triple['sentence']])[0]
-                        triple['context_embeddings'] = context_embeddings
-                        triple['context'] = triple['sentence']
-                        vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson((triple['subject'],json.dumps(triple), triple['object'])))
-                        if vector_json:
-                            current_state = EventState(EventType.Vector,1.0, vector_json, file)
+                        if not self.termination_event.is_set():
+                            graph_json = json.dumps(triple)
+                            if graph_json:
+                                current_state = EventState(EventType.Graph,1.0, graph_json, file)
                             await self.set_state(new_state=current_state)
+                            context_embeddings = self.create_emb.get_embeddings([triple['sentence']])[0]
+                            triple['context_embeddings'] = context_embeddings
+                            triple['context'] = triple['sentence']
+                            vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson((triple['subject'],json.dumps(triple), triple['object'])))
+                            if vector_json:
+                                current_state = EventState(EventType.Vector,1.0, vector_json, file)
+                                await self.set_state(new_state=current_state)
+                        else:
+                            return

         except Exception as e:
             self.logger.debug(f"Invalid {self.__class__.__name__} configuration. Unable to extract predicates using GPT NER LLM class. {e}")
diff --git a/requirements.txt b/requirements.txt
index c1a36e2d..91972ecf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -31,7 +31,7 @@ redis==5.0.3
 regex==2023.5.5
 sentence-transformers==2.2.2
 spacy==3.7.2
-uvicorn==0.22.0
+uvicorn==0.29.0
 slack-sdk==3.26.1
 pylint==2.17.4
 pytest-cov==4.1.0
@@ -39,7 +39,7 @@ pytest-mock==3.11.1
 tensorflow==2.14.0
 transformers==4.36.0
 torch==2.0.1 --index-url https://download.pytorch.org/whl/cpu
-pymupdf==1.23.26
+pymupdf==1.24.0
 asyncio==3.4.3
 prometheus-client==0.17.1
 rdflib==7.0.0
diff --git a/setup.py b/setup.py
index 0c785a2b..3ae9355b 100644
--- a/setup.py
+++ b/setup.py
@@ -36,7 +36,7 @@
         "regex==2023.5.5",
         "sentence-transformers==2.2.2",
         "spacy==3.7.2",
-        "uvicorn==0.22.0",
+        "uvicorn==0.29.0",
         "slack-sdk==3.26.1",
         "pylint==2.17.4",
         "pytest-cov==4.1.0",
@@ -50,7 +50,7 @@
         "pytest-asyncio==0.23.2",
         "pyshacl==0.25.0",
         "google-cloud-storage==2.14.0",
-        "PyMuPDF==1.23.26",
+        "PyMuPDF==1.24.0",
         "pydub==0.25.1",
         "SpeechRecognition==3.10.1",
         "pytesseract==0.3.10",
diff --git a/tests/workflows/test_multiple_collectors.py b/tests/workflows/test_multiple_collectors.py
index ae957bd5..b1bfe8b1 100644
--- a/tests/workflows/test_multiple_collectors.py
+++ b/tests/workflows/test_multiple_collectors.py
@@ -122,7 +122,7 @@ async def test_multiple_collectors_all_async():
             else:
                 unique_files.add(ingested_data.file)
                 counter += 1
-    assert counter == 86
+    assert counter == 85
     assert len(unique_files) > 1
     assert messages > 0