implemented event fixes

Querent-ai · Dec 18, 2023 · b268b38 · b268b38
1 parent 3924b24
commit b268b38
Show file tree

Hide file tree

Showing 12 changed files with 400 additions and 43 deletions.
diff --git a/README.md b/README.md
@@ -184,7 +184,7 @@ class MockLLMEngine(BaseEngine):
         # can set the state of the LLM using the set_state method
         # The state of the LLM is stored in the state attribute of the LLM
         # The state of the LLM is published to subscribers of the LLM
-        current_state = EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, "anything")
+        current_state = EventState(EventType.ContextualTriples, 1.0, "anything")
         await self.set_state(new_state=current_state)
 
     def validate(self):
@@ -204,12 +204,12 @@ async def test_querent_with_base_llm():
 
     # Define a callback function to subscribe to state changes
     def state_change_callback(new_state):
-        assert new_state.event_type == EventType.CONTEXTUAL_TRIPLES
+        assert new_state.event_type == EventType.ContextualTriples
 
 
     # Subscribe to state change events
     # This pattern is ideal as we can expose multiple events for each use case of the LLM
-    llm_mocker.subscribe(EventType.CONTEXTUAL_TRIPLES, state_change_callback)
+    llm_mocker.subscribe(EventType.ContextualTriples, state_change_callback)
 
     ## one can also subscribe to other events, e.g. EventType.CHAT_COMPLETION ...
 

diff --git a/docs/Algorith_Relationship Extraction.md b/docs/Algorith_Relationship Extraction.md
@@ -13,8 +13,8 @@
 
 ## Trigger for the Relationship Extraction to start work:
 
-Event Type : CONTEXTUAL_TRIPLES
-We have subscribed the relationship extractor class to CONTEXTUAL_TRIPLES event type emitted from the bert llm class.  When this event type signal is emitted, the handle_event function is triggered inside the relationship extractor which is responsible for extracting relationships. Below is the workflow of the relationship extractor:
+Event Type : ContextualTriples
+We have subscribed the relationship extractor class to ContextualTriples event type emitted from the bert llm class.  When this event type signal is emitted, the handle_event function is triggered inside the relationship extractor which is responsible for extracting relationships. Below is the workflow of the relationship extractor:
 <br /><br />
 
 ### I. Data Validation in RelationExtractor:<br />

diff --git a/docs/Event_types_llm.md b/docs/Event_types_llm.md
@@ -8,14 +8,14 @@ The `EventType` class contains predefined constants that represent different kin
 
 ### Attributes
 
-- **`CONTEXTUAL_TRIPLES`**: Indicates event involving the generation of contextual triples (text) in the system.
-- **`RDF_CONTEXTUAL_TRIPLES`**: Denotes event involving the generation of (Resource Description Framework) graph formatted contextual triples in the system.
-- **`RDF_SEMANTIC_TRIPLES`**: Used for events involving the creation of (Resource Description Framework) graph formatted semantic triples.
-- **`CONTEXTUAL_EMBEDDING`**: Used for events involving creation of Vector embeddings of context.
+- **`ContextualTriples`**: Indicates an event involving the generation of contextual triples (text) in the system.
+- **`RdfContextualTriples`**: Denotes an event involving the generation of RDF (Resource Description Framework) graph-formatted contextual triples in the system.
+- **`RdfSemanticTriples`**: Used for events involving the creation of RDF (Resource Description Framework) graph-formatted semantic triples.
+- **`ContextualEmbeddings`**: Used for events involving the creation of vector embeddings of context.
 
 ### Usage Example (All event types generated from llm for now)
 
 ```python
-current_state = EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, filtered_triples)
-current_state = EventState(EventType.RDF_CONTEXTUAL_TRIPLES, 1.0, kgm.retrieve_triples())
-current_state = EventState(EventType.RDF_SEMANTIC_TRIPLES, 1.0, semantic_triples)
+current_state = EventState(EventType.ContextualTriples, 1.0, filtered_triples)
+current_state = EventState(EventType.RdfContextualTriples, 1.0, kgm.retrieve_triples())
+current_state = EventState(EventType.RdfSemanticTriples, 1.0, semantic_triples)
diff --git a/querent/common/types/querent_event.py b/querent/common/types/querent_event.py
@@ -10,10 +10,10 @@ class EventType:
         CHAT_COMPLETED (Literal["chat_completed"]): Event type for chat completion.
     """
 
-    CONTEXTUAL_TRIPLES = "contextual_triples"
-    RDF_CONTEXTUAL_TRIPLES = "rdf_contextual_triples"
-    RDF_SEMANTIC_TRIPLES = "rdf_semantic_triples"
-    CONTEXTUAL_EMBEDDING = "contextual_embedding"
+    ContextualTriples = "contextual_triples"
+    RdfContextualTriples = "rdf_contextual_triples"
+    RdfSemanticTriples = "rdf_semantic_triples"
+    ContextualEmbeddings = "contextual_embedding"
 
 
 class EventState:

diff --git a/querent/core/transformers/bert_llm.py b/querent/core/transformers/bert_llm.py
@@ -6,6 +6,7 @@
 from querent.core.transformers.relationship_extraction_llm import RelationExtractor
 from querent.kg.contextual_predicate import process_data
 from querent.kg.ner_helperfunctions.contextual_embeddings import EntityEmbeddingExtractor
+from querent.kg.ner_helperfunctions.fixed_entities import FixedEntityExtractor
 from querent.kg.ner_helperfunctions.graph_manager_contextual import KnowledgeGraphManager
 from querent.kg.ner_helperfunctions.graph_manager_contextual import KnowledgeGraphManager
 from querent.kg.ner_helperfunctions.ner_llm_transformer import NER_LLM
@@ -91,6 +92,10 @@ def __init__(
         self.fixed_relationships = config.fixed_relationships
         self.sample_relationships = config.sample_relationships
         self.user_context = config.user_context
+        if config.fixed_entities:
+                self.entity_context_extractor = FixedEntityExtractor(config.fixed_entities)
+        else:
+                self.entity_context_extractor = None
 
 
     def validate(self) -> bool:
@@ -140,6 +145,8 @@ async def process_tokens(self, data: IngestedTokens):
                 data.get_file_path(), data.data
             )
             if content:
+                if self.entity_context_extractor:
+                    content = self.entity_context_extractor.find_entity_sentences(content)
                 tokens = self.ner_llm_instance._tokenize_and_chunk(content)
                 for tokenized_sentence, original_sentence, sentence_idx in tokens:
                     (
@@ -179,16 +186,16 @@ async def process_tokens(self, data: IngestedTokens):
                         self.logger.log(f"Filtering in {self.__class__.__name__} producing 0 entity pairs. Filtering Disabled. ")
                 else:
                     filtered_triples = pairs_with_predicates     
-                current_state = EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, filtered_triples)
+                current_state = EventState(EventType.ContextualTriples, 1.0, filtered_triples)
                 await self.set_state(new_state=current_state)
                 kgm = KnowledgeGraphManager()
                 kgm.feed_input(filtered_triples)
-                current_state = EventState(EventType.RDF_CONTEXTUAL_TRIPLES, 1.0, kgm.retrieve_triples())
+                current_state = EventState(EventType.RdfContextualTriples, 1.0, kgm.retrieve_triples())
                 await self.set_state(new_state=current_state)
                 mock_config = RelationshipExtractorConfig()
                 semantic_extractor = RelationExtractor(mock_config)
-                semantic_triples = semantic_extractor.process_tokens(EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, filtered_triples))
-                current_state = EventState(EventType.RDF_SEMANTIC_TRIPLES, 1.0, semantic_triples)                 
+                semantic_triples = semantic_extractor.process_tokens(EventState(EventType.ContextualTriples, 1.0, filtered_triples))
+                current_state = EventState(EventType.RdfSemanticTriples, 1.0, semantic_triples)                 
                 await self.set_state(new_state=current_state)
         except Exception as e:
             self.logger.error(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")

diff --git a/querent/kg/ner_helperfunctions/fixed_entities.py b/querent/kg/ner_helperfunctions/fixed_entities.py
@@ -1,27 +1,62 @@
 import spacy
 import re
 from typing import List
+"""
+    A class for extracting sentences from a text that contain specified fixed entities.
 
-class EntityContextExtractor:
-    def __init__(self, fixed_entities: List[str], model="en_core_web_sm"):
+    This class utilizes regular expressions and spaCy's NLP capabilities to identify and 
+    extract sentences from a given text which include any of the user-specified entities. 
+    It is useful in scenarios where focus is required on specific entities within large 
+    volumes of text.
+
+    Attributes:
+        nlp (spacy.Language): An instance of spaCy's language model.
+        fixed_entities (List[str]): A list of entities (as strings) to search for in the text.
+        entity_pattern (re.Pattern): A single compiled regex combining all fixed entities,
+                                     matched case-insensitively on word boundaries.
+
+    Methods:
+        find_entity_sentences(text: str) -> str:
+            Identifies and returns sentences from the provided text that contain any of the 
+            fixed entities.
+
+        measure_reduction(original_text: str, reduced_text: str) -> float:
+            Calculates the percentage reduction in text length after extracting relevant sentences.
+
+    """
+
+class FixedEntityExtractor:
+    def __init__(self, fixed_entities: List[str], model="en_core_web_lg"):
         self.nlp = spacy.load(model)
         self.fixed_entities = fixed_entities
-        # Precompile regex patterns for faster searching
-        self.entity_patterns = [re.compile(re.escape(entity), re.IGNORECASE) for entity in self.fixed_entities]
+        self.entity_pattern = self.create_combined_pattern(fixed_entities)
+
+    def create_combined_pattern(self, entities):
+        combined_pattern = '|'.join(map(re.escape, entities))
+        return re.compile(r'\b(?:' + combined_pattern + r')\b', re.IGNORECASE)
 
-    def find_entity_sentences(self, text: str) -> str:
+    def find_entity_sentences(self, text: str, chunk_size=1000) -> str:
         doc = self.nlp(text)
-        sentences = list(doc.sents)
         relevant_sentences = set()
+        prev_sentence = None
+
+        for i in range(0, len(doc), chunk_size):
+            chunk = doc[i:i+chunk_size]
+            sentences = list(chunk.sents)
+
+            for j, sentence in enumerate(sentences):
+                sentence_text = sentence.text
+                if self.entity_pattern.search(sentence_text):
+                    # Add the previous, current, and next sentences
+                    if prev_sentence:
+                        relevant_sentences.add(prev_sentence.text)
+                    relevant_sentences.add(sentence_text)
+                    if j < len(sentences) - 1:  # Check if there is a next sentence
+                        relevant_sentences.add(sentences[j + 1].text)
 
-        for i, sentence in enumerate(sentences):
-            sentence_text = sentence.text
-            if any(pattern.search(sentence_text) for pattern in self.entity_patterns):
-                relevant_sentences.add(max(i - 1, 0))
-                relevant_sentences.add(i)
-                relevant_sentences.add(min(i + 1, len(sentences) - 1))
+                prev_sentence = sentence
 
-        return ' '.join(sentences[index].text for index in sorted(relevant_sentences))
+        return ' '.join(sorted(relevant_sentences))
 
     def measure_reduction(self, original_text: str, reduced_text: str) -> float:
         original_length = len(original_text)

diff --git a/querent/kg/rel_helperfunctions/fixed_relationships.py b/querent/kg/rel_helperfunctions/fixed_relationships.py
@@ -0,0 +1,77 @@
+import spacy
+import re
+from typing import List
+from nltk.corpus import wordnet as wn
+"""
+    A class designed to extract sentences from text that contain specified fixed relationships,
+    taking into account synonyms for more comprehensive matching.
+
+    This class utilizes regular expressions, spaCy's NLP capabilities, and the WordNet 
+    database to identify and extract sentences from a given text which include any of the 
+    user-specified relationships or their synonyms. It's particularly useful for focusing on
+    specific types of relationships within large volumes of text.
+
+    Attributes:
+        nlp (spacy.Language): An instance of spaCy's language model.
+        fixed_relationships (List[str]): A list of relationships (as strings) to search for in the text.
+        relationship_pattern (re.Pattern): A single compiled regex combining the fixed relationships and their
+                                           WordNet verb synonyms, matched case-insensitively on word boundaries.
+
+    Methods:
+        create_combined_pattern_with_synonyms(relationships: List[str]) -> re.Pattern:
+            Builds and compiles one regex pattern covering the specified relationships and their synonyms.
+
+        find_relationship_sentences(text: str) -> str:
+            Identifies and returns sentences from the provided text that contain any of the 
+            fixed relationships or their synonyms.
+
+        measure_reduction(original_text: str, reduced_text: str) -> float:
+            Calculates the percentage reduction in text length after extracting relevant sentences.
+"""
+
+class FixedRelationshipExtractor:
+    def __init__(self, fixed_relationships: List[str], model="en_core_web_lg"):
+        self.nlp = spacy.load(model)
+        self.fixed_relationships = fixed_relationships
+        self.relationship_pattern = self.create_combined_pattern_with_synonyms(fixed_relationships)
+
+    def create_combined_pattern_with_synonyms(self, relationships):
+        all_synonyms = set()
+        for relationship in relationships:
+            all_synonyms.add(relationship)
+            for syn in wn.synsets(relationship, pos=wn.VERB):
+                for l in syn.lemmas():
+                    all_synonyms.add(l.name().replace('_', ' '))
+        combined_pattern = '|'.join(map(re.escape, all_synonyms))
+        print(combined_pattern)
+        return re.compile(r'\b(?:' + combined_pattern + r')\b', re.IGNORECASE)
+
+    def find_relationship_sentences(self, text: str, chunk_size=1000) -> str:
+        doc = self.nlp(text)
+        relevant_sentences = set()
+        prev_sentence = None
+
+        for i in range(0, len(doc), chunk_size):
+            chunk = doc[i:i+chunk_size]
+            sentences = list(chunk.sents)
+
+            for j, sentence in enumerate(sentences):
+                sentence_text = sentence.text
+                if self.relationship_pattern.search(sentence_text):
+                    # Add the previous, current, and next sentences
+                    if prev_sentence:
+                        relevant_sentences.add(prev_sentence.text)
+                    relevant_sentences.add(sentence_text)
+                    if j < len(sentences) - 1:  # Check if there is a next sentence
+                        relevant_sentences.add(sentences[j + 1].text)
+
+                prev_sentence = sentence
+
+        return ' '.join(sorted(relevant_sentences))
+
+
+    def measure_reduction(self, original_text: str, reduced_text: str) -> float:
+        original_length = len(original_text)
+        reduced_length = len(reduced_text)
+        reduction_percentage = ((original_length - reduced_length) / original_length) * 100
+        return reduction_percentage