implemented event fixes
ngupta10 committed Dec 18, 2023
1 parent 3924b24 commit b268b38
Showing 12 changed files with 400 additions and 43 deletions.
6 changes: 3 additions & 3 deletions README.md
@@ -184,7 +184,7 @@ class MockLLMEngine(BaseEngine):
         # can set the state of the LLM using the set_state method
         # The state of the LLM is stored in the state attribute of the LLM
         # The state of the LLM is published to subscribers of the LLM
-        current_state = EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, "anything")
+        current_state = EventState(EventType.ContextualTriples, 1.0, "anything")
         await self.set_state(new_state=current_state)

     def validate(self):
@@ -204,12 +204,12 @@ async def test_querent_with_base_llm():

     # Define a callback function to subscribe to state changes
     def state_change_callback(new_state):
-        assert new_state.event_type == EventType.CONTEXTUAL_TRIPLES
+        assert new_state.event_type == EventType.ContextualTriples

     # Subscribe to state change events
     # This pattern is ideal as we can expose multiple events for each use case of the LLM
-    llm_mocker.subscribe(EventType.CONTEXTUAL_TRIPLES, state_change_callback)
+    llm_mocker.subscribe(EventType.ContextualTriples, state_change_callback)

     ## one can also subscribe to other events, e.g. EventType.CHAT_COMPLETION ...
4 changes: 2 additions & 2 deletions docs/Algorith_Relationship Extraction.md
@@ -13,8 +13,8 @@

 ## Trigger for the Relationship Extraction to start work:

-Event Type : CONTEXTUAL_TRIPLES
-We have subscribed the relationship extractor class to the CONTEXTUAL_TRIPLES event type emitted by the BERT LLM class. When this event is emitted, the handle_event function is triggered inside the relationship extractor, which is responsible for extracting relationships. Below is the workflow of the relationship extractor:
+Event Type : ContextualTriples
+We have subscribed the relationship extractor class to the ContextualTriples event type emitted by the BERT LLM class. When this event is emitted, the handle_event function is triggered inside the relationship extractor, which is responsible for extracting relationships. Below is the workflow of the relationship extractor:
 <br /><br />

 ### I. Data Validation in RelationExtractor:<br />
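As a hedged illustration of the trigger described in this hunk (`bert_engine` and `relation_extractor` stand in for constructed instances; the callback signature follows the README's subscribe pattern, and the extractor entry point is assumed):

```python
from querent.common.types.querent_event import EventType

# Sketch only: wire the relationship extractor to ContextualTriples events.
def on_contextual_triples(event_state):
    # The workflow above describes handle_event firing inside the extractor;
    # delegating to process_tokens mirrors how bert_llm.py invokes it.
    relation_extractor.process_tokens(event_state)

bert_engine.subscribe(EventType.ContextualTriples, on_contextual_triples)
```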
14 changes: 7 additions & 7 deletions docs/Event_types_llm.md
@@ -8,14 +8,14 @@ The `EventType` class contains predefined constants that represent different kin

 ### Attributes

-- **`CONTEXTUAL_TRIPLES`**: Indicates an event involving the generation of contextual triples (text) in the system.
-- **`RDF_CONTEXTUAL_TRIPLES`**: Denotes an event involving the generation of RDF (Resource Description Framework) graph-formatted contextual triples in the system.
-- **`RDF_SEMANTIC_TRIPLES`**: Used for events involving the creation of RDF (Resource Description Framework) graph-formatted semantic triples.
-- **`CONTEXTUAL_EMBEDDING`**: Used for events involving the creation of vector embeddings of context.
+- **`ContextualTriples`**: Indicates an event involving the generation of contextual triples (text) in the system.
+- **`RdfContextualTriples`**: Denotes an event involving the generation of RDF (Resource Description Framework) graph-formatted contextual triples in the system.
+- **`RdfSemanticTriples`**: Used for events involving the creation of RDF (Resource Description Framework) graph-formatted semantic triples.
+- **`ContextualEmbeddings`**: Used for events involving the creation of vector embeddings of context.

 ### Usage Example (all event types generated from the LLM for now)

 ```python
-current_state = EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, filtered_triples)
-current_state = EventState(EventType.RDF_CONTEXTUAL_TRIPLES, 1.0, kgm.retrieve_triples())
-current_state = EventState(EventType.RDF_SEMANTIC_TRIPLES, 1.0, semantic_triples)
+current_state = EventState(EventType.ContextualTriples, 1.0, filtered_triples)
+current_state = EventState(EventType.RdfContextualTriples, 1.0, kgm.retrieve_triples())
+current_state = EventState(EventType.RdfSemanticTriples, 1.0, semantic_triples)
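For the consumer side (not shown in this hunk), a minimal sketch: since the `EventType` values are plain strings, a single callback can branch on `new_state.event_type` (`engine` stands in for any constructed engine instance):

```python
from querent.common.types.querent_event import EventType

def on_event(new_state):
    if new_state.event_type == EventType.ContextualTriples:
        ...  # plain-text contextual triples
    elif new_state.event_type == EventType.RdfContextualTriples:
        ...  # RDF graph-formatted contextual triples
    elif new_state.event_type == EventType.RdfSemanticTriples:
        ...  # RDF graph-formatted semantic triples

# Subscribe the same callback to each event type of interest.
for event_type in (EventType.ContextualTriples,
                   EventType.RdfContextualTriples,
                   EventType.RdfSemanticTriples):
    engine.subscribe(event_type, on_event)
```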
8 changes: 4 additions & 4 deletions querent/common/types/querent_event.py
@@ -10,10 +10,10 @@ class EventType:
         CHAT_COMPLETED (Literal["chat_completed"]): Event type for chat completion.
     """

-    CONTEXTUAL_TRIPLES = "contextual_triples"
-    RDF_CONTEXTUAL_TRIPLES = "rdf_contextual_triples"
-    RDF_SEMANTIC_TRIPLES = "rdf_semantic_triples"
-    CONTEXTUAL_EMBEDDING = "contextual_embedding"
+    ContextualTriples = "contextual_triples"
+    RdfContextualTriples = "rdf_contextual_triples"
+    RdfSemanticTriples = "rdf_semantic_triples"
+    ContextualEmbeddings = "contextual_embedding"


 class EventState:
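One consequence of this rename worth noting: only the attribute names change; the underlying string values are untouched, so any consumer or serialized state keyed on the value keeps working. A quick check against this module:

```python
from querent.common.types.querent_event import EventType

# The renamed attributes still carry the original wire values.
assert EventType.ContextualTriples == "contextual_triples"
assert EventType.RdfContextualTriples == "rdf_contextual_triples"
assert EventType.RdfSemanticTriples == "rdf_semantic_triples"
assert EventType.ContextualEmbeddings == "contextual_embedding"
```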
15 changes: 11 additions & 4 deletions querent/core/transformers/bert_llm.py
@@ -6,6 +6,7 @@
 from querent.core.transformers.relationship_extraction_llm import RelationExtractor
 from querent.kg.contextual_predicate import process_data
 from querent.kg.ner_helperfunctions.contextual_embeddings import EntityEmbeddingExtractor
+from querent.kg.ner_helperfunctions.fixed_entities import FixedEntityExtractor
 from querent.kg.ner_helperfunctions.graph_manager_contextual import KnowledgeGraphManager
 from querent.kg.ner_helperfunctions.ner_llm_transformer import NER_LLM
@@ -91,6 +92,10 @@ def __init__(
         self.fixed_relationships = config.fixed_relationships
         self.sample_relationships = config.sample_relationships
         self.user_context = config.user_context
+        if config.fixed_entities:
+            self.entity_context_extractor = FixedEntityExtractor(config.fixed_entities)
+        else:
+            self.entity_context_extractor = None


     def validate(self) -> bool:
@@ -140,6 +145,8 @@ async def process_tokens(self, data: IngestedTokens):
                 data.get_file_path(), data.data
             )
             if content:
+                if self.entity_context_extractor:
+                    content = self.entity_context_extractor.find_entity_sentences(content)
                 tokens = self.ner_llm_instance._tokenize_and_chunk(content)
                 for tokenized_sentence, original_sentence, sentence_idx in tokens:
                     (
@@ -179,16 +186,16 @@ async def process_tokens(self, data: IngestedTokens):
                     self.logger.log(f"Filtering in {self.__class__.__name__} producing 0 entity pairs. Filtering Disabled. ")
                 else:
                     filtered_triples = pairs_with_predicates
-                current_state = EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, filtered_triples)
+                current_state = EventState(EventType.ContextualTriples, 1.0, filtered_triples)
                 await self.set_state(new_state=current_state)
                 kgm = KnowledgeGraphManager()
                 kgm.feed_input(filtered_triples)
-                current_state = EventState(EventType.RDF_CONTEXTUAL_TRIPLES, 1.0, kgm.retrieve_triples())
+                current_state = EventState(EventType.RdfContextualTriples, 1.0, kgm.retrieve_triples())
                 await self.set_state(new_state=current_state)
                 mock_config = RelationshipExtractorConfig()
                 semantic_extractor = RelationExtractor(mock_config)
-                semantic_triples = semantic_extractor.process_tokens(EventState(EventType.CONTEXTUAL_TRIPLES, 1.0, filtered_triples))
-                current_state = EventState(EventType.RDF_SEMANTIC_TRIPLES, 1.0, semantic_triples)
+                semantic_triples = semantic_extractor.process_tokens(EventState(EventType.ContextualTriples, 1.0, filtered_triples))
+                current_state = EventState(EventType.RdfSemanticTriples, 1.0, semantic_triples)
                 await self.set_state(new_state=current_state)
         except Exception as e:
             self.logger.error(f"Invalid {self.__class__.__name__} configuration. Unable to process tokens. {e}")
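The two lines added inside `if content:` reduce the text to entity-bearing sentences before tokenization. A hedged sketch of that pre-filtering step in isolation (the entity list is made up; assumes the `en_core_web_lg` spaCy model is installed):

```python
from querent.kg.ner_helperfunctions.fixed_entities import FixedEntityExtractor

# Hypothetical user-configured entities, as config.fixed_entities would supply.
extractor = FixedEntityExtractor(["porosity", "permeability"])

content = (
    "The basin contains thick shale. Porosity varies with depth. "
    "An unrelated sentence about logistics. Permeability is low near faults."
)
# Keeps matching sentences plus their immediate neighbours.
filtered = extractor.find_entity_sentences(content)
print(filtered)
```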
61 changes: 48 additions & 13 deletions querent/kg/ner_helperfunctions/fixed_entities.py
@@ -1,27 +1,62 @@
 import spacy
 import re
 from typing import List
-class EntityContextExtractor:
-    def __init__(self, fixed_entities: List[str], model="en_core_web_sm"):
+"""
+A class for extracting sentences from a text that contain specified fixed entities.
+
+This class utilizes regular expressions and spaCy's NLP capabilities to identify and
+extract sentences from a given text which include any of the user-specified entities.
+It is useful in scenarios where focus is required on specific entities within large
+volumes of text.
+
+Attributes:
+    nlp (spacy.Language): An instance of spaCy's language model.
+    fixed_entities (List[str]): A list of entities (as strings) to search for in the text.
+    entity_pattern (re.Pattern): Compiled regex pattern for the fixed entities,
+        enabling case-insensitive searching.
+
+Methods:
+    find_entity_sentences(text: str) -> str:
+        Identifies and returns sentences from the provided text that contain any of the
+        fixed entities.
+    measure_reduction(original_text: str, reduced_text: str) -> float:
+        Calculates the percentage reduction in text length after extracting relevant sentences.
+"""
+
+class FixedEntityExtractor:
+    def __init__(self, fixed_entities: List[str], model="en_core_web_lg"):
         self.nlp = spacy.load(model)
         self.fixed_entities = fixed_entities
-        # Precompile regex patterns for faster searching
-        self.entity_patterns = [re.compile(re.escape(entity), re.IGNORECASE) for entity in self.fixed_entities]
+        self.entity_pattern = self.create_combined_pattern(fixed_entities)
+
+    def create_combined_pattern(self, entities):
+        combined_pattern = '|'.join(map(re.escape, entities))
+        return re.compile(r'\b(?:' + combined_pattern + r')\b', re.IGNORECASE)

-    def find_entity_sentences(self, text: str) -> str:
+    def find_entity_sentences(self, text: str, chunk_size=1000) -> str:
         doc = self.nlp(text)
-        sentences = list(doc.sents)
         relevant_sentences = set()
+        prev_sentence = None
+
+        for i in range(0, len(doc), chunk_size):
+            chunk = doc[i:i+chunk_size]
+            sentences = list(chunk.sents)
+
+            for j, sentence in enumerate(sentences):
+                sentence_text = sentence.text
+                if self.entity_pattern.search(sentence_text):
+                    # Add the previous, current, and next sentences
+                    if prev_sentence:
+                        relevant_sentences.add(prev_sentence.text)
+                    relevant_sentences.add(sentence_text)
+                    if j < len(sentences) - 1:  # Check if there is a next sentence
+                        relevant_sentences.add(sentences[j + 1].text)
+
+                prev_sentence = sentence

-        for i, sentence in enumerate(sentences):
-            sentence_text = sentence.text
-            if any(pattern.search(sentence_text) for pattern in self.entity_patterns):
-                relevant_sentences.add(max(i - 1, 0))
-                relevant_sentences.add(i)
-                relevant_sentences.add(min(i + 1, len(sentences) - 1))

-        return ' '.join(sentences[index].text for index in sorted(relevant_sentences))
+        return ' '.join(sorted(relevant_sentences))

     def measure_reduction(self, original_text: str, reduced_text: str) -> float:
         original_length = len(original_text)
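A short usage note on the rewritten extractor: `measure_reduction` reports how much text the filtering dropped, and because `find_entity_sentences` now joins `sorted(set(...))` of sentence strings, the surviving sentences come back in lexicographic rather than document order. A sketch (entity is hypothetical; assumes `en_core_web_lg` is installed):

```python
from querent.kg.ner_helperfunctions.fixed_entities import FixedEntityExtractor

extractor = FixedEntityExtractor(["GeoBERT"])  # hypothetical entity
text = (
    "Zulu filler sentence. GeoBERT appears here. Alpha filler sentence. "
    "A far-away sentence that gets dropped."
)
reduced = extractor.find_entity_sentences(text)
print(reduced)  # note: lexicographic order, not document order
print(f"{extractor.measure_reduction(text, reduced):.1f}% of characters removed")
```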
77 changes: 77 additions & 0 deletions querent/kg/rel_helperfunctions/fixed_relationships.py
@@ -0,0 +1,77 @@
+import spacy
+import re
+from typing import List
+from nltk.corpus import wordnet as wn
+
+"""
+A class designed to extract sentences from text that contain specified fixed relationships,
+taking into account synonyms for more comprehensive matching.
+
+This class utilizes regular expressions, spaCy's NLP capabilities, and the WordNet
+database to identify and extract sentences from a given text which include any of the
+user-specified relationships or their synonyms. It's particularly useful for focusing on
+specific types of relationships within large volumes of text.
+
+Attributes:
+    nlp (spacy.Language): An instance of spaCy's language model.
+    fixed_relationships (List[str]): A list of relationships (as strings) to search for in the text.
+    relationship_pattern (re.Pattern): Compiled regex pattern covering the fixed relationships
+        and their synonyms, enabling case-insensitive searching.
+
+Methods:
+    create_combined_pattern_with_synonyms(relationships: List[str]) -> re.Pattern:
+        Generates and compiles a single regex pattern that includes synonyms of the
+        specified relationships.
+    find_relationship_sentences(text: str) -> str:
+        Identifies and returns sentences from the provided text that contain any of the
+        fixed relationships or their synonyms.
+    measure_reduction(original_text: str, reduced_text: str) -> float:
+        Calculates the percentage reduction in text length after extracting relevant sentences.
+"""
+
+class FixedRelationshipExtractor:
+    def __init__(self, fixed_relationships: List[str], model="en_core_web_lg"):
+        self.nlp = spacy.load(model)
+        self.fixed_relationships = fixed_relationships
+        self.relationship_pattern = self.create_combined_pattern_with_synonyms(fixed_relationships)
+
+    def create_combined_pattern_with_synonyms(self, relationships):
+        all_synonyms = set()
+        for relationship in relationships:
+            all_synonyms.add(relationship)
+            for syn in wn.synsets(relationship, pos=wn.VERB):
+                for l in syn.lemmas():
+                    all_synonyms.add(l.name().replace('_', ' '))
+        combined_pattern = '|'.join(map(re.escape, all_synonyms))
+        print(combined_pattern)
+        return re.compile(r'\b(?:' + combined_pattern + r')\b', re.IGNORECASE)
+
+    def find_relationship_sentences(self, text: str, chunk_size=1000) -> str:
+        doc = self.nlp(text)
+        relevant_sentences = set()
+        prev_sentence = None
+
+        for i in range(0, len(doc), chunk_size):
+            chunk = doc[i:i+chunk_size]
+            sentences = list(chunk.sents)
+
+            for j, sentence in enumerate(sentences):
+                sentence_text = sentence.text
+                if self.relationship_pattern.search(sentence_text):
+                    # Add the previous, current, and next sentences
+                    if prev_sentence:
+                        relevant_sentences.add(prev_sentence.text)
+                    relevant_sentences.add(sentence_text)
+                    if j < len(sentences) - 1:  # Check if there is a next sentence
+                        relevant_sentences.add(sentences[j + 1].text)
+
+                prev_sentence = sentence
+
+        return ' '.join(sorted(relevant_sentences))
+
+    def measure_reduction(self, original_text: str, reduced_text: str) -> float:
+        original_length = len(original_text)
+        reduced_length = len(reduced_text)
+        reduction_percentage = ((original_length - reduced_length) / original_length) * 100
+        return reduction_percentage
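A usage sketch for the new extractor (assumes `en_core_web_lg` and the NLTK WordNet corpus are available). One caveat visible in the code above: the pattern is built from bare lemma strings, so base forms like "produce" or its synonym "make" match, but inflections such as "produces" do not:

```python
import nltk

nltk.download("wordnet")  # one-time fetch of the synonym database

from querent.kg.rel_helperfunctions.fixed_relationships import FixedRelationshipExtractor

extractor = FixedRelationshipExtractor(["produce"])
text = (
    "The wells produce oil steadily. Output rose in March. "
    "An unrelated staffing note. Another unrelated sentence."
)
# Keeps the matching sentence plus its neighbour; the two trailing
# unrelated sentences are dropped.
print(extractor.find_relationship_sentences(text))
```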