algo improve and removed prints
ngupta10 committed Jun 7, 2024
1 parent 3357142 commit c52a437
Showing 7 changed files with 50 additions and 179 deletions.
4 changes: 2 additions & 2 deletions querent/config/core/llm_config.py
@@ -13,8 +13,8 @@ class LLM_Config(EngineConfig):
    ner_model_name: str = "English"
    spacy_model_path: str = 'en_core_web_lg'
    nltk_path: str = '/model/nltk_data'
-   rel_model_type: str = 'llama'
-   rel_model_path: str = ''
+   rel_model_type: str = 'bert'
+   rel_model_path: str = 'bert-base-uncased'
    grammar_file_path: str = './querent/kg/rel_helperfunctions/json.gbnf'
    emb_model_name: str = 'sentence-transformers/all-MiniLM-L6-v2'
    user_context: str = Field(default="In a semantic triple (Subject, Predicate & Object) framework, determine which of the above entity is the subject and which is the object based on the context along with the predicate between these entities. Please also identify the subject type, object type & predicate type.")
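With these new defaults, the attention-based relation extractor resolves to the Hugging Face bert-base-uncased checkpoint out of the box instead of expecting a local llama model path. A minimal sanity check of the new defaults — a sketch assuming LLM_Config can be instantiated without required arguments:

    from querent.config.core.llm_config import LLM_Config

    config = LLM_Config()           # hypothetical no-arg construction
    print(config.rel_model_type)    # 'bert'
    print(config.rel_model_path)    # 'bert-base-uncased'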
36 changes: 26 additions & 10 deletions querent/core/transformers/bert_ner_opensourcellm.py
@@ -102,8 +102,9 @@ def _initialize_extractors(self, config):
            self.semantic_extractor = RelationExtractor(mock_config, self.create_emb)

        elif not self.skip_inferences and self.attn_based_rel_extraction == True:
+           # config.rel_model_path = 'bert-base-uncased'
+           config.rel_model_path = self.ner_model_initialized
            model_config = AutoConfig.from_pretrained(config.rel_model_path)
-           print("Model Config -------------", model_config)
            if 'bert' in model_config.model_type.lower():
                self.ner_helper_instance = NER_LLM(ner_model_name=config.rel_model_path)
                self.ner_helper_tokenizer = self.ner_helper_instance.ner_tokenizer
@@ -115,9 +116,7 @@ def _initialize_extractors(self, config):
                # self.ner_tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
                # self.model = transformers.AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
                # self.ner_helper_instance = NER_LLM(provided_tokenizer =self.ner_tokenizer, provided_model=self.model)
-               print("Loaded Model-------------11")
                self.model = transformers.AutoModelForCausalLM.from_pretrained(config.rel_model_path,trust_remote_code=True)
-               print("Loaded Model-------------")
                self.ner_helper_instance = NER_LLM(ner_model_name= config.rel_model_path, provided_model=self.model)
                self.ner_helper_tokenizer = self.ner_helper_instance.ner_tokenizer
                self.ner_helper_model = self.ner_helper_instance.ner_model
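The dispatch above keys off AutoConfig.model_type: encoder-style (BERT-family) checkpoints go through the token-classification NER path, while anything else is loaded as a causal LM with trust_remote_code=True. A standalone sketch of that branching — pick_rel_backend is a hypothetical helper, not part of querent:

    from transformers import AutoConfig

    def pick_rel_backend(rel_model_path: str) -> str:
        # Inspect the checkpoint's config without downloading weights.
        model_config = AutoConfig.from_pretrained(rel_model_path)
        if 'bert' in model_config.model_type.lower():
            return 'ner-encoder'   # NER_LLM token-classification path
        return 'causal-lm'         # AutoModelForCausalLM path

    print(pick_rel_backend('bert-base-uncased'))  # 'ner-encoder'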
@@ -245,7 +244,18 @@ async def process_images(self, data: IngestedImages):
                if graph_json:
                    current_state = EventState(event_type=EventType.Graph, timestamp=time.time(), payload=graph_json, file=file, doc_source=doc_source, image_id=unique_id)
                    await self.set_state(new_state=current_state)
-               vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(updated_tuple, blob))
+               subject, json_str, object_ = updated_tuple
+               context = json.loads(json_str)
+               sen_emb = self.create_emb.get_embeddings([context['context']])[0]
+               sub_emb = self.create_emb.get_embeddings(subject)[0]
+               obj_emb = self.create_emb.get_embeddings(object_)[0]
+               predicate_score=1
+               final_emb = TripleToJsonConverter.dynamic_weighted_average_embeddings(
+                   [sub_emb, obj_emb, sen_emb],
+                   base_weights=[predicate_score, predicate_score, 3],
+                   normalize_weights=True  # Normalize weights to ensure they sum to 1
+               )
+               vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(updated_tuple, blob, final_emb))
                if vector_json:
                    current_state = EventState(event_type=EventType.Vector, timestamp=time.time(), payload=vector_json, file=file, doc_source=doc_source, image_id=unique_id)
                    await self.set_state(new_state=current_state)
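This hunk is the "algo improve" half of the commit: rather than letting convert_vectorjson derive its own embedding, the subject, object, and sentence context are embedded separately and fused into a single vector, with the sentence context carrying a base weight of 3 against a fixed predicate score of 1 per entity. The diff only shows the call site, so here is a sketch of what dynamic_weighted_average_embeddings plausibly computes — the body below is an assumption; only the name, arguments, and weighting scheme come from the diff:

    import numpy as np

    def dynamic_weighted_average_embeddings(embeddings, base_weights, normalize_weights=True):
        # Assumed behavior: scale each embedding by its weight, then sum.
        weights = np.asarray(base_weights, dtype=float)
        if normalize_weights:
            weights = weights / weights.sum()   # weights now sum to 1
        stacked = np.stack([np.asarray(e, dtype=float) for e in embeddings])
        return (weights[:, None] * stacked).sum(axis=0)

    # base_weights=[1, 1, 3] normalizes to [0.2, 0.2, 0.6], so the
    # sentence embedding dominates the fused vector:
    sub, obj, sen = np.ones(3), np.zeros(3), np.full(3, 2.0)
    print(dynamic_weighted_average_embeddings([sub, obj, sen], [1, 1, 3]))  # [1.4 1.4 1.4]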
Expand Down Expand Up @@ -346,7 +356,6 @@ def _process_entity_types(self, doc_entity_pairs):
doc_entity_pairs = self.entity_context_extractor.process_entity_types(doc_entities=doc_entity_pairs)
if any(doc_entity_pairs):
doc_entity_pairs = self.ner_llm_instance.remove_duplicates(doc_entity_pairs)
print("Binary Pairs -------------", doc_entity_pairs)
return doc_entity_pairs

def _process_pairs_with_embeddings(self, pairs_withattn, file):
@@ -377,11 +386,8 @@ async def _process_relationships(self, filtered_triples, file, doc_source):
                fixed_entities=(len(self.sample_entities) >= 1)
            )
        else:
-           print("Trimming -----")
            filtered_triples = trim_triples(filtered_triples)
-           print("Filtereddddddd Triples ------------", len(filtered_triples))
            relationships = process_tokens(filtered_triples=filtered_triples, ner_instance=self.ner_helper_instance, extractor=self.extractor, nlp_model=self.nlp_model)
-           print("Predicates Triples From Attn Method----", relationships)
        if not relationships:
            return

@@ -419,8 +425,18 @@ async def _process_embedding_triples(self, embedding_triples, file, doc_source):
                    doc_source=doc_source
                )
                await self.set_state(new_state=current_state)
-
-               vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple))
+               subject, json_str, object_ = triple
+               context = json.loads(json_str)
+               sen_emb = self.create_emb.get_embeddings([context['context']])[0]
+               sub_emb = self.create_emb.get_embeddings(subject)[0]
+               obj_emb = self.create_emb.get_embeddings(object_)[0]
+               predicate_score=context['score']
+               final_emb = TripleToJsonConverter.dynamic_weighted_average_embeddings(
+                   [sub_emb, obj_emb, sen_emb],
+                   base_weights=[predicate_score, predicate_score, 3],
+                   normalize_weights=True  # Normalize weights to ensure they sum to 1
+               )
+               vector_json = json.dumps(TripleToJsonConverter.convert_vectorjson(triple=triple, embeddings=final_emb))
                if vector_json:
                    current_state = EventState(
                        event_type=EventType.Vector,
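Same fusion as in process_images, but here the predicate score comes from the extracted context ('score') rather than being pinned to 1, so the entities' share of the fused vector scales with relation confidence. Worked through: a predicate score of 0.5 gives base weights [0.5, 0.5, 3], which normalize to [0.125, 0.125, 0.75]; a fully confident score of 1 gives [0.2, 0.2, 0.6].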
8 changes: 0 additions & 8 deletions querent/kg/ner_helperfunctions/ner_llm_transformer.py
@@ -169,7 +169,6 @@ def transform_entity_pairs(self, entity_pairs):

            return transformed_pairs
        except Exception as e:
-           print("EEEEEEEEEEEEEE", e)
            self.logger.error(f"Error transforming entity pairs: {e}")
            raise Exception(f"Error transforming entity pairs: {e}")

@@ -223,7 +222,6 @@ def combine_entities_wordpiece(self, entities: List[dict], tokens: List[str]):
                combined_entities.append(entity)
            i += 1
        final_entities = []
-       print("Combined Entitiesssssss----------", combined_entities)
        for entity in combined_entities:
            entity_text = entity["entity"]
            start_idx = entity["start_idx"]
@@ -344,13 +342,10 @@ def filter_matching_entities(self, tuples_nested_list, entities_nested_list):
        return matched_tuples

    def find_subword_indices(self, text, entity):
-       print("entity----", entity)
        subwords = self.ner_tokenizer.tokenize(entity)
        subword_ids = self.ner_tokenizer.convert_tokens_to_ids(subwords)
        token_ids = self.ner_tokenizer.convert_tokens_to_ids(self.ner_tokenizer.tokenize(text))
        subword_positions = []
-       print("entity----", subwords)
-       print("entity -----------", self.ner_tokenizer.tokenize(text))
        for i in range(len(token_ids) - len(subword_ids) + 1):
            if token_ids[i:i + len(subword_ids)] == subword_ids:
                subword_positions.append((i+1, i + len(subword_ids)))
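For context on the method losing its prints: find_subword_indices locates an entity inside a sentence at the subword level by tokenizing both, converting them to token ids, and sliding the entity-id window across the sentence ids, recording a one-shifted (start, end) pair on each exact match (the shift presumably accounts for a leading special token such as [CLS]). A self-contained sketch of the same matching loop, assuming transformers and the bert-base-uncased tokenizer are available:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

    def find_subword_indices(text, entity):
        subword_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(entity))
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        positions = []
        # Slide the entity window across the sentence ids.
        for i in range(len(token_ids) - len(subword_ids) + 1):
            if token_ids[i:i + len(subword_ids)] == subword_ids:
                positions.append((i + 1, i + len(subword_ids)))
        return positions

    print(find_subword_indices("i love new york", "new york"))  # [(3, 4)]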
@@ -368,17 +363,14 @@ def extract_entities_from_sentence(self, sentence: str, sentence_idx: int, all_s
        tokens = self.tokenize_sentence(sentence)
        chunks = self.get_chunks(tokens)
        all_entities = []
-       print("Tokenssssss-----", tokens)
        for chunk in chunks:
            if fixed_entities_flag == False:
                entities = self.extract_entities_from_chunk(chunk)
            else:
                entities = self.extract_fixed_entities_from_chunk(chunk,fixed_entities, entity_types)
            all_entities.extend(entities)
-       print("Before Wordpiece---------------", all_entities)
        final_entities = self.combine_entities_wordpiece(all_entities, tokens)
        if fixed_entities_flag == False:
-           print("Final Entities ----", final_entities)
            parsed_entities = Dependency_Parsing(entities=final_entities, sentence=sentence, model=NER_LLM.nlp)
            entities_withnnchunk = parsed_entities.entities
        else: