
Commit

Merge branch 'develop'
kristikotini committed Jun 26, 2024
2 parents ecea861 + 47a9c96 commit 4df0e51
Showing 37 changed files with 2,538 additions and 721 deletions.
117 changes: 26 additions & 91 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
@@ -2,6 +2,18 @@
import os
import json

def get_top_n_central_nodes(centrality_dict, n):
"""Sort nodes based on centrality measure and return top N nodes.
Args:
centrality_dict: Dictionary of nodes with their centrality values.
n: Number of top nodes to return.
Returns:
Sorted list of top N nodes with their centrality values.
"""
sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
return sorted_nodes[:n]
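
For illustration only (not part of this commit), a minimal usage sketch of the helper above, assuming it is in scope and NetworkX is installed:

import networkx as nx

G = nx.star_graph(3)                          # node 0 is the hub, nodes 1-3 are leaves
centrality = nx.degree_centrality(G)          # {0: 1.0, 1: 0.333..., 2: 0.333..., 3: 0.333...}
print(get_top_n_central_nodes(centrality, 2)) # [(0, 1.0), (1, 0.333...)]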

def analyze_graph_structure(G):
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
Expand All @@ -17,15 +29,12 @@ def analyze_graph_structure(G):
# Basic Graph Statistics
num_nodes = G.number_of_nodes() # Total number of nodes
num_edges = G.number_of_edges() # Total number of edges
density = nx.density(G) # Ratio of actual edges to possible edges (0 to 1)
average_degree = 2 * num_edges / num_nodes # Average number of edges per node

# Degree Distribution
degree_distribution = dict(G.degree())
# Degree distribution can indicate the presence of hubs or important nodes
if num_nodes == 0 or num_edges == 0:
raise ValueError("The graph is empty or not properly constructed.")

# Degree Centrality: Measures node connectivity
degree_centrality = nx.degree_centrality(G)

""" Centrality Measures
- Degree Centrality: Measures node connectivity
- Nodes with high degree centrality are important in the network
@@ -36,8 +45,9 @@ def analyze_graph_structure(G):
- Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
"""

betweenness_centrality = nx.betweenness_centrality(G)

# Betweenness Centrality: Measures node's control over information flow
betweenness_centrality = nx.betweenness_centrality(G)
"""
- Betweenness Centrality: Measures node's control over information flow
- Nodes with high betweenness centrality are important in the network
@@ -54,24 +64,8 @@ def analyze_graph_structure(G):
- Betweenness Centrality shows the dependency of the network on a node
"""

# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
closeness_centrality = nx.closeness_centrality(G)

"""
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
- Nodes with high closeness centrality are important in the network
Examples: 4 nodes are connected
0
/ | \
2--1--3
- Here, nodes 0 and 1 (1.0) have the highest closeness centrality because they are connected to all other nodes (nodes 2 and 3 = 0.75)
- Closeness Centrality shows the average distance of a node to all other nodes in the network
"""

# - Eigenvector Centrality: Measures influence of a node in a network

# eigenvector centrality measures the influence of a node in a network
eigenvector_centrality = nx.eigenvector_centrality(G)

"""
@@ -92,78 +86,16 @@
"""

# Community Structure
# - Louvain Algorithm (for community detection)
communities = list(nx.community.greedy_modularity_communities(G))
community_sizes = [len(community) for community in communities]
num_communities = len(communities)
# Communities can reveal modular structures in the graph
"""
- Community Detection: Identifying groups of nodes that are more densely connected to each other than to the rest of the network
- Communities can reveal modular structures in the graph
Examples: 7 nodes are connected
1
/ \
2-----3
\ / 5
4-----/ \
6-----7
- Here, nodes 1, 2, 3, 4 are in one community and nodes 5, 6, 7 are in another community
"""

# Graph Connectivity
# - Check if the graph is connected
is_connected = nx.is_connected(G)
# - Calculate diameter: Longest shortest path between any two nodes
diameter = nx.diameter(G) if is_connected else float('inf')
# - Average shortest path length: Average of all shortest paths in the graph
average_shortest_path_length = nx.average_shortest_path_length(G) if is_connected else float('inf')

# Clustering Coefficient
# - Measures the degree to which nodes tend to cluster together
average_clustering_coefficient = nx.average_clustering(G)

# Assortativity
# - Measures the similarity of connections in the graph with respect to node degree
assortativity = nx.degree_assortativity_coefficient(G)

# Graph Diameter and Radius
# - Diameter: Longest shortest path in the graph
# - Radius: Minimum eccentricity of any node
radius = nx.radius(G) if is_connected else float('inf')

# Graph Transitivity
# - Measures the overall probability for the network to have adjacent nodes interconnected
transitivity = nx.transitivity(G)
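
A quick illustration of these connectivity metrics on a 4-node path (not part of the commit; values from NetworkX):

import networkx as nx

G = nx.path_graph(4)                                  # 0 - 1 - 2 - 3
print(nx.is_connected(G))                             # True
print(nx.diameter(G), nx.radius(G))                   # 3 2
print(nx.average_shortest_path_length(G))             # ~1.67
print(nx.average_clustering(G), nx.transitivity(G))   # 0.0 0.0 (no triangles)
print(nx.degree_assortativity_coefficient(G))         # ~ -0.5 (ends of low degree attach to higher-degree middle nodes)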

# Return a dictionary containing the structural information
graph_info = {
"num_nodes": num_nodes,
"num_edges": num_edges,
"density": density,
"average_degree": average_degree,
"degree_distribution": degree_distribution,
"degree_centrality": degree_centrality,
"betweenness_centrality": betweenness_centrality,
"closeness_centrality": closeness_centrality,
"eigenvector_centrality": eigenvector_centrality,
"num_communities": num_communities,
"community_sizes": community_sizes,
"is_connected": is_connected,
"diameter": diameter,
"average_shortest_path_length": average_shortest_path_length,
"average_clustering_coefficient": average_clustering_coefficient,
"assortativity": assortativity,
"radius": radius,
"transitivity": transitivity
"top_degree_centrality": get_top_n_central_nodes(degree_centrality, top_n),
"top_betweenness_centrality": get_top_n_central_nodes(betweenness_centrality, top_n),
"top_eigenvector_centrality": get_top_n_central_nodes(eigenvector_centrality, top_n)
}

return graph_info
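
An end-to-end usage sketch (illustrative only; analyze_graph_structure reads the module-level top_n defined further down, so it is set explicitly here):

import networkx as nx

top_n = 5
G = nx.karate_club_graph()                       # small built-in social network, 34 nodes / 78 edges
info = analyze_graph_structure(G)
print(info["num_nodes"], info["num_edges"])      # 34 78
print(info["top_degree_centrality"])             # the 5 best-connected members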


def print_graph_info(graph_info):
"""Prints the graph information in a formatted and readable way.
@@ -176,9 +108,12 @@ def print_graph_info(graph_info):

graph_directory = os.fsencode("../.media/graphs/")


top_n = int(input("Enter the number of top nodes to display: "))

with os.scandir("./Project/backend/codebase/.media/graphs/") as it:
for entry in it:
if entry.name.endswith(".gml") and entry.is_file():
if entry.name.endswith("c.gml") and entry.is_file():
print("-----------------------")
print(f"Filename: {entry.name}")
graph = nx.read_gml(entry.path)
43 changes: 25 additions & 18 deletions Project/backend/codebase/graph_creator/gemini.py
@@ -1,6 +1,8 @@
import os
from datetime import datetime

import google.generativeai as genai

from graph_creator.services.json_handler import transform_llm_output_to_dict


@@ -36,27 +38,29 @@ def extract_entities_and_relations(chunk, genai_client):
"""
SYS_PROMPT = (
"Only answer in a JSON format. \n"
"You are a network graph maker who extracts terms and their relations from a given context. "
"You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
"of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
"Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
"\tTerms may include object, entity, location, organization, person, \n"
"\tcondition, acronym, documents, service, concept, etc.\n"
"\tTerms should be as atomistic as possible\n\n"
"Thought 2: Think about how these terms can have one on one relation with other terms.\n"
"\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
"\tTerms can be related to many other terms\n\n"
"Thought 3: Find out the relation between each such related pair of terms. \n\n"
"Format your output as a list of JSON. Each element of the list contains a pair of terms"
"You are a network graph maker who extracts the most critical terms and their relations from a given context. "
"You are provided with a context chunk (delimited by ```). Your task is to extract the ontology "
"of the few key terms that are indispensable for understanding the given context. These terms should represent the core concepts as per the context. \n"
"Thought 1: Identify the few most critical terms in the entire context.\n"
"\tTerms may include only the most significant objects, entities, locations, organizations, people, conditions, acronyms, documents, services, concepts, etc.\n"
"\tExclude all terms that are not crucial to the core message.\n"
"\tDo not extract a term from every sentence; focus only on the most important terms across the entire context.\n\n"
"Thought 2: Determine how these indispensable terms are directly related to each other.\n"
"\tTerms that are mentioned in the same sentence or paragraph are typically related to each other.\n"
"\tFocus solely on relationships that reveal the most critical interactions or dependencies, ignoring all minor details.\n\n"
"Thought 3: Identify the specific type of relationship between each related pair of terms.\n"
"\tEnsure the relationship is crucial, highly relevant, and necessary for understanding the context.\n\n"
"Format your output as a list of JSON. Each element of the list contains a pair of terms "
"and the relation between them, like the following: \n"
"[\n"
" {\n"
' "node_1": "A concept from extracted ontology",\n'
' "node_2": "A related concept from extracted ontology",\n'
' "node_1": "A core concept from extracted ontology",\n'
' "node_2": "A related core concept from extracted ontology",\n'
' "edge": "relationship between the two concepts, node_1 and node_2"\n'
" }, {...}\n"
"]"
)
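
For reference, a hypothetical well-formed model response to this prompt and how it could be parsed (the entities below are invented for illustration):

import json

raw_output = '[{"node_1": "knowledge graph", "node_2": "entity extraction", "edge": "is built from"}]'
relations = json.loads(raw_output)
print(relations[0]["node_1"], "->", relations[0]["edge"], "->", relations[0]["node_2"])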

USER_PROMPT = f"context: ```{chunk}``` \n\n output: "

chat_session = genai_client.start_chat(history=[])
@@ -74,9 +78,12 @@ def check_for_connecting_relation(
"""
SYS_PROMPT = (
"Only answer in JSON format. \n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 with any entity of list_2.\n"
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk (delimited by ```)."
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and list_2:\n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 "
"with any entity of list_2.\n "
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk ("
"delimited by ```). "
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and "
"list_2:\n "
f"list_1: {entities_component_1}\n"
f"list_2: {entities_component_2}\n"
"Only use the exact entities given in the lists."
@@ -110,7 +117,7 @@ def check_for_connecting_relation_(
The text chunk to be processed
entities_component_1 : list
List of entities
entities_component_1 : list
entities_component_2 : list
List of entities
Returns
16 changes: 9 additions & 7 deletions Project/backend/codebase/graph_creator/graph_creator_main.py
@@ -1,12 +1,15 @@
import logging
import mimetypes
import pandas

from graph_creator import graph_handler
from graph_creator import pdf_handler
from graph_creator.llama3 import process_chunks as groq_process_chunks
from graph_creator.models.graph_job import GraphJob
from graph_creator import pdf_handler
from graph_creator import graph_handler
from graph_creator.services import netx_graphdb

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def process_file_to_graph(g_job: GraphJob):
"""
@@ -58,11 +61,9 @@ def process_file_to_entities_and_relations(file: str):
] # Assuming chunk has 'page_content' attribute

# Generate response using LLM
# response_json = process_chunks(text_chunks, prompt_template)
response_json = groq_process_chunks(text_chunks)
print(response_json)
except Exception as e:
print(e)
logging.error(e)
response_json = None

return response_json, chunks
@@ -83,6 +84,7 @@ def create_and_store_graph(uuid, entities_and_relations, chunks):

# combine knowledge graph pieces
# combined = graph_handler.connect_with_chunk_proximity(df_e_and_r)
# combined['chunk_id'] = '1'
for i in range(len(chunks)):
chunks[i] = chunks[i].dict()
combined = graph_handler.connect_with_llm(df_e_and_r, chunks, 30)
Expand All @@ -91,7 +93,7 @@ def create_and_store_graph(uuid, entities_and_relations, chunks):
graph_db_service = netx_graphdb.NetXGraphDB()

# read entities and relations
graph = graph_db_service.create_graph_from_df(combined)
graph = graph_db_service.create_graph_from_df(combined, chunks)

# save graph as file
graph_db_service.save_graph(uuid, graph)
