
Commit

Merge branch 'develop'
kristikotini committed Jun 26, 2024
2 parents ecea861 + 47a9c96 commit 4df0e51
Showing 37 changed files with 2,538 additions and 721 deletions.
117 changes: 26 additions & 91 deletions Project/backend/codebase/graph_analysis/graph_analysis.py
@@ -2,6 +2,18 @@
import os
import json

def get_top_n_central_nodes(centrality_dict, n):
"""Sort nodes based on centrality measure and return top N nodes.
Args:
centrality_dict: Dictionary of nodes with their centrality values.
n: Number of top nodes to return.
Returns:
Sorted list of top N nodes with their centrality values.
"""
sorted_nodes = sorted(centrality_dict.items(), key=lambda item: item[1], reverse=True)
return sorted_nodes[:n]
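
For illustration only (not part of this commit), a minimal usage sketch of the helper above, assuming it is in scope and NetworkX is installed:

import networkx as nx

G = nx.star_graph(3)                          # node 0 is the hub, nodes 1-3 are leaves
centrality = nx.degree_centrality(G)          # {0: 1.0, 1: 0.333..., 2: 0.333..., 3: 0.333...}
print(get_top_n_central_nodes(centrality, 2)) # [(0, 1.0), (1, 0.333...)]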

def analyze_graph_structure(G):
"""Analyzes the structure of a knowledge graph and provides hopefully useful information.
Expand All @@ -17,15 +29,12 @@ def analyze_graph_structure(G):
# Basic Graph Statistics
num_nodes = G.number_of_nodes() # Total number of nodes
num_edges = G.number_of_edges() # Total number of edges
density = nx.density(G) # Ratio of actual edges to possible edges (0 to 1)
average_degree = 2 * num_edges / num_nodes # Average number of edges per node

# Degree Distribution
degree_distribution = dict(G.degree())
# Degree distribution can indicate the presence of hubs or important nodes
if num_nodes == 0 or num_edges == 0:
raise ValueError("The graph is empty or not properly constructed.")

# Degree Centrality: Measures node connectivity
degree_centrality = nx.degree_centrality(G)

""" Centrality Measures
- Degree Centrality: Measures node connectivity
- Nodes with high degree centrality are important in the network
@@ -36,8 +45,9 @@ def analyze_graph_structure(G):
- Degree Centrality: node1 = 0.33(1/3), node2 = 0.66(2/3), node3 = 0.33(1/3)
"""

betweenness_centrality = nx.betweenness_centrality(G)

# Betweenness Centrality: Measures node's control over information flow
betweenness_centrality = nx.betweenness_centrality(G)
"""
- Betweenness Centrality: Measures node's control over information flow
- Nodes with high betweenness centrality are important in the network
@@ -54,24 +64,8 @@ def analyze_graph_structure(G):
- Betweenness Centrality shows the dependency of the network on a node
"""

# - Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
closeness_centrality = nx.closeness_centrality(G)

"""
- Closeness Centrality: Measures average length of the shortest path from a node to all other nodes
- Nodes with high closeness centrality are important in the network
Examples: 4 nodes are connected
0
/ | \
2--1--3
- Here, nodes 0 and 1 (1.0) have the highest closeness centrality because they are connected to all other nodes (nodes 2 and 3 = 0.75)
- Closeness Centrality shows the average distance of a node to all other nodes in the network
"""

# - Eigenvector Centrality: Measures influence of a node in a network

# eigenvector centrality measures the influence of a node in a network
eigenvector_centrality = nx.eigenvector_centrality(G)

"""
@@ -92,78 +86,16 @@
"""

# Community Structure
# - Louvain Algorithm (for community detection)
communities = list(nx.community.greedy_modularity_communities(G))
community_sizes = [len(community) for community in communities]
num_communities = len(communities)
# Communities can reveal modular structures in the graph
"""
- Community Detection: Identifying groups of nodes that are more densely connected to each other than to the rest of the network
- Communities can reveal modular structures in the graph
Examples: 7 nodes are connected
1
/ \
2-----3
\ / 5
4-----/ \
6-----7
- Here, nodes 1, 2, 3, 4 are in one community and nodes 5, 6, 7 are in another community
"""

# Graph Connectivity
# - Check if the graph is connected
is_connected = nx.is_connected(G)
# - Calculate diameter: Longest shortest path between any two nodes
diameter = nx.diameter(G) if is_connected else float('inf')
# - Average shortest path length: Average of all shortest paths in the graph
average_shortest_path_length = nx.average_shortest_path_length(G) if is_connected else float('inf')

# Clustering Coefficient
# - Measures the degree to which nodes tend to cluster together
average_clustering_coefficient = nx.average_clustering(G)

# Assortativity
# - Measures the similarity of connections in the graph with respect to node degree
assortativity = nx.degree_assortativity_coefficient(G)

# Graph Diameter and Radius
# - Diameter: Longest shortest path in the graph
# - Radius: Minimum eccentricity of any node
radius = nx.radius(G) if is_connected else float('inf')

# Graph Transitivity
# - Measures the overall probability for the network to have adjacent nodes interconnected
transitivity = nx.transitivity(G)
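
A quick illustration of these connectivity metrics on a 4-node path (not part of the commit; values from NetworkX):

import networkx as nx

G = nx.path_graph(4)                                  # 0 - 1 - 2 - 3
print(nx.is_connected(G))                             # True
print(nx.diameter(G), nx.radius(G))                   # 3 2
print(nx.average_shortest_path_length(G))             # ~1.67
print(nx.average_clustering(G), nx.transitivity(G))   # 0.0 0.0 (no triangles)
print(nx.degree_assortativity_coefficient(G))         # ~ -0.5 (ends of low degree attach to higher-degree middle nodes)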

# Return a dictionary containing the structural information
graph_info = {
"num_nodes": num_nodes,
"num_edges": num_edges,
"density": density,
"average_degree": average_degree,
"degree_distribution": degree_distribution,
"degree_centrality": degree_centrality,
"betweenness_centrality": betweenness_centrality,
"closeness_centrality": closeness_centrality,
"eigenvector_centrality": eigenvector_centrality,
"num_communities": num_communities,
"community_sizes": community_sizes,
"is_connected": is_connected,
"diameter": diameter,
"average_shortest_path_length": average_shortest_path_length,
"average_clustering_coefficient": average_clustering_coefficient,
"assortativity": assortativity,
"radius": radius,
"transitivity": transitivity
"top_degree_centrality": get_top_n_central_nodes(degree_centrality, top_n),
"top_betweenness_centrality": get_top_n_central_nodes(betweenness_centrality, top_n),
"top_eigenvector_centrality": get_top_n_central_nodes(eigenvector_centrality, top_n)
}

return graph_info
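
An end-to-end usage sketch (illustrative only; analyze_graph_structure reads the module-level top_n defined further down, so it is set explicitly here):

import networkx as nx

top_n = 5
G = nx.karate_club_graph()                       # small built-in social network, 34 nodes / 78 edges
info = analyze_graph_structure(G)
print(info["num_nodes"], info["num_edges"])      # 34 78
print(info["top_degree_centrality"])             # the 5 best-connected members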


def print_graph_info(graph_info):
"""Prints the graph information in a formatted and readable way.
@@ -176,9 +108,12 @@ def print_graph_info(graph_info):

graph_directory = os.fsencode("../.media/graphs/")


top_n = int(input("Enter the number of top nodes to display: "))

with os.scandir("./Project/backend/codebase/.media/graphs/") as it:
for entry in it:
if entry.name.endswith(".gml") and entry.is_file():
if entry.name.endswith("c.gml") and entry.is_file():
print("-----------------------")
print(f"Filename: {entry.name}")
graph = nx.read_gml(entry.path)
43 changes: 25 additions & 18 deletions Project/backend/codebase/graph_creator/gemini.py
@@ -1,6 +1,8 @@
import os
from datetime import datetime

import google.generativeai as genai

from graph_creator.services.json_handler import transform_llm_output_to_dict


@@ -36,27 +38,29 @@ def extract_entities_and_relations(chunk, genai_client):
"""
SYS_PROMPT = (
"Only answer in a JSON format. \n"
"You are a network graph maker who extracts terms and their relations from a given context. "
"You are provided with a context chunk (delimited by ```) Your task is to extract the ontology "
"of terms mentioned in the given context. These terms should represent the key concepts as per the context. \n"
"Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.\n"
"\tTerms may include object, entity, location, organization, person, \n"
"\tcondition, acronym, documents, service, concept, etc.\n"
"\tTerms should be as atomistic as possible\n\n"
"Thought 2: Think about how these terms can have one on one relation with other terms.\n"
"\tTerms that are mentioned in the same sentence or the same paragraph are typically related to each other.\n"
"\tTerms can be related to many other terms\n\n"
"Thought 3: Find out the relation between each such related pair of terms. \n\n"
"Format your output as a list of JSON. Each element of the list contains a pair of terms"
"You are a network graph maker who extracts the most critical terms and their relations from a given context. "
"You are provided with a context chunk (delimited by ```). Your task is to extract the ontology "
"of the few key terms that are indispensable for understanding the given context. These terms should represent the core concepts as per the context. \n"
"Thought 1: Identify the few most critical terms in the entire context.\n"
"\tTerms may include only the most significant objects, entities, locations, organizations, people, conditions, acronyms, documents, services, concepts, etc.\n"
"\tExclude all terms that are not crucial to the core message.\n"
"\tDo not extract a term from every sentence; focus only on the most important terms across the entire context.\n\n"
"Thought 2: Determine how these indispensable terms are directly related to each other.\n"
"\tTerms that are mentioned in the same sentence or paragraph are typically related to each other.\n"
"\tFocus solely on relationships that reveal the most critical interactions or dependencies, ignoring all minor details.\n\n"
"Thought 3: Identify the specific type of relationship between each related pair of terms.\n"
"\tEnsure the relationship is crucial, highly relevant, and necessary for understanding the context.\n\n"
"Format your output as a list of JSON. Each element of the list contains a pair of terms "
"and the relation between them, like the following: \n"
"[\n"
" {\n"
' "node_1": "A concept from extracted ontology",\n'
' "node_2": "A related concept from extracted ontology",\n'
' "node_1": "A core concept from extracted ontology",\n'
' "node_2": "A related core concept from extracted ontology",\n'
' "edge": "relationship between the two concepts, node_1 and node_2"\n'
" }, {...}\n"
"]"
)
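
For reference, a hypothetical well-formed model response to this prompt and how it could be parsed (the entities below are invented for illustration):

import json

raw_output = '[{"node_1": "knowledge graph", "node_2": "entity extraction", "edge": "is built from"}]'
relations = json.loads(raw_output)
print(relations[0]["node_1"], "->", relations[0]["edge"], "->", relations[0]["node_2"])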

USER_PROMPT = f"context: ```{chunk}``` \n\n output: "

chat_session = genai_client.start_chat(history=[])
@@ -74,9 +78,12 @@ def check_for_connecting_relation(
"""
SYS_PROMPT = (
"Only answer in JSON format. \n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 with any entity of list_2.\n"
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk (delimited by ```)."
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and list_2:\n"
"Your task is to help create a knowledge graph by extracting one more relation between any entity of list_1 "
"with any entity of list_2.\n "
"We want to connect the subgraphs of nodes and relations that were extracted from the given text chunk ("
"delimited by ```). "
"For this one more relation needs to be extracted from the given text chunk between any entity of list_1 and "
"list_2:\n "
f"list_1: {entities_component_1}\n"
f"list_2: {entities_component_2}\n"
"Only use the exact entities given in the lists."
@@ -110,7 +117,7 @@ def check_for_connecting_relation_(
The text chunk to be processed
entities_component_1 : list
List of entities
entities_component_1 : list
entities_component_2 : list
List of entities
Returns
16 changes: 9 additions & 7 deletions Project/backend/codebase/graph_creator/graph_creator_main.py
@@ -1,12 +1,15 @@
import logging
import mimetypes
import pandas

from graph_creator import graph_handler
from graph_creator import pdf_handler
from graph_creator.llama3 import process_chunks as groq_process_chunks
from graph_creator.models.graph_job import GraphJob
from graph_creator import pdf_handler
from graph_creator import graph_handler
from graph_creator.services import netx_graphdb

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def process_file_to_graph(g_job: GraphJob):
"""
@@ -58,11 +61,9 @@ def process_file_to_entities_and_relations(file: str):
] # Assuming chunk has 'page_content' attribute

# Generate response using LLM
# response_json = process_chunks(text_chunks, prompt_template)
response_json = groq_process_chunks(text_chunks)
print(response_json)
except Exception as e:
print(e)
logging.error(e)
response_json = None

return response_json, chunks
@@ -83,6 +84,7 @@ def create_and_store_graph(uuid, entities_and_relations, chunks):

# combine knowledge graph pieces
# combined = graph_handler.connect_with_chunk_proximity(df_e_and_r)
# combined['chunk_id'] = '1'
for i in range(len(chunks)):
chunks[i] = chunks[i].dict()
combined = graph_handler.connect_with_llm(df_e_and_r, chunks, 30)
Expand All @@ -91,7 +93,7 @@ def create_and_store_graph(uuid, entities_and_relations, chunks):
graph_db_service = netx_graphdb.NetXGraphDB()

# read entities and relations
graph = graph_db_service.create_graph_from_df(combined)
graph = graph_db_service.create_graph_from_df(combined, chunks)

# save graph as file
graph_db_service.save_graph(uuid, graph)
