Skip to content

Commit

Permalink
Add Parent to communities in data model (#1491)
Browse files Browse the repository at this point in the history
* Add Parent to communities in data model

* Semver

* Pyright

* Update docs

* Use leiden cluster parent id

* Format
  • Loading branch information
AlonsoGuevara authored Dec 10, 2024
1 parent 61816e0 commit 0440580
Show file tree
Hide file tree
Showing 17 changed files with 60 additions and 26 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241209220948095014.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Add Parent id to communities data model"
}
2 changes: 2 additions & 0 deletions docs/index/outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ This is a list of the final communities generated by Leiden. Communities are str
| name | type | description |
| ---------------- | ----- | ----------- |
| community | int | Leiden-generated cluster ID for the community. Note that these increment with depth, so they are unique through all levels of the community hierarchy. For this table, human_readable_id is a copy of the community ID rather than a plain increment. |
| parent | int | Parent community ID. |
| level | int | Depth of the community in the hierarchy. |
| title | str | Friendly name of the community. |
| entity_ids | str[] | List of entities that are members of the community. |
Expand All @@ -30,6 +31,7 @@ This is the list of summarized reports for each community.
| name | type | description |
| ----------------- | ----- | ----------- |
| community | int | Short ID of the community this report applies to. |
| parent | int | Parent community ID. |
| level | int | Level of the community this report applies to. |
| title | str | LM-generated title for the report. |
| summary | str | LM-generated summary of the report. |
Expand Down
12 changes: 5 additions & 7 deletions graphrag/index/flows/create_base_entity_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

"""All the steps to create the base entity graph."""

from typing import Any, cast
from typing import Any
from uuid import uuid4

import networkx as nx
Expand Down Expand Up @@ -159,12 +159,10 @@ def _prep_edges(relationships, summaries) -> pd.DataFrame:


def _prep_communities(communities) -> pd.DataFrame:
base_communities = pd.DataFrame(
communities, columns=cast("Any", ["level", "community", "title"])
)
base_communities = base_communities.explode("title")
base_communities["community"] = base_communities["community"].astype(int)
return base_communities
# Convert the input into a DataFrame and explode the title column
return pd.DataFrame(
communities, columns=pd.Index(["level", "community", "parent", "title"])
).explode("title")


def _compute_degree(graph: nx.Graph) -> pd.DataFrame:
Expand Down
15 changes: 12 additions & 3 deletions graphrag/index/flows/create_final_communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,23 @@ def create_final_communities(
matched = targets.loc[targets["community_x"] == targets["community_y"]]
text_units = matched.explode("text_unit_ids")
grouped = (
text_units.groupby(["community_x", "level_x"])
text_units.groupby(["community_x", "level_x", "parent_x"])
.agg(relationship_ids=("id", list), text_unit_ids=("text_unit_ids", list))
.reset_index()
)
grouped.rename(
columns={"community_x": "community", "level_x": "level"}, inplace=True
columns={
"community_x": "community",
"level_x": "level",
"parent_x": "parent",
},
inplace=True,
)
all_grouped = pd.concat([
all_grouped,
grouped.loc[:, ["community", "level", "relationship_ids", "text_unit_ids"]],
grouped.loc[
:, ["community", "level", "parent", "relationship_ids", "text_unit_ids"]
],
])

# deduplicate the lists
Expand All @@ -63,6 +70,7 @@ def create_final_communities(
communities["id"] = [str(uuid4()) for _ in range(len(communities))]
communities["human_readable_id"] = communities["community"]
communities["title"] = "Community " + communities["community"].astype(str)
communities["parent"] = communities["parent"].astype(int)

# add fields for incremental update tracking
communities["period"] = datetime.now(timezone.utc).date().isoformat()
Expand All @@ -74,6 +82,7 @@ def create_final_communities(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"entity_ids",
Expand Down
9 changes: 5 additions & 4 deletions graphrag/index/flows/create_final_community_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ async def create_final_community_reports(

# Merge with communities to add size and period
merged = community_reports.merge(
communities.loc[:, ["community", "size", "period"]],
communities.loc[:, ["community", "parent", "size", "period"]],
on="community",
how="left",
copy=False,
Expand All @@ -99,6 +99,7 @@ async def create_final_community_reports(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"summary",
Expand All @@ -124,7 +125,7 @@ def _prep_nodes(input: pd.DataFrame) -> pd.DataFrame:
)

# Create NODE_DETAILS column
input[NODE_DETAILS] = input.loc[
input.loc[:, NODE_DETAILS] = input.loc[
:, [NODE_ID, NODE_NAME, NODE_DESCRIPTION, NODE_DEGREE]
].to_dict(orient="records")

Expand All @@ -136,7 +137,7 @@ def _prep_edges(input: pd.DataFrame) -> pd.DataFrame:
input.fillna(value={NODE_DESCRIPTION: "No Description"}, inplace=True)

# Create EDGE_DETAILS column
input[EDGE_DETAILS] = input.loc[
input.loc[:, EDGE_DETAILS] = input.loc[
:, [EDGE_ID, EDGE_SOURCE, EDGE_TARGET, EDGE_DESCRIPTION, EDGE_DEGREE]
].to_dict(orient="records")

Expand All @@ -148,7 +149,7 @@ def _prep_claims(input: pd.DataFrame) -> pd.DataFrame:
input.fillna(value={NODE_DESCRIPTION: "No Description"}, inplace=True)

# Create CLAIM_DETAILS column
input[CLAIM_DETAILS] = input.loc[
input.loc[:, CLAIM_DETAILS] = input.loc[
:, [CLAIM_ID, CLAIM_SUBJECT, CLAIM_TYPE, CLAIM_STATUS, CLAIM_DESCRIPTION]
].to_dict(orient="records")

Expand Down
29 changes: 17 additions & 12 deletions graphrag/index/operations/cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from graphrag.index.graph.utils import stable_largest_connected_component

Communities = list[tuple[int, str, list[str]]]
Communities = list[tuple[int, int, int, list[str]]]


class GraphCommunityStrategyType(str, Enum):
Expand Down Expand Up @@ -41,25 +41,25 @@ def run_layout(strategy: dict[str, Any], graph: nx.Graph) -> Communities:
log.warning("Graph has no nodes")
return []

clusters: dict[int, dict[str, list[str]]] = {}
clusters: dict[int, dict[int, list[str]]] = {}
strategy_type = strategy.get("type", GraphCommunityStrategyType.leiden)
match strategy_type:
case GraphCommunityStrategyType.leiden:
clusters = run_leiden(graph, strategy)
clusters, parent_mapping = run_leiden(graph, strategy)
case _:
msg = f"Unknown clustering strategy {strategy_type}"
raise ValueError(msg)

results: Communities = []
for level in clusters:
for cluster_id, nodes in clusters[level].items():
results.append((level, cluster_id, nodes))
results.append((level, cluster_id, parent_mapping[cluster_id], nodes))
return results


def run_leiden(
graph: nx.Graph, args: dict[str, Any]
) -> dict[int, dict[str, list[str]]]:
) -> tuple[dict[int, dict[int, list[str]]], dict[int, int]]:
"""Run method definition."""
max_cluster_size = args.get("max_cluster_size", 10)
use_lcc = args.get("use_lcc", True)
Expand All @@ -68,7 +68,7 @@ def run_leiden(
"Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc
)

node_id_to_community_map = _compute_leiden_communities(
node_id_to_community_map, community_hierarchy_map = _compute_leiden_communities(
graph=graph,
max_cluster_size=max_cluster_size,
use_lcc=use_lcc,
Expand All @@ -80,16 +80,16 @@ def run_leiden(
if levels is None:
levels = sorted(node_id_to_community_map.keys())

results_by_level: dict[int, dict[str, list[str]]] = {}
results_by_level: dict[int, dict[int, list[str]]] = {}
for level in levels:
result = {}
results_by_level[level] = result
for node_id, raw_community_id in node_id_to_community_map[level].items():
community_id = str(raw_community_id)
community_id = raw_community_id
if community_id not in result:
result[community_id] = []
result[community_id].append(node_id)
return results_by_level
return results_by_level, community_hierarchy_map


# Taken from graph_intelligence & adapted
Expand All @@ -98,8 +98,8 @@ def _compute_leiden_communities(
max_cluster_size: int,
use_lcc: bool,
seed=0xDEADBEEF,
) -> dict[int, dict[str, int]]:
"""Return Leiden root communities."""
) -> tuple[dict[int, dict[str, int]], dict[int, int]]:
"""Return Leiden root communities and their hierarchy mapping."""
# NOTE: This import is done here to reduce the initial import time of the graphrag package
from graspologic.partition import hierarchical_leiden

Expand All @@ -110,8 +110,13 @@ def _compute_leiden_communities(
graph, max_cluster_size=max_cluster_size, random_seed=seed
)
results: dict[int, dict[str, int]] = {}
hierarchy: dict[int, int] = {}
for partition in community_mapping:
results[partition.level] = results.get(partition.level, {})
results[partition.level][partition.node] = partition.cluster

return results
hierarchy[partition.cluster] = (
partition.parent_cluster if partition.parent_cluster is not None else -1
)

return results, hierarchy
15 changes: 15 additions & 0 deletions graphrag/index/update/communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def _merge_and_resolve_nodes(
v: v + old_max_community_id + 1
for k, v in delta_nodes["community"].dropna().astype(int).items()
}
community_id_mapping.update({-1: -1})

delta_nodes["community"] = delta_nodes["community"].where(
delta_nodes["community"].isna(),
Expand Down Expand Up @@ -130,6 +131,12 @@ def _update_and_merge_communities(
.apply(lambda x: community_id_mapping.get(x, x))
)

delta_communities["parent"] = (
delta_communities["parent"]
.astype(int)
.apply(lambda x: community_id_mapping.get(x, x))
)

old_communities["community"] = old_communities["community"].astype(int)

# Merge the final communities
Expand All @@ -150,6 +157,7 @@ def _update_and_merge_communities(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"entity_ids",
Expand Down Expand Up @@ -201,6 +209,12 @@ def _update_and_merge_community_reports(
.apply(lambda x: community_id_mapping.get(x, x))
)

delta_community_reports["parent"] = (
delta_community_reports["parent"]
.astype(int)
.apply(lambda x: community_id_mapping.get(x, x))
)

old_community_reports["community"] = old_community_reports["community"].astype(int)

# Merge the final community reports
Expand All @@ -223,6 +237,7 @@ def _update_and_merge_community_reports(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"summary",
Expand Down
Binary file modified tests/verbs/data/base_communities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/base_entity_nodes.parquet
Binary file not shown.
Binary file modified tests/verbs/data/base_relationship_edges.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_communities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_community_reports.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_covariates.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_entities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_nodes.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_relationships.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_text_units.parquet
Binary file not shown.

0 comments on commit 0440580

Please sign in to comment.