Skip to content

Commit

Permalink
Add Parent to communities in data model (#1491)
Browse files Browse the repository at this point in the history
* Add Parent to communities in data model

* Semver

* Pyright

* Update docs

* Use leiden cluster parent id

* Format
  • Loading branch information
AlonsoGuevara authored Dec 10, 2024
1 parent 61816e0 commit 0440580
Show file tree
Hide file tree
Showing 17 changed files with 60 additions and 26 deletions.
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20241209220948095014.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Add Parent id to communities data model"
}
2 changes: 2 additions & 0 deletions docs/index/outputs.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ This is a list of the final communities generated by Leiden. Communities are str
| name | type | description |
| ---------------- | ----- | ----------- |
| community | int | Leiden-generated cluster ID for the community. Note that these increment with depth, so they are unique through all levels of the community hierarchy. For this table, human_readable_id is a copy of the community ID rather than a plain increment. |
| parent | int | Parent community ID. |
| level | int | Depth of the community in the hierarchy. |
| title | str | Friendly name of the community. |
| entity_ids | str[] | List of entities that are members of the community. |
Expand All @@ -30,6 +31,7 @@ This is the list of summarized reports for each community.
| name | type | description |
| ----------------- | ----- | ----------- |
| community | int | Short ID of the community this report applies to. |
| parent | int | Parent community ID. |
| level | int | Level of the community this report applies to. |
| title | str | LM-generated title for the report. |
| summary | str | LM-generated summary of the report. |
Expand Down
12 changes: 5 additions & 7 deletions graphrag/index/flows/create_base_entity_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

"""All the steps to create the base entity graph."""

from typing import Any, cast
from typing import Any
from uuid import uuid4

import networkx as nx
Expand Down Expand Up @@ -159,12 +159,10 @@ def _prep_edges(relationships, summaries) -> pd.DataFrame:


def _prep_communities(communities) -> pd.DataFrame:
base_communities = pd.DataFrame(
communities, columns=cast("Any", ["level", "community", "title"])
)
base_communities = base_communities.explode("title")
base_communities["community"] = base_communities["community"].astype(int)
return base_communities
# Convert the input into a DataFrame and explode the title column
return pd.DataFrame(
communities, columns=pd.Index(["level", "community", "parent", "title"])
).explode("title")


def _compute_degree(graph: nx.Graph) -> pd.DataFrame:
Expand Down
15 changes: 12 additions & 3 deletions graphrag/index/flows/create_final_communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,23 @@ def create_final_communities(
matched = targets.loc[targets["community_x"] == targets["community_y"]]
text_units = matched.explode("text_unit_ids")
grouped = (
text_units.groupby(["community_x", "level_x"])
text_units.groupby(["community_x", "level_x", "parent_x"])
.agg(relationship_ids=("id", list), text_unit_ids=("text_unit_ids", list))
.reset_index()
)
grouped.rename(
columns={"community_x": "community", "level_x": "level"}, inplace=True
columns={
"community_x": "community",
"level_x": "level",
"parent_x": "parent",
},
inplace=True,
)
all_grouped = pd.concat([
all_grouped,
grouped.loc[:, ["community", "level", "relationship_ids", "text_unit_ids"]],
grouped.loc[
:, ["community", "level", "parent", "relationship_ids", "text_unit_ids"]
],
])

# deduplicate the lists
Expand All @@ -63,6 +70,7 @@ def create_final_communities(
communities["id"] = [str(uuid4()) for _ in range(len(communities))]
communities["human_readable_id"] = communities["community"]
communities["title"] = "Community " + communities["community"].astype(str)
communities["parent"] = communities["parent"].astype(int)

# add fields for incremental update tracking
communities["period"] = datetime.now(timezone.utc).date().isoformat()
Expand All @@ -74,6 +82,7 @@ def create_final_communities(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"entity_ids",
Expand Down
9 changes: 5 additions & 4 deletions graphrag/index/flows/create_final_community_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ async def create_final_community_reports(

# Merge with communities to add size and period
merged = community_reports.merge(
communities.loc[:, ["community", "size", "period"]],
communities.loc[:, ["community", "parent", "size", "period"]],
on="community",
how="left",
copy=False,
Expand All @@ -99,6 +99,7 @@ async def create_final_community_reports(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"summary",
Expand All @@ -124,7 +125,7 @@ def _prep_nodes(input: pd.DataFrame) -> pd.DataFrame:
)

# Create NODE_DETAILS column
input[NODE_DETAILS] = input.loc[
input.loc[:, NODE_DETAILS] = input.loc[
:, [NODE_ID, NODE_NAME, NODE_DESCRIPTION, NODE_DEGREE]
].to_dict(orient="records")

Expand All @@ -136,7 +137,7 @@ def _prep_edges(input: pd.DataFrame) -> pd.DataFrame:
input.fillna(value={NODE_DESCRIPTION: "No Description"}, inplace=True)

# Create EDGE_DETAILS column
input[EDGE_DETAILS] = input.loc[
input.loc[:, EDGE_DETAILS] = input.loc[
:, [EDGE_ID, EDGE_SOURCE, EDGE_TARGET, EDGE_DESCRIPTION, EDGE_DEGREE]
].to_dict(orient="records")

Expand All @@ -148,7 +149,7 @@ def _prep_claims(input: pd.DataFrame) -> pd.DataFrame:
input.fillna(value={NODE_DESCRIPTION: "No Description"}, inplace=True)

# Create CLAIM_DETAILS column
input[CLAIM_DETAILS] = input.loc[
input.loc[:, CLAIM_DETAILS] = input.loc[
:, [CLAIM_ID, CLAIM_SUBJECT, CLAIM_TYPE, CLAIM_STATUS, CLAIM_DESCRIPTION]
].to_dict(orient="records")

Expand Down
29 changes: 17 additions & 12 deletions graphrag/index/operations/cluster_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from graphrag.index.graph.utils import stable_largest_connected_component

Communities = list[tuple[int, str, list[str]]]
Communities = list[tuple[int, int, int, list[str]]]


class GraphCommunityStrategyType(str, Enum):
Expand Down Expand Up @@ -41,25 +41,25 @@ def run_layout(strategy: dict[str, Any], graph: nx.Graph) -> Communities:
log.warning("Graph has no nodes")
return []

clusters: dict[int, dict[str, list[str]]] = {}
clusters: dict[int, dict[int, list[str]]] = {}
strategy_type = strategy.get("type", GraphCommunityStrategyType.leiden)
match strategy_type:
case GraphCommunityStrategyType.leiden:
clusters = run_leiden(graph, strategy)
clusters, parent_mapping = run_leiden(graph, strategy)
case _:
msg = f"Unknown clustering strategy {strategy_type}"
raise ValueError(msg)

results: Communities = []
for level in clusters:
for cluster_id, nodes in clusters[level].items():
results.append((level, cluster_id, nodes))
results.append((level, cluster_id, parent_mapping[cluster_id], nodes))
return results


def run_leiden(
graph: nx.Graph, args: dict[str, Any]
) -> dict[int, dict[str, list[str]]]:
) -> tuple[dict[int, dict[int, list[str]]], dict[int, int]]:
"""Run method definition."""
max_cluster_size = args.get("max_cluster_size", 10)
use_lcc = args.get("use_lcc", True)
Expand All @@ -68,7 +68,7 @@ def run_leiden(
"Running leiden with max_cluster_size=%s, lcc=%s", max_cluster_size, use_lcc
)

node_id_to_community_map = _compute_leiden_communities(
node_id_to_community_map, community_hierarchy_map = _compute_leiden_communities(
graph=graph,
max_cluster_size=max_cluster_size,
use_lcc=use_lcc,
Expand All @@ -80,16 +80,16 @@ def run_leiden(
if levels is None:
levels = sorted(node_id_to_community_map.keys())

results_by_level: dict[int, dict[str, list[str]]] = {}
results_by_level: dict[int, dict[int, list[str]]] = {}
for level in levels:
result = {}
results_by_level[level] = result
for node_id, raw_community_id in node_id_to_community_map[level].items():
community_id = str(raw_community_id)
community_id = raw_community_id
if community_id not in result:
result[community_id] = []
result[community_id].append(node_id)
return results_by_level
return results_by_level, community_hierarchy_map


# Taken from graph_intelligence & adapted
Expand All @@ -98,8 +98,8 @@ def _compute_leiden_communities(
max_cluster_size: int,
use_lcc: bool,
seed=0xDEADBEEF,
) -> dict[int, dict[str, int]]:
"""Return Leiden root communities."""
) -> tuple[dict[int, dict[str, int]], dict[int, int]]:
"""Return Leiden root communities and their hierarchy mapping."""
# NOTE: This import is done here to reduce the initial import time of the graphrag package
from graspologic.partition import hierarchical_leiden

Expand All @@ -110,8 +110,13 @@ def _compute_leiden_communities(
graph, max_cluster_size=max_cluster_size, random_seed=seed
)
results: dict[int, dict[str, int]] = {}
hierarchy: dict[int, int] = {}
for partition in community_mapping:
results[partition.level] = results.get(partition.level, {})
results[partition.level][partition.node] = partition.cluster

return results
hierarchy[partition.cluster] = (
partition.parent_cluster if partition.parent_cluster is not None else -1
)

return results, hierarchy
15 changes: 15 additions & 0 deletions graphrag/index/update/communities.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def _merge_and_resolve_nodes(
v: v + old_max_community_id + 1
for k, v in delta_nodes["community"].dropna().astype(int).items()
}
community_id_mapping.update({-1: -1})

delta_nodes["community"] = delta_nodes["community"].where(
delta_nodes["community"].isna(),
Expand Down Expand Up @@ -130,6 +131,12 @@ def _update_and_merge_communities(
.apply(lambda x: community_id_mapping.get(x, x))
)

delta_communities["parent"] = (
delta_communities["parent"]
.astype(int)
.apply(lambda x: community_id_mapping.get(x, x))
)

old_communities["community"] = old_communities["community"].astype(int)

# Merge the final communities
Expand All @@ -150,6 +157,7 @@ def _update_and_merge_communities(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"entity_ids",
Expand Down Expand Up @@ -201,6 +209,12 @@ def _update_and_merge_community_reports(
.apply(lambda x: community_id_mapping.get(x, x))
)

delta_community_reports["parent"] = (
delta_community_reports["parent"]
.astype(int)
.apply(lambda x: community_id_mapping.get(x, x))
)

old_community_reports["community"] = old_community_reports["community"].astype(int)

# Merge the final community reports
Expand All @@ -223,6 +237,7 @@ def _update_and_merge_community_reports(
"id",
"human_readable_id",
"community",
"parent",
"level",
"title",
"summary",
Expand Down
Binary file modified tests/verbs/data/base_communities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/base_entity_nodes.parquet
Binary file not shown.
Binary file modified tests/verbs/data/base_relationship_edges.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_communities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_community_reports.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_covariates.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_entities.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_nodes.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_relationships.parquet
Binary file not shown.
Binary file modified tests/verbs/data/create_final_text_units.parquet
Binary file not shown.

0 comments on commit 0440580

Please sign in to comment.