From 3afa105be7990f1f42694afe1a13de62bfbd42eb Mon Sep 17 00:00:00 2001
From: Xinyu Yao <77922129+yxy235@users.noreply.github.com>
Date: Thu, 25 Apr 2024 13:05:54 +0800
Subject: [PATCH] [GraphBolt][Doc] Update docs related to `seeds`. (#7351)

Co-authored-by: Ubuntu
---
 .../source/guide/minibatch-custom-sampler.rst |  4 +-
 docs/source/guide/minibatch-edge.rst          | 42 +++++++++----------
 docs/source/guide/minibatch-inference.rst     |  2 +-
 docs/source/guide/minibatch-link.rst          | 22 +++++-----
 .../ondisk-dataset-specification.rst          |  5 +--
 notebooks/graphbolt/walkthrough.ipynb         |  8 ++--
 6 files changed, 42 insertions(+), 41 deletions(-)

diff --git a/docs/source/guide/minibatch-custom-sampler.rst b/docs/source/guide/minibatch-custom-sampler.rst
index 5ca5464ebfa9..80473122fbaa 100644
--- a/docs/source/guide/minibatch-custom-sampler.rst
+++ b/docs/source/guide/minibatch-custom-sampler.rst
@@ -79,11 +79,11 @@ can be used on heterogeneous graphs:
         {
             "user": gb.ItemSet(
                 (torch.arange(0, 5), torch.arange(5, 10)),
-                names=("seed_nodes", "labels"),
+                names=("seeds", "labels"),
             ),
             "item": gb.ItemSet(
                 (torch.arange(5, 10), torch.arange(10, 15)),
-                names=("seed_nodes", "labels"),
+                names=("seeds", "labels"),
             ),
         }
     )
diff --git a/docs/source/guide/minibatch-edge.rst b/docs/source/guide/minibatch-edge.rst
index ee7cd85c676b..ae1ad9f49b90 100644
--- a/docs/source/guide/minibatch-edge.rst
+++ b/docs/source/guide/minibatch-edge.rst
@@ -30,9 +30,9 @@ edges(namely, node pairs) in the training set instead of the nodes.
 
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     g = gb.SamplingGraph()
-    node_paris = torch.arange(0, 1000).reshape(-1, 2)
+    seeds = torch.arange(0, 1000).reshape(-1, 2)
     labels = torch.randint(0, 2, (5,))
-    train_set = gb.ItemSet((node_pairs, labels), names=("node_pairs", "labels"))
+    train_set = gb.ItemSet((seeds, labels), names=("seeds", "labels"))
     datapipe = gb.ItemSampler(train_set, batch_size=128, shuffle=True)
     datapipe = datapipe.sample_neighbor(g, [10, 10]) # 2 layers.
     # Or equivalently:
@@ -83,9 +83,9 @@ You can use :func:`~dgl.graphbolt.exclude_seed_edges` alongside with
 
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     g = gb.SamplingGraph()
-    node_paris = torch.arange(0, 1000).reshape(-1, 2)
+    seeds = torch.arange(0, 1000).reshape(-1, 2)
     labels = torch.randint(0, 2, (5,))
-    train_set = gb.ItemSet((node_pairs, labels), names=("node_pairs", "labels"))
+    train_set = gb.ItemSet((seeds, labels), names=("seeds", "labels"))
     datapipe = gb.ItemSampler(train_set, batch_size=128, shuffle=True)
     datapipe = datapipe.sample_neighbor(g, [10, 10]) # 2 layers.
     exclude_seed_edges = partial(gb.exclude_seed_edges, include_reverse_edges=True)
@@ -138,9 +138,9 @@ concatenating the incident node features and projecting it with a dense layer.
             super().__init__()
             self.W = nn.Linear(2 * in_features, num_classes)
 
-        def forward(self, node_pairs, x):
-            src_x = x[node_pairs[0]]
-            dst_x = x[node_pairs[1]]
+        def forward(self, seeds, x):
+            src_x = x[seeds[:, 0]]
+            dst_x = x[seeds[:, 1]]
             data = torch.cat([src_x, dst_x], 1)
             return self.W(data)
 
@@ -157,9 +157,9 @@ loader, as well as the input node features as follows:
                 in_features, hidden_features, out_features)
             self.predictor = ScorePredictor(num_classes, out_features)
 
-        def forward(self, blocks, x, node_pairs):
+        def forward(self, blocks, x, seeds):
             x = self.gcn(blocks, x)
-            return self.predictor(node_pairs, x)
+            return self.predictor(seeds, x)
 
 DGL ensures that that the nodes in the edge subgraph are the same as the
 output nodes of the last MFG in the generated list of MFGs.
@@ -182,7 +182,7 @@ their incident node representations.
     for data in dataloader:
         blocks = data.blocks
         x = data.edge_features("feat")
-        y_hat = model(data.blocks, x, data.positive_node_pairs)
+        y_hat = model(data.blocks, x, data.compacted_seeds)
         loss = F.cross_entropy(data.labels, y_hat)
         opt.zero_grad()
         loss.backward()
@@ -226,10 +226,10 @@ over the edge types.
             super().__init__()
             self.W = nn.Linear(2 * in_features, num_classes)
 
-        def forward(self, node_pairs, x):
+        def forward(self, seeds, x):
             scores = {}
-            for etype in node_pairs.keys():
-                src, dst = node_pairs[etype]
+            for etype in seeds.keys():
+                src, dst = seeds[etype].T
                 data = torch.cat([x[etype][src], x[etype][dst]], 1)
                 scores[etype] = self.W(data)
             return scores
@@ -242,9 +242,9 @@ over the edge types.
                 in_features, hidden_features, out_features, etypes)
             self.pred = ScorePredictor(num_classes, out_features)
 
-        def forward(self, node_pairs, blocks, x):
+        def forward(self, seeds, blocks, x):
             x = self.rgcn(blocks, x)
-            return self.pred(node_pairs, x)
+            return self.pred(seeds, x)
 
 Data loader definition is almost identical to that of homogeneous graph. The
 only difference is that the train_set is now an instance of
@@ -256,17 +256,17 @@ only difference is that the train_set is now an instance of
 
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     g = gb.SamplingGraph()
-    node_pairs = torch.arange(0, 1000).reshape(-1, 2)
+    seeds = torch.arange(0, 1000).reshape(-1, 2)
     labels = torch.randint(0, 3, (1000,))
-    node_pairs_labels = {
+    seeds_labels = {
         "user:like:item": gb.ItemSet(
-            (node_pairs, labels), names=("node_pairs", "labels")
+            (seeds, labels), names=("seeds", "labels")
         ),
         "user:follow:user": gb.ItemSet(
-            (node_pairs, labels), names=("node_pairs", "labels")
+            (seeds, labels), names=("seeds", "labels")
         ),
     }
-    train_set = gb.ItemSetDict(node_pairs_labels)
+    train_set = gb.ItemSetDict(seeds_labels)
     datapipe = gb.ItemSampler(train_set, batch_size=128, shuffle=True)
     datapipe = datapipe.sample_neighbor(g, [10, 10]) # 2 layers.
     datapipe = datapipe.fetch_feature(
@@ -316,7 +316,7 @@ dictionaries of node types and predictions here.
     for data in dataloader:
         blocks = data.blocks
         x = data.edge_features(("user:like:item", "feat"))
-        y_hat = model(data.blocks, x, data.positive_node_pairs)
+        y_hat = model(data.blocks, x, data.compacted_seeds)
         loss = F.cross_entropy(data.labels, y_hat)
         opt.zero_grad()
         loss.backward()
diff --git a/docs/source/guide/minibatch-inference.rst b/docs/source/guide/minibatch-inference.rst
index 2f2303e60cf7..54446e26828b 100644
--- a/docs/source/guide/minibatch-inference.rst
+++ b/docs/source/guide/minibatch-inference.rst
@@ -106,7 +106,7 @@ and combined as well.
                     hidden_x = self.dropout(hidden_x)
                 # By design, our output nodes are contiguous.
                 y[
-                    data.seed_nodes[0] : data.seed_nodes[-1] + 1
+                    data.seeds[0] : data.seeds[-1] + 1
                 ] = hidden_x.to(device)
             feature = y
 
diff --git a/docs/source/guide/minibatch-link.rst b/docs/source/guide/minibatch-link.rst
index a3dbc341d742..ad1fc1d3d9f1 100644
--- a/docs/source/guide/minibatch-link.rst
+++ b/docs/source/guide/minibatch-link.rst
@@ -53,8 +53,8 @@ proportional to a power of degrees.
             self.weights = node_degrees ** 0.75
             self.k = k
 
-        def _sample_with_etype(node_pairs, etype=None):
-            src, _ = node_pairs
+        def _sample_with_etype(self, seeds, etype=None):
+            src, _ = seeds.T
             src = src.repeat_interleave(self.k)
             dst = self.weights.multinomial(len(src), replacement=True)
             return src, dst
@@ -95,7 +95,7 @@ Define a GraphSAGE model for minibatch training
 
 When a negative sampler is provided, the data loader will generate positive and
 negative node pairs for each minibatch besides the *Message Flow Graphs* (MFGs).
-Use `node_pairs_with_labels` to get compact node pairs with corresponding
+Use `compacted_seeds` and `labels` to get compact node pairs and corresponding
 labels.
 
 
@@ -116,7 +116,8 @@ above.
         start_epoch_time = time.time()
         for step, data in enumerate(dataloader):
            # Unpack MiniBatch.
-            compacted_pairs, labels = data.node_pairs_with_labels
+            compacted_seeds = data.compacted_seeds.T
+            labels = data.labels
            node_feature = data.node_features["feat"]
            # Convert sampled subgraphs to DGL blocks.
            blocks = data.blocks
@@ -124,7 +125,7 @@ above.
            # Get the embeddings of the input nodes.
            y = model(blocks, node_feature)
            logits = model.predictor(
-                y[compacted_pairs[0]] * y[compacted_pairs[1]]
+                y[compacted_seeds[0]] * y[compacted_seeds[1]]
            ).squeeze()
 
            # Compute loss.
@@ -217,8 +218,8 @@ If you want to give your own negative sampling function, just inherit from the
             }
             self.k = k
 
-        def _sample_with_etype(node_pairs, etype):
-            src, _ = node_pairs
+        def _sample_with_etype(self, seeds, etype):
+            src, _ = seeds.T
             src = src.repeat_interleave(self.k)
             dst = self.weights[etype].multinomial(len(src), replacement=True)
             return src, dst
@@ -241,7 +242,8 @@ loss on specific edge type.
         start_epoch_time = time.time()
         for step, data in enumerate(dataloader):
            # Unpack MiniBatch.
-            compacted_pairs, labels = data.node_pairs_with_labels
+            compacted_seeds = data.compacted_seeds
+            labels = data.labels
            node_features = {
                ntype: data.node_features[(ntype, "feat")]
                for ntype in data.blocks[0].srctypes
@@ -251,8 +253,8 @@ loss on specific edge type.
            # Get the embeddings of the input nodes.
            y = model(blocks, node_feature)
            logits = model.predictor(
-                y[category][compacted_pairs[category][0]]
-                * y[category][compacted_pairs[category][1]]
+                y[category][compacted_seeds[category][:, 0]]
+                * y[category][compacted_seeds[category][:, 1]]
            ).squeeze()
 
            # Compute loss.
diff --git a/docs/source/stochastic_training/ondisk-dataset-specification.rst b/docs/source/stochastic_training/ondisk-dataset-specification.rst
index 0587b26a8806..96f72227da88 100644
--- a/docs/source/stochastic_training/ondisk-dataset-specification.rst
+++ b/docs/source/stochastic_training/ondisk-dataset-specification.rst
@@ -201,9 +201,8 @@ such as ``num_classes`` and all these fields will be passed to the
 
       The ``name`` field is used to specify the name of the data. It is mandatory
       and used to specify the data fields of ``MiniBatch`` for sampling. It can
-      be either ``seed_nodes``, ``labels``, ``node_pairs``, ``negative_srcs`` or
-      ``negative_dsts``. If any other name is used, it will be added into the
-      ``MiniBatch`` data fields.
+      be either ``seeds``, ``labels`` or ``indexes``. If any other name is used,
+      it will be added into the ``MiniBatch`` data fields.
 
     - ``format``: ``string``
       The ``format`` field is used to specify the format of the data. It can be
diff --git a/notebooks/graphbolt/walkthrough.ipynb b/notebooks/graphbolt/walkthrough.ipynb
index 137a2ba4d3f4..2500b7871fb1 100644
--- a/notebooks/graphbolt/walkthrough.ipynb
+++ b/notebooks/graphbolt/walkthrough.ipynb
@@ -61,12 +61,12 @@
    },
    "outputs": [],
    "source": [
-    "node_pairs = torch.tensor(\n",
+    "seeds = torch.tensor(\n",
     "    [[7, 0], [6, 0], [1, 3], [3, 3], [2, 4], [8, 4], [1, 4], [2, 4], [1, 5],\n",
     "     [9, 6], [0, 6], [8, 6], [7, 7], [7, 7], [4, 7], [6, 8], [5, 8], [9, 9],\n",
     "     [4, 9], [4, 9], [5, 9], [9, 9], [5, 9], [9, 9], [7, 9]]\n",
     ")\n",
-    "item_set = gb.ItemSet(node_pairs, names=\"node_pairs\")\n",
+    "item_set = gb.ItemSet(seeds, names=\"seeds\")\n",
     "print(list(item_set))"
    ]
   },
@@ -262,7 +262,7 @@
     "num_nodes = 10\n",
     "nodes = torch.arange(num_nodes)\n",
     "labels = torch.tensor([1, 2, 0, 2, 2, 0, 2, 2, 2, 2])\n",
-    "item_set = gb.ItemSet((nodes, labels), names=(\"seed_nodes\", \"labels\"))\n",
+    "item_set = gb.ItemSet((nodes, labels), names=(\"seeds\", \"labels\"))\n",
     "\n",
     "indptr = torch.tensor([0, 2, 2, 2, 4, 8, 9, 12, 15, 17, 25])\n",
     "indices = torch.tensor(\n",
@@ -311,4 +311,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 0
-}
\ No newline at end of file
+}
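
Below is a minimal usage sketch of the renamed fields (illustration only, not part of the patch). It assumes a GraphBolt version in which `gb.ItemSampler` yields `MiniBatch` objects exposing `.seeds` and `.labels`, as the updated docs above describe; the tensors and batch size here are made up.

    import torch
    import dgl.graphbolt as gb

    # Node pairs and labels registered under the new "seeds"/"labels" names.
    seeds = torch.arange(0, 1000).reshape(-1, 2)
    labels = torch.randint(0, 2, (500,))
    train_set = gb.ItemSet((seeds, labels), names=("seeds", "labels"))

    # Each yielded minibatch carries the fields under the names given above.
    datapipe = gb.ItemSampler(train_set, batch_size=128, shuffle=True)
    for minibatch in datapipe:
        print(minibatch.seeds.shape, minibatch.labels.shape)
        break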