From 397a8a31c63055708cb55c66b31cd4b3031ff600 Mon Sep 17 00:00:00 2001
From: Anqi <anqi.wang@vesoft.com>
Date: Fri, 16 Jun 2023 09:41:11 +0800
Subject: [PATCH] encode root for path algo (#71)

---
 .../nebula/algorithm/config/AlgoConfig.scala  | 12 ++--
 .../nebula/algorithm/config/Configs.scala     |  1 +
 .../vesoft/nebula/algorithm/lib/BfsAlgo.scala |  5 +-
 .../vesoft/nebula/algorithm/lib/DfsAlgo.scala | 63 ++++++++++++++-----
 .../nebula/algorithm/lib/BfsAlgoSuite.scala   |  2 +-
 .../nebula/algorithm/lib/DfsAlgoSuite.scala   | 20 ++++--
 6 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/AlgoConfig.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/AlgoConfig.scala
index 9811185..cba8814 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/AlgoConfig.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/AlgoConfig.scala
@@ -200,16 +200,16 @@ object CoefficientConfig {
 /**
   * bfs
   */
-case class BfsConfig(maxIter: Int, root: Long, encodeId: Boolean = false)
+case class BfsConfig(maxIter: Int, root: String, encodeId: Boolean = false)
 object BfsConfig {
   var maxIter: Int      = _
-  var root: Long        = _
+  var root: String      = _
   var encodeId: Boolean = false
 
   def getBfsConfig(configs: Configs): BfsConfig = {
     val bfsConfig = configs.algorithmConfig.map
     maxIter = bfsConfig("algorithm.bfs.maxIter").toInt
-    root = bfsConfig("algorithm.bfs.root").toLong
+    root = bfsConfig("algorithm.bfs.root").toString
     encodeId = ConfigUtil.getOrElseBoolean(bfsConfig, "algorithm.bfs.encodeId", false)
     BfsConfig(maxIter, root, encodeId)
   }
@@ -218,16 +218,16 @@ object BfsConfig {
 /**
   * dfs
   */
-case class DfsConfig(maxIter: Int, root: Long, encodeId: Boolean = false)
+case class DfsConfig(maxIter: Int, root: String, encodeId: Boolean = false)
 object DfsConfig {
   var maxIter: Int      = _
-  var root: Long        = _
+  var root: String      = _
   var encodeId: Boolean = false
 
   def getDfsConfig(configs: Configs): DfsConfig = {
     val dfsConfig = configs.algorithmConfig.map
     maxIter = dfsConfig("algorithm.dfs.maxIter").toInt
-    root = dfsConfig("algorithm.dfs.root").toLong
+    root = dfsConfig("algorithm.dfs.root").toString
     encodeId = ConfigUtil.getOrElseBoolean(dfsConfig, "algorithm.dfs.encodeId", false)
     DfsConfig(maxIter, root, encodeId)
   }
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala
index abb94af..93f7311 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/config/Configs.scala
@@ -386,6 +386,7 @@ object AlgoConstants {
   val HANP_RESULT_COL: String               = "hanp"
   val NODE2VEC_RESULT_COL: String           = "node2vec"
   val BFS_RESULT_COL: String                = "bfs"
+  val DFS_RESULT_COL: String                = "dfs"
   val ENCODE_ID_COL: String                 = "encodedId"
   val ORIGIN_ID_COL: String                 = "id"
 }
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/BfsAlgo.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/BfsAlgo.scala
index 8765f93..96a3fe7 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/BfsAlgo.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/BfsAlgo.scala
@@ -26,15 +26,18 @@ object BfsAlgo {
     */
   def apply(spark: SparkSession, dataset: Dataset[Row], bfsConfig: BfsConfig): DataFrame = {
     var encodeIdDf: DataFrame = null
+    var finalRoot: Long       = 0
 
     val graph: Graph[None.type, Double] = if (bfsConfig.encodeId) {
       val (data, encodeId) = DecodeUtil.convertStringId2LongId(dataset, false)
       encodeIdDf = encodeId
+      finalRoot = encodeIdDf.filter(row => row.get(0).toString == bfsConfig.root).first().getLong(1)
       NebulaUtil.loadInitGraph(data, false)
     } else {
+      finalRoot = bfsConfig.root.toLong
       NebulaUtil.loadInitGraph(dataset, false)
     }
-    val bfsGraph = execute(graph, bfsConfig.maxIter, bfsConfig.root)
+    val bfsGraph = execute(graph, bfsConfig.maxIter, finalRoot)
 
     // filter out the not traversal vertices
     val visitedVertices = bfsGraph.vertices.filter(v => v._2 != Double.PositiveInfinity)
diff --git a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/DfsAlgo.scala b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/DfsAlgo.scala
index 1789ae6..1ee5d4f 100644
--- a/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/DfsAlgo.scala
+++ b/nebula-algorithm/src/main/scala/com/vesoft/nebula/algorithm/lib/DfsAlgo.scala
@@ -5,9 +5,16 @@
 
 package com.vesoft.nebula.algorithm.lib
 
+import com.vesoft.nebula.algorithm.config.AlgoConstants.{
+  ALGO_ID_COL,
+  DFS_RESULT_COL,
+  ENCODE_ID_COL,
+  ORIGIN_ID_COL
+}
 import com.vesoft.nebula.algorithm.config.{AlgoConstants, BfsConfig, DfsConfig}
 import com.vesoft.nebula.algorithm.utils.{DecodeUtil, NebulaUtil}
-import org.apache.spark.graphx.{EdgeDirection, Graph, VertexId}
+import org.apache.spark.graphx.{EdgeDirection, EdgeTriplet, Graph, Pregel, VertexId}
+import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
 import org.apache.spark.sql.types.{DoubleType, LongType, StringType, StructField, StructType}
 
@@ -18,21 +25,28 @@ object DfsAlgo {
 
   def apply(spark: SparkSession, dataset: Dataset[Row], dfsConfig: DfsConfig): DataFrame = {
     var encodeIdDf: DataFrame = null
+    var finalRoot: Long       = 0
 
     val graph: Graph[None.type, Double] = if (dfsConfig.encodeId) {
       val (data, encodeId) = DecodeUtil.convertStringId2LongId(dataset, false)
       encodeIdDf = encodeId
+      finalRoot = encodeIdDf.filter(row => row.get(0).toString == dfsConfig.root).first().getLong(1)
       NebulaUtil.loadInitGraph(data, false)
     } else {
+      finalRoot = dfsConfig.root.toLong
       NebulaUtil.loadInitGraph(dataset, false)
     }
-    val bfsVertices = dfs(graph, dfsConfig.root, mutable.Seq.empty[VertexId])(dfsConfig.maxIter)
+    val bfsVertices =
+      dfs(graph, finalRoot, mutable.Seq.empty[VertexId])(dfsConfig.maxIter).vertices.filter(v =>
+        v._2 != Double.PositiveInfinity)
 
-    val schema = StructType(List(StructField("dfs", LongType, nullable = false)))
+    val schema = StructType(
+      List(StructField(ALGO_ID_COL, LongType, nullable = false),
+           StructField(DFS_RESULT_COL, DoubleType, nullable = true)))
 
-    val rdd = spark.sparkContext.parallelize(bfsVertices.toSeq, 1).map(row => Row(row))
-    val algoResult = spark.sqlContext
-      .createDataFrame(rdd, schema)
+    val resultRDD = bfsVertices.map(v => Row(v._1, v._2))
+    val algoResult =
+      spark.sqlContext.createDataFrame(resultRDD, schema).orderBy(col(DFS_RESULT_COL))
 
     if (dfsConfig.encodeId) {
       DecodeUtil.convertAlgoId2StringId(algoResult, encodeIdDf).coalesce(1)
@@ -42,18 +56,35 @@ object DfsAlgo {
   }
 
   def dfs(g: Graph[None.type, Double], vertexId: VertexId, visited: mutable.Seq[VertexId])(
-      maxIter: Int): mutable.Seq[VertexId] = {
-    if (visited.contains(vertexId)) {
-      visited
-    } else {
-      if (iterNums > maxIter) {
-        return visited
+      maxIter: Int): Graph[Double, Double] = {
+
+    val initialGraph =
+      g.mapVertices((id, _) => if (id == vertexId) 0.0 else Double.PositiveInfinity)
+
+    def vertexProgram(id: VertexId, attr: Double, msg: Double): Double = {
+      math.min(attr, msg)
+    }
+
+    def sendMessage(edge: EdgeTriplet[Double, Double]): Iterator[(VertexId, Double)] = {
+      val sourceVertex = edge.srcAttr
+      val targetVertex = edge.dstAttr
+      if (sourceVertex + 1 < targetVertex && sourceVertex < maxIter) {
+        Iterator((edge.dstId, sourceVertex + 1))
+      } else {
+        Iterator.empty
       }
-      val newVisited = visited :+ vertexId
-      val neighbors  = g.collectNeighbors(EdgeDirection.Out).lookup(vertexId).flatten
-      iterNums = iterNums + 1
-      neighbors.foldLeft(newVisited)((visited, neighbor) => dfs(g, neighbor._1, visited)(maxIter))
     }
+
+    def mergeMessage(a: Double, b: Double): Double = {
+      math.min(a, b)
+    }
+
+    // start the Pregel iteration
+    val resultGraph =
+      Pregel(initialGraph, Double.PositiveInfinity)(vertexProgram, sendMessage, mergeMessage)
+
+    resultGraph
+
   }
 
 }
diff --git a/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/BfsAlgoSuite.scala b/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/BfsAlgoSuite.scala
index 73c75df..a4c4f12 100644
--- a/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/BfsAlgoSuite.scala
+++ b/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/BfsAlgoSuite.scala
@@ -14,7 +14,7 @@ class BfsAlgoSuite {
   def bfsAlgoSuite(): Unit = {
     val spark         = SparkSession.builder().master("local").getOrCreate()
     val data          = spark.read.option("header", true).csv("src/test/resources/edge.csv")
-    val bfsAlgoConfig = new BfsConfig(5, 1)
+    val bfsAlgoConfig = new BfsConfig(5, "1")
     val result        = BfsAlgo.apply(spark, data, bfsAlgoConfig)
     result.show()
     assert(result.count() == 4)
diff --git a/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/DfsAlgoSuite.scala b/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/DfsAlgoSuite.scala
index e710d58..6d0f3c5 100644
--- a/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/DfsAlgoSuite.scala
+++ b/nebula-algorithm/src/test/scala/com/vesoft/nebula/algorithm/lib/DfsAlgoSuite.scala
@@ -13,11 +13,21 @@ import org.junit.Test
 class DfsAlgoSuite {
   @Test
   def bfsAlgoSuite(): Unit = {
-    val spark         = SparkSession.builder().master("local").getOrCreate()
+    val spark = SparkSession
+      .builder()
+      .master("local")
+      .config("spark.sql.shuffle.partitions", 5)
+      .getOrCreate()
     val data          = spark.read.option("header", true).csv("src/test/resources/edge.csv")
-    val dfsAlgoConfig = new DfsConfig(5, 3)
-    val result        = DfsAlgo.apply(spark, data, dfsAlgoConfig)
-    result.show()
-    assert(result.count() == 4)
+    val dfsAlgoConfig = new DfsConfig(5, "3")
+//    val result        = DfsAlgo.apply(spark, data, dfsAlgoConfig)
+//    result.show()
+//    assert(result.count() == 4)
+
+    val encodeDfsConfig = new DfsConfig(5, "3", true)
+    val encodeResult    = DfsAlgo.apply(spark, data, encodeDfsConfig)
+
+    encodeResult.show()
+    assert(encodeResult.count() == 4)
   }
 }