| 1 | +/* Copyright (c) 2021 vesoft inc. All rights reserved. |
| 2 | + * |
| 3 | + * This source code is licensed under Apache 2.0 License. |
| 4 | + */ |
| 5 | + |
| 6 | +package com.vesoft.nebula.algorithm.lib |
| 7 | + |
| 8 | +import com.vesoft.nebula.algorithm.config.JaccardConfig |
| 9 | +import org.apache.log4j.Logger |
| 10 | +import org.apache.spark.ml.feature.{ |
| 11 | + CountVectorizer, |
| 12 | + CountVectorizerModel, |
| 13 | + MinHashLSH, |
| 14 | + MinHashLSHModel |
| 15 | +} |
| 16 | +import org.apache.spark.ml.linalg.SparseVector |
| 17 | +import org.apache.spark.rdd.RDD |
| 18 | +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} |
| 19 | +import org.apache.spark.sql.functions.col |
| 20 | +import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} |
| 21 | + |
| 22 | +object JaccardAlgo { |
| 23 | + private val LOGGER = Logger.getLogger(this.getClass) |
| 24 | + |
| 25 | + val ALGORITHM: String = "Jaccard" |
| 26 | + |
| 27 | + /** |
| 28 | +    * run the Jaccard similarity algorithm for NebulaGraph data |
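| | +    * |
| | +    * @param spark         active SparkSession used to build the result DataFrame |
| | +    * @param dataset       edge data; column 0 is read as the source id and column 1 as the destination id |
| | +    * @param jaccardConfig carries tol, the maximum Jaccard distance accepted by the LSH similarity join |
| | +    * @return DataFrame with columns (srcId, dstId, similarity) |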
| 29 | + */ |
| 30 | + def apply(spark: SparkSession, dataset: Dataset[Row], jaccardConfig: JaccardConfig): DataFrame = { |
| 31 | + |
| 32 | + val jaccardResult: RDD[Row] = execute(spark, dataset, jaccardConfig.tol) |
| 33 | + |
| 34 | + val schema = StructType( |
| 35 | + List( |
| 36 | + StructField("srcId", StringType, nullable = true), |
| 37 | + StructField("dstId", StringType, nullable = true), |
| 38 | + StructField("similarity", DoubleType, nullable = true) |
| 39 | + )) |
| 40 | + val algoResult = spark.sqlContext.createDataFrame(jaccardResult, schema) |
| 41 | + algoResult |
| 42 | + } |
| 43 | + |
| 44 | + def execute(spark: SparkSession, dataset: Dataset[Row], tol: Double): RDD[Row] = { |
| 45 | +    // compute each node's 1-degree neighbor set |
| 46 | + import spark.implicits._ |
| 47 | + val edges = dataset |
| 48 | + .map(row => { |
| 49 | + (row.get(0).toString, row.get(1).toString) |
| 50 | + }) |
| 51 | + .rdd |
| 52 | + |
| 53 | +    // get each node's in-neighbors |
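| | +    // swapping (src, dst) to (dst, src) lets combineByKey collect, per node, the sources of its incoming edges |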
| 54 | + val inputNodeVector: RDD[(String, List[String])] = edges |
| 55 | + .map(_.swap) |
| 56 | + .combineByKey((v: String) => List(v), |
| 57 | + (c: List[String], v: String) => v :: c, |
| 58 | + (c1: List[String], c2: List[String]) => c1 ::: c2) |
| 59 | + .repartition(100) |
| 60 | + |
| 61 | +    // get each node's out-neighbors |
| 62 | + val outputNodeVector: RDD[(String, List[String])] = edges |
| 63 | + .combineByKey( |
| 64 | + (v: String) => List(v), |
| 65 | + (c: List[String], v: String) => v :: c, |
| 66 | + (c1: List[String], c2: List[String]) => c1 ::: c2 |
| 67 | + ) |
| 68 | + .repartition(100) |
| 69 | + |
| 70 | + // combine the neighbors |
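| | +    // fullOuterJoin keeps nodes that have only in-neighbors or only out-neighbors; the branches below handle the missing side |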
| 71 | + val nodeVector: RDD[(String, List[String])] = inputNodeVector |
| 72 | + .fullOuterJoin(outputNodeVector) |
| 73 | + .map(row => { |
| 74 | + val inNeighbors: Option[List[String]] = row._2._1 |
| 75 | + val outNeighbors: Option[List[String]] = row._2._2 |
| 76 | + val neighbors = if (inNeighbors.isEmpty && outNeighbors.isEmpty) { |
| 77 | + (row._1, List()) |
| 78 | + } else if (inNeighbors.isEmpty && outNeighbors.isDefined) { |
| 79 | + (row._1, outNeighbors.get) |
| 80 | + } else if (inNeighbors.isDefined && outNeighbors.isEmpty) { |
| 81 | + (row._1, inNeighbors.get) |
| 82 | + } else { |
| 83 | + (row._1, (inNeighbors.get ::: outNeighbors.get).distinct) |
| 84 | + } |
| 85 | + neighbors |
| 86 | + }) |
| 87 | + |
| 88 | +    // Preprocess the neighbor lists into binary (0/1) bag-of-words feature vectors |
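| | +    // setBinary(true) makes CountVectorizer emit presence indicators instead of counts, which is what Jaccard needs |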
| 89 | + val inputNodeVectorDF = spark.createDataFrame(nodeVector).toDF("node", "neighbors") |
| 90 | + val cvModel: CountVectorizerModel = |
| 91 | + new CountVectorizer() |
| 92 | + .setInputCol("neighbors") |
| 93 | + .setOutputCol("features") |
| 94 | + .setBinary(true) |
| 95 | + .fit(inputNodeVectorDF) |
| 96 | + |
| 97 | + val inputNodeVectorDFSparse: DataFrame = |
| 98 | + cvModel.transform(inputNodeVectorDF).select("node", "features") |
| 99 | + |
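| | +    // MinHashLSH requires at least one non-zero entry per vector, so nodes with an empty neighbor vector are dropped |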
| 100 | + val nodeVectorDFSparseFilter = spark |
| 101 | + .createDataFrame( |
| 102 | + inputNodeVectorDFSparse.rdd |
| 103 | + .map(row => (row.getAs[String]("node"), row.getAs[SparseVector]("features"))) |
| 104 | + .map(x => (x._1, x._2, x._2.numNonzeros)) |
| 105 | + .filter(x => x._3 >= 1) |
| 106 | + .map(x => (x._1, x._2))) |
| 107 | + .toDF("node", "features") |
| 108 | + |
| 109 | +    // use Spark ML's MinHashLSH to approximate the Jaccard distance between neighbor sets |
| 110 | + val mh = new MinHashLSH().setNumHashTables(100).setInputCol("features").setOutputCol("hashes") |
| 111 | + val model: MinHashLSHModel = mh.fit(nodeVectorDFSparseFilter) |
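| | +    // self-join the node vectors; pairs whose approximate Jaccard distance exceeds tol are filtered out by the join |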
| 112 | + val nodeDistance: DataFrame = model |
| 113 | + .approxSimilarityJoin(nodeVectorDFSparseFilter, |
| 114 | + nodeVectorDFSparseFilter, |
| 115 | + tol, |
| 116 | + "JaccardDistance") |
| 117 | + .select(col("datasetA.node").alias("node1"), |
| 118 | + col("datasetB.node").alias("node2"), |
| 119 | + col("JaccardDistance")) |
| 120 | + |
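| | +    // convert distance to similarity (1 - distance), order each pair canonically and drop self-pairs, then dedup the symmetric join result |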
| 121 | + val nodeOverlapRatio = nodeDistance.rdd |
| 122 | + .map(x => { |
| 123 | + val node1 = x.getString(0) |
| 124 | + val node2 = x.getString(1) |
| 125 | + val overlapRatio = 1 - x.getDouble(2) |
| 126 | + if (node1 < node2) ((node1, node2), overlapRatio) else ((node2, node1), overlapRatio) |
| 127 | + }) |
| 128 | + .filter(x => x._1._1 != x._1._2) |
| 129 | + .map(row => { |
| 130 | + Row(row._1._1, row._1._2, row._2) |
| 131 | + }) |
| 132 | + |
| 133 | + nodeOverlapRatio.distinct() |
| 134 | + } |
| 135 | +} |
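
A minimal driver sketch for the new algorithm is below. Only `JaccardAlgo.apply(spark, dataset, jaccardConfig)` and the output columns come from this patch; the input column names and the `JaccardConfig(tol)` constructor shape are assumptions.

```scala
import com.vesoft.nebula.algorithm.config.JaccardConfig
import com.vesoft.nebula.algorithm.lib.JaccardAlgo
import org.apache.spark.sql.SparkSession

object JaccardExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("jaccard-example").master("local[*]").getOrCreate()
    import spark.implicits._

    // edge list: JaccardAlgo only reads columns 0 and 1 (source id, destination id)
    val edges = Seq(("1", "2"), ("1", "3"), ("2", "3"), ("4", "2")).toDF("src", "dst")

    // tol is the maximum Jaccard *distance* kept by the LSH join (assumed case-class constructor)
    val config = JaccardConfig(tol = 1.0)

    val result = JaccardAlgo(spark, edges, config) // columns: srcId, dstId, similarity
    result.show()
    spark.stop()
  }
}
```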