From a02559d5b6a09380f642be55eef67a23e0aad735 Mon Sep 17 00:00:00 2001 From: Pandurang Patil <5101898+pandurangpatil@users.noreply.github.com> Date: Fri, 1 Mar 2024 22:19:36 +0530 Subject: [PATCH] [x2cpg] Performance improvements in Overlay Passes (#4227) * Changes to parallelize linkToSingle method processing Changes to parallelize linkToSingle method processing * Converted StaticCallLinker to parallel pass * Some testing changes * Improvments in StaticCallLinker and LinkToMultiple methods 1. Made changes in `StaticCallLinker to divide the list of chunks further into smaller batches so that the in memory data generated in one batch becomes smaller and in turn when it gets absorbed the given memory will be garbage collected to free up the memory. 2. Introduced parallism inside `linkToMultiple` method handling/ * Few debug logs along with failing unit tests 1. Added few debug logs while linking `CALL` nodes to `METHOD` nodes if the number of METHOD nodes found is greater than 5. 2. There were wrong `FieldAccess` nodes being created, which is resulting into large memory being consumed in `StaticCallLinker` pass. Added respective failing unit tests in ignored state. These unit tests needs to be fixed. * Updated unit tests with working one and the failing one * Review comment fixes * Review comment fixes * Fix for fieldAccess operator call node In case of chained field access like `a.b.c` for `.c`, it was creating a `CALL` node with methodFullName set to `.fieldAccess` and name set to "c". Hence there were many `METHOD` sutbs being created for each of such `CALL` nodes with fullName set to `.fieldAccess`. Which is resulting into many edges being created mapping each fieldAccess `CALL` nodes to each of these `METHOD` stubs inside `StaticCallLinker` Pass. This change fixes the issue. * Changed TypeUsagePass and StaticCallLinker 1. Converted TypeUsagePass into two passes with ForkJoinParallelPass 2. Converted StaticCallLinker pass to extend from ForkJoinParallelPass * removed unwanted code * changes as per review comment * Review comment fixes * review comment fixes * review comment fixes --- .../scala/io/joern/x2cpg/layers/Base.scala | 3 +- .../x2cpg/passes/base/FileCreationPass.scala | 8 ++- .../x2cpg/passes/base/TypeEvalPass.scala | 43 +++++++++++ .../joern/x2cpg/passes/base/TypeRefPass.scala | 30 ++++++++ .../x2cpg/passes/base/TypeUsagePass.scala | 48 ------------- .../passes/callgraph/MethodRefLinker.scala | 8 ++- .../passes/callgraph/StaticCallLinker.scala | 72 +++++++------------ .../x2cpg/utils/ConcurrentTaskUtil.scala | 2 +- .../io/joern/x2cpg/utils/LinkingUtil.scala | 39 +++++----- 9 files changed, 131 insertions(+), 122 deletions(-) create mode 100644 joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeEvalPass.scala create mode 100644 joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeRefPass.scala delete mode 100644 joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeUsagePass.scala diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/layers/Base.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/layers/Base.scala index 6a7deb418413..57e017fb0c77 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/layers/Base.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/layers/Base.scala @@ -20,7 +20,8 @@ object Base { new MethodDecoratorPass(cpg), new AstLinkerPass(cpg), new ContainsEdgePass(cpg), - new TypeUsagePass(cpg) + new TypeRefPass(cpg), + new TypeEvalPass(cpg) ) } diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/FileCreationPass.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/FileCreationPass.scala index bc340b45fd45..32e815c20bab 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/FileCreationPass.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/FileCreationPass.scala @@ -1,12 +1,12 @@ package io.joern.x2cpg.passes.base +import io.joern.x2cpg.utils.LinkingUtil import io.shiftleft.codepropertygraph.Cpg import io.shiftleft.codepropertygraph.generated.nodes.{NewFile, StoredNode} import io.shiftleft.codepropertygraph.generated.{EdgeTypes, NodeTypes, PropertyNames} import io.shiftleft.passes.CpgPass -import io.shiftleft.semanticcpg.language._ +import io.shiftleft.semanticcpg.language.* import io.shiftleft.semanticcpg.language.types.structure.FileTraversal -import io.joern.x2cpg.utils.LinkingUtil import scala.collection.mutable @@ -14,6 +14,7 @@ import scala.collection.mutable * SOURCE_FILE edges. */ class FileCreationPass(cpg: Cpg) extends CpgPass(cpg) with LinkingUtil { + private val srcLabels = List(NodeTypes.NAMESPACE_BLOCK, NodeTypes.TYPE_DECL, NodeTypes.METHOD, NodeTypes.COMMENT) override def run(dstGraph: DiffGraphBuilder): Unit = { val originalFileNameToNode = mutable.Map.empty[String, StoredNode] @@ -41,7 +42,8 @@ class FileCreationPass(cpg: Cpg) extends CpgPass(cpg) with LinkingUtil { // Create SOURCE_FILE edges from nodes of various types to FILE linkToSingle( cpg, - srcLabels = List(NodeTypes.NAMESPACE_BLOCK, NodeTypes.TYPE_DECL, NodeTypes.METHOD, NodeTypes.COMMENT), + srcNodes = cpg.graph.nodes(srcLabels: _*).toList, + srcLabels = srcLabels, dstNodeLabel = NodeTypes.FILE, edgeType = EdgeTypes.SOURCE_FILE, dstNodeMap = { x => diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeEvalPass.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeEvalPass.scala new file mode 100644 index 000000000000..5f63163ec71b --- /dev/null +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeEvalPass.scala @@ -0,0 +1,43 @@ +package io.joern.x2cpg.passes.base + +import io.joern.x2cpg.utils.LinkingUtil +import io.shiftleft.codepropertygraph.Cpg +import io.shiftleft.codepropertygraph.generated.{EdgeTypes, NodeTypes, PropertyNames} +import io.shiftleft.passes.ForkJoinParallelCpgPass +import overflowdb.Node +import overflowdb.traversal.* + +class TypeEvalPass(cpg: Cpg) extends ForkJoinParallelCpgPass[List[Node]](cpg) with LinkingUtil { + val srcLabels = List( + NodeTypes.METHOD_PARAMETER_IN, + NodeTypes.METHOD_PARAMETER_OUT, + NodeTypes.METHOD_RETURN, + NodeTypes.MEMBER, + NodeTypes.LITERAL, + NodeTypes.CALL, + NodeTypes.LOCAL, + NodeTypes.IDENTIFIER, + NodeTypes.BLOCK, + NodeTypes.METHOD_REF, + NodeTypes.TYPE_REF, + NodeTypes.UNKNOWN + ) + + def generateParts(): Array[List[Node]] = { + val nodes = cpg.graph.nodes(srcLabels: _*).toList + nodes.grouped(getBatchSize(nodes.size)).toArray + } + def runOnPart(builder: DiffGraphBuilder, part: List[overflowdb.Node]): Unit = { + linkToSingle( + cpg = cpg, + srcNodes = part, + srcLabels = srcLabels, + dstNodeLabel = NodeTypes.TYPE, + edgeType = EdgeTypes.EVAL_TYPE, + dstNodeMap = typeFullNameToNode(cpg, _), + dstFullNameKey = PropertyNames.TYPE_FULL_NAME, + dstGraph = builder, + None + ) + } +} diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeRefPass.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeRefPass.scala new file mode 100644 index 000000000000..3d371aa1b429 --- /dev/null +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeRefPass.scala @@ -0,0 +1,30 @@ +package io.joern.x2cpg.passes.base + +import io.joern.x2cpg.utils.LinkingUtil +import io.shiftleft.codepropertygraph.Cpg +import io.shiftleft.codepropertygraph.generated.{EdgeTypes, NodeTypes, PropertyNames} +import io.shiftleft.passes.ForkJoinParallelCpgPass +import overflowdb.Node +import overflowdb.traversal.* + +class TypeRefPass(cpg: Cpg) extends ForkJoinParallelCpgPass[List[Node]](cpg) with LinkingUtil { + val srcLabels = List(NodeTypes.TYPE) + + def generateParts(): Array[List[Node]] = { + val nodes = cpg.graph.nodes(srcLabels: _*).toList + nodes.grouped(getBatchSize(nodes.size)).toArray + } + def runOnPart(builder: DiffGraphBuilder, part: List[overflowdb.Node]): Unit = { + linkToSingle( + cpg = cpg, + srcNodes = part, + srcLabels = srcLabels, + dstNodeLabel = NodeTypes.TYPE_DECL, + edgeType = EdgeTypes.REF, + dstNodeMap = typeDeclFullNameToNode(cpg, _), + dstFullNameKey = PropertyNames.TYPE_DECL_FULL_NAME, + dstGraph = builder, + None + ) + } +} diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeUsagePass.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeUsagePass.scala deleted file mode 100644 index 9ace35c71092..000000000000 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/base/TypeUsagePass.scala +++ /dev/null @@ -1,48 +0,0 @@ -package io.joern.x2cpg.passes.base - -import io.joern.x2cpg.utils.LinkingUtil -import io.shiftleft.codepropertygraph.Cpg -import io.shiftleft.codepropertygraph.generated.{EdgeTypes, NodeTypes, PropertyNames} -import io.shiftleft.passes.CpgPass - -class TypeUsagePass(cpg: Cpg) extends CpgPass(cpg) with LinkingUtil { - - override def run(dstGraph: DiffGraphBuilder): Unit = { - // Create REF edges from TYPE nodes to TYPE_DECL - linkToSingle( - cpg, - srcLabels = List(NodeTypes.TYPE), - dstNodeLabel = NodeTypes.TYPE_DECL, - edgeType = EdgeTypes.REF, - dstNodeMap = typeDeclFullNameToNode(cpg, _), - dstFullNameKey = PropertyNames.TYPE_DECL_FULL_NAME, - dstGraph, - None - ) - - // Create EVAL_TYPE edges from nodes of various types to TYPE - linkToSingle( - cpg, - srcLabels = List( - NodeTypes.METHOD_PARAMETER_IN, - NodeTypes.METHOD_PARAMETER_OUT, - NodeTypes.METHOD_RETURN, - NodeTypes.MEMBER, - NodeTypes.LITERAL, - NodeTypes.CALL, - NodeTypes.LOCAL, - NodeTypes.IDENTIFIER, - NodeTypes.BLOCK, - NodeTypes.METHOD_REF, - NodeTypes.TYPE_REF, - NodeTypes.UNKNOWN - ), - dstNodeLabel = NodeTypes.TYPE, - edgeType = EdgeTypes.EVAL_TYPE, - dstNodeMap = typeFullNameToNode(cpg, _), - dstFullNameKey = "TYPE_FULL_NAME", - dstGraph, - None - ) - } -} diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/MethodRefLinker.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/MethodRefLinker.scala index 6d8432d7269e..686fe6bc85fa 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/MethodRefLinker.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/MethodRefLinker.scala @@ -1,19 +1,23 @@ package io.joern.x2cpg.passes.callgraph +import io.joern.x2cpg.utils.LinkingUtil import io.shiftleft.codepropertygraph.Cpg -import io.shiftleft.codepropertygraph.generated._ +import io.shiftleft.codepropertygraph.generated.* import io.shiftleft.passes.CpgPass -import io.joern.x2cpg.utils.LinkingUtil +import overflowdb.traversal.* /** This pass has MethodStubCreator and TypeDeclStubCreator as prerequisite for language frontends which do not provide * method stubs and type decl stubs. */ class MethodRefLinker(cpg: Cpg) extends CpgPass(cpg) with LinkingUtil { + private val srcLabels = List(NodeTypes.METHOD_REF) + override def run(dstGraph: DiffGraphBuilder): Unit = { // Create REF edges from METHOD_REFs to METHOD linkToSingle( cpg, + srcNodes = cpg.graph.nodes(srcLabels: _*).toList, srcLabels = List(NodeTypes.METHOD_REF), dstNodeLabel = NodeTypes.METHOD, edgeType = EdgeTypes.REF, diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/StaticCallLinker.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/StaticCallLinker.scala index 0d06049764c1..7f5ab2387383 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/StaticCallLinker.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/callgraph/StaticCallLinker.scala @@ -1,64 +1,40 @@ package io.joern.x2cpg.passes.callgraph +import io.joern.x2cpg.utils.LinkingUtil import io.shiftleft.codepropertygraph.Cpg -import io.shiftleft.codepropertygraph.generated.nodes._ +import io.shiftleft.codepropertygraph.generated.nodes.* import io.shiftleft.codepropertygraph.generated.{DispatchTypes, EdgeTypes} -import io.shiftleft.passes.CpgPass -import io.shiftleft.semanticcpg.language._ +import io.shiftleft.passes.ForkJoinParallelCpgPass +import io.shiftleft.semanticcpg.language.* import org.slf4j.{Logger, LoggerFactory} -import scala.collection.mutable +class StaticCallLinker(cpg: Cpg) extends ForkJoinParallelCpgPass[Seq[Call]](cpg) with LinkingUtil { -class StaticCallLinker(cpg: Cpg) extends CpgPass(cpg) { - - import StaticCallLinker._ - private val methodFullNameToNode = mutable.Map.empty[String, List[Method]] - - override def run(dstGraph: DiffGraphBuilder): Unit = { + private val logger: Logger = LoggerFactory.getLogger(classOf[StaticCallLinker]) - cpg.method.foreach { method => - methodFullNameToNode.updateWith(method.fullName) { - case Some(l) => Some(method :: l) - case None => Some(List(method)) - } - } + override def generateParts(): Array[Seq[Call]] = { + val calls = cpg.call.l + calls.grouped(getBatchSize(calls.size)).toArray + } - cpg.call.foreach { call => + override def runOnPart(builder: DiffGraphBuilder, calls: Seq[Call]): Unit = { + calls.foreach { call => try { - linkCall(call, dstGraph) + call.dispatchType match { + case DispatchTypes.STATIC_DISPATCH | DispatchTypes.INLINED => + val resolvedMethods = cpg.method.fullNameExact(call.methodFullName).l + resolvedMethods.foreach(dst => builder.addEdge(call, dst, EdgeTypes.CALL)) + val size = resolvedMethods.size + // Add the debug logs with number of METHOD nodes found for given method full name + if size > 1 then logger.debug(s"Total $size METHOD nodes found for -> ${call.methodFullName}") + case DispatchTypes.DYNAMIC_DISPATCH => + // Do nothing + case _ => logger.warn(s"Unknown dispatch type on dynamic CALL ${call.code}") + } } catch { case exception: Exception => - throw new RuntimeException(exception) + logger.error(s"Exception in StaticCallLinker: ", exception) } } } - - private def linkCall(call: Call, dstGraph: DiffGraphBuilder): Unit = { - call.dispatchType match { - case DispatchTypes.STATIC_DISPATCH | DispatchTypes.INLINED => - linkStaticCall(call, dstGraph) - case DispatchTypes.DYNAMIC_DISPATCH => - // Do nothing - case _ => logger.warn(s"Unknown dispatch type on dynamic CALL ${call.code}") - } - } - - private def linkStaticCall(call: Call, dstGraph: DiffGraphBuilder): Unit = { - val resolvedMethodOption = methodFullNameToNode.get(call.methodFullName) - if (resolvedMethodOption.isDefined) { - resolvedMethodOption.get.foreach { dst => - dstGraph.addEdge(call, dst, EdgeTypes.CALL) - } - } else { - logger.info( - s"Unable to link static CALL with METHOD_FULL_NAME ${call.methodFullName}, NAME ${call.name}, " + - s"SIGNATURE ${call.signature}, CODE ${call.code}" - ) - } - } - -} - -object StaticCallLinker { - private val logger: Logger = LoggerFactory.getLogger(classOf[StaticCallLinker]) } diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/ConcurrentTaskUtil.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/ConcurrentTaskUtil.scala index 97303cd67d96..3116eed4b0e4 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/ConcurrentTaskUtil.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/ConcurrentTaskUtil.scala @@ -11,7 +11,7 @@ import scala.util.Try */ object ConcurrentTaskUtil { - private val MAX_POOL_SIZE = Runtime.getRuntime.availableProcessors() + val MAX_POOL_SIZE = Runtime.getRuntime.availableProcessors() /** Uses a thread pool with a limited number of active threads executing a task at any given point. This is effective * when tasks may require large amounts of memory, or single tasks are too short lived. diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/LinkingUtil.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/LinkingUtil.scala index 004efa4d8dde..dd3d44985d72 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/LinkingUtil.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/utils/LinkingUtil.scala @@ -1,28 +1,21 @@ package io.joern.x2cpg.utils import io.joern.x2cpg.passes.frontend.Dereference -import io.shiftleft.codepropertygraph.generated.PropertyNames -import io.shiftleft.codepropertygraph.generated.nodes.StoredNode -import io.shiftleft.codepropertygraph.generated.nodes.TypeDecl -import io.shiftleft.codepropertygraph.generated.Properties -import io.shiftleft.codepropertygraph.generated.nodes.Method import io.shiftleft.codepropertygraph.Cpg -import io.shiftleft.codepropertygraph.generated.nodes.NamespaceBlock -import io.shiftleft.codepropertygraph.generated.nodes.Type -import org.slf4j.Logger -import org.slf4j.LoggerFactory -import overflowdb.NodeDb -import overflowdb.PropertyKey -import overflowdb.NodeRef -import overflowdb.traversal._ -import overflowdb.traversal.ChainedImplicitsTemp._ +import io.shiftleft.codepropertygraph.generated.nodes.* +import io.shiftleft.codepropertygraph.generated.{Properties, PropertyNames} +import org.slf4j.{Logger, LoggerFactory} +import overflowdb.traversal.* +import overflowdb.traversal.ChainedImplicitsTemp.* +import overflowdb.{Node, NodeDb, NodeRef, PropertyKey} import scala.collection.mutable -import scala.jdk.CollectionConverters._ +import scala.jdk.CollectionConverters.* trait LinkingUtil { import overflowdb.BatchedUpdate.DiffGraphBuilder + private val MAX_BATCH_SIZE = 100 val logger: Logger = LoggerFactory.getLogger(classOf[LinkingUtil]) @@ -41,11 +34,20 @@ trait LinkingUtil { def nodesWithFullName(cpg: Cpg, x: String): mutable.Seq[NodeRef[_ <: NodeDb]] = cpg.graph.indexManager.lookup(PropertyNames.FULL_NAME, x).asScala + protected def getBatchSize(totalItems: Int): Int = { + val batchSize = + if totalItems > MAX_BATCH_SIZE then totalItems / ConcurrentTaskUtil.MAX_POOL_SIZE + else MAX_BATCH_SIZE + Math.min(batchSize, MAX_BATCH_SIZE) + } + /** For all nodes `n` with a label in `srcLabels`, determine the value of `n.\$dstFullNameKey`, use that to lookup the * destination node in `dstNodeMap`, and create an edge of type `edgeType` between `n` and the destination node. */ - def linkToSingle( + + protected def linkToSingle( cpg: Cpg, + srcNodes: List[Node], srcLabels: List[String], dstNodeLabel: String, edgeType: String, @@ -54,9 +56,9 @@ trait LinkingUtil { dstGraph: DiffGraphBuilder, dstNotExistsHandler: Option[(StoredNode, String) => Unit] ): Unit = { - var loggedDeprecationWarning = false val dereference = Dereference(cpg) - cpg.graph.nodes(srcLabels: _*).foreach { srcNode => + var loggedDeprecationWarning = false + srcNodes.foreach { srcNode => // If the source node does not have any outgoing edges of this type // This check is just required for backward compatibility if (srcNode.outE(edgeType).isEmpty) { @@ -171,5 +173,4 @@ trait LinkingUtil { s"dstNodeType=$dstNodeType, dstNodeId=$dstNodeId" ) } - }