apache · AveryQi115 · Mar 15, 2025 · Mar 15, 2025 · Mar 15, 2025 · Mar 15, 2025
diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json
@@ -4064,6 +4064,12 @@
     ],
     "sqlState" : "07501"
   },
+  "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED" : {
+    "message" : [
+      "Detected outer scope references <expression> in the subquery. This is not supported in the current version."
+    ],
+    "sqlState" : "0A000"
+  },
   "NONEXISTENT_FIELD_NAME_IN_LIST" : {
     "message" : [
       "Field(s) <nonExistFields> do(es) not exist. Available fields: <fieldNames>"

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -228,6 +228,68 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString
     }
   }
 
+  /**
+   * Fails analysis with MISSING_ATTRIBUTES.RESOLVED_ATTRIBUTE_MISSING_FROM_INPUT when an
+   * operator of `plan` hosts a subquery expression with non-empty outer scope attributes.
+   * The error is raised on the innermost operator still referencing those attributes.
+   */
+  def checkNoNestedOuterReferencesInMainQuery(plan: LogicalPlan): Unit = {
+    // Pairs each subquery expression of `plan` that has outer scope attributes with the
+    // attribute references found among them.
+    def getOuterScopeAttrsFromSubqueryExpression(
+        plan: LogicalPlan): Seq[(SubqueryExpression, AttributeSet)] = {
+      plan.expressions.flatMap(_.collect {
+        case subExpr: SubqueryExpression if subExpr.getOuterScopeAttrs.nonEmpty =>
+          val attrs = subExpr.getOuterScopeAttrs.collect { case a: AttributeReference => a }
+          (subExpr, AttributeSet(attrs))
+      })
+    }
+
+    // Follows nested subquery plans down to the innermost operator hosting a subquery
+    // expression that still references `outerScopeAttrs`, taking the first match per plan.
+    // Returns `operator` with the attributes narrowed so far when `plan` has no such node.
+    def findFirstOccurrence(
+        plan: LogicalPlan,
+        outerScopeAttrs: AttributeSet,
+        operator: LogicalPlan): (LogicalPlan, AttributeSet) = {
+      // `collectFirst` stops at the first hit, replacing the non-local `return` that a
+      // `foreach` closure would need (implemented by throwing, deprecated in Scala 3).
+      val firstHit = plan.collectFirst(Function.unlift { (p: LogicalPlan) =>
+        getOuterScopeAttrsFromSubqueryExpression(p)
+          .find(_._2.intersect(outerScopeAttrs).nonEmpty)
+          .map { case (subExpr, attrsInP) => (p, subExpr, attrsInP) }
+      })
+      firstHit match {
+        case Some((p, subExpr, attrsInP)) =>
+          findFirstOccurrence(subExpr.plan, attrsInP.intersect(outerScopeAttrs), p)
+        case None =>
+          (operator, outerScopeAttrs)
+      }
+    }
+
+    def throwUnresolvedColumnErrorForOuterScopeAttrs(
+        plan: LogicalPlan,
+        subExpr: SubqueryExpression,
+        outerScopeAttrs: AttributeSet): Unit = {
+      val (operator, missingInput) = findFirstOccurrence(subExpr.plan, outerScopeAttrs, plan)
+      operator.failAnalysis(
+        errorClass = "MISSING_ATTRIBUTES.RESOLVED_ATTRIBUTE_MISSING_FROM_INPUT",
+        messageParameters = Map(
+          "missingAttributes" -> missingInput.toSeq.map(attr => toSQLExpr(attr)).mkString(", "),
+          "input" -> operator.inputSet.map(attr => toSQLExpr(attr)).mkString(", "),
+          "operator" -> operator.simpleString(SQLConf.get.maxToStringFields)
+        )
+      )
+    }
+
+    // The first operator visited that hosts an offending subquery expression fails analysis.
+    plan.foreach { p =>
+      getOuterScopeAttrsFromSubqueryExpression(p).headOption.foreach {
+        case (subExpr, attrs) => throwUnresolvedColumnErrorForOuterScopeAttrs(p, subExpr, attrs)
+      }
+    }
+  }
+
   def checkAnalysis(plan: LogicalPlan): Unit = {
     // We should inline all CTE relations to restore the original plan shape, as the analysis check
     // may need to match certain plan shapes. For dangling CTE relations, they will still be kept
@@ -241,6 +303,7 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString
     }
     preemptedError.clear()
     try {
+      checkNoNestedOuterReferencesInMainQuery(inlinedPlan)
       checkAnalysis0(inlinedPlan)
       preemptedError.getErrorOpt().foreach(throw _) // throw preempted error if any
     } catch {

diff --git a/...talyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala b/...talyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
@@ -199,21 +199,29 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase {
 
   // Resolves `UnresolvedAttribute` to `OuterReference`.
   protected def resolveOuterRef(e: Expression): Expression = {
-    val outerPlan = AnalysisContext.get.outerPlan
-    if (outerPlan.isEmpty) return e
+    val outerPlanContext = AnalysisContext.get.outerPlans
+    if (outerPlanContext.isEmpty) return e
 
     def resolve(nameParts: Seq[String]): Option[Expression] = try {
-      outerPlan.get match {
-        // Subqueries in UnresolvedHaving can host grouping expressions and aggregate functions.
-        // We should resolve columns with `agg.output` and the rule `ResolveAggregateFunctions` will
-        // push them down to Aggregate later. This is similar to what we do in `resolveColumns`.
-        case u @ UnresolvedHaving(_, agg: Aggregate) =>
-          agg.resolveChildren(nameParts, conf.resolver)
-            .orElse(u.resolveChildren(nameParts, conf.resolver))
-            .map(wrapOuterReference)
-        case other =>
-          other.resolveChildren(nameParts, conf.resolver).map(wrapOuterReference)
+      val outerPlans = outerPlanContext.get
+      val resolvedExpressions = outerPlans.flatMap {
+        _ match {
+          // Subqueries in UnresolvedHaving can host grouping
+          // expressions and aggregate functions. We should resolve
+          // columns with `agg.output` and the rule `ResolveAggregateFunctions` will
+          // push them down to Aggregate later. This is similar to what we do in `resolveColumns`.
+          case u @ UnresolvedHaving(_, agg: Aggregate) =>
+            agg.resolveChildren(nameParts, conf.resolver)
+              .orElse(u.resolveChildren(nameParts, conf.resolver))
+              .map(wrapOuterReference)
+          case other =>
+            other.resolveChildren(nameParts, conf.resolver).map(wrapOuterReference)
+        }
       }
+      // We use the first resolved expression here
+      // as the outerPlans are ordered by their depth and the
+      // first one is the closest to the subquery scope.
+      resolvedExpressions.headOption
     } catch {
       case ae: AnalysisException =>
         logDebug(ae.getMessage)

diff --git a/...st/src/main/scala/org/apache/spark/sql/catalyst/analysis/ValidateSubqueryExpression.scala b/...st/src/main/scala/org/apache/spark/sql/catalyst/analysis/ValidateSubqueryExpression.scala
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.catalyst.analysis
 
 import org.apache.spark.internal.{Logging, LogKeys, MDC}
+import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.SubExprUtils._
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
@@ -111,9 +112,13 @@ object ValidateSubqueryExpression
       case f: Filter =>
         if (hasOuterReferences(expr.plan)) {
           expr.plan.expressions.foreach(_.foreachUp {
-            case o: OuterReference =>
+            case o@OuterReference(a) =>
               p.children.foreach(e =>
-                if (!e.output.exists(_.exprId == o.exprId)) {
+                if (!e.output.exists(_.exprId == o.exprId) &&
+                  !expr.getOuterScopeAttrs.contains(a)) {
+                  // An outer reference that is missing from the child's output is only
+                  // valid when it is an outer scope reference, i.e. one left for an
+                  // enclosing query to resolve. Otherwise, it is invalid.
                   o.failAnalysis(
                     errorClass = "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY." +
                       "CORRELATED_COLUMN_NOT_FOUND",
@@ -125,11 +130,53 @@ object ValidateSubqueryExpression
       case _ =>
     }
 
+    /**
+     * Rejects a subquery expression that carries outer scope attributes unless support for
+     * it is switched on. The global setting must be enabled and, for scalar, IN (ListQuery)
+     * and EXISTS subqueries, the setting dedicated to that kind as well.
+     * Expressions without outer scope attributes always pass.
+     *
+     * @param expr the subquery expression being validated.
+     * @throws AnalysisException with NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED otherwise.
+     */
+    def checkNestedOuterReferences(expr: SubqueryExpression): Unit = {
+      // Setting dedicated to the kind of `expr`; kinds without one need no extra opt-in.
+      def enabledForKind: Boolean = expr match {
+        case _: ScalarSubquery =>
+          SQLConf.get.getConf(
+            SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_SCALARSUBQUERIES)
+        case _: ListQuery =>
+          SQLConf.get.getConf(
+            SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_INSUBQUERIES)
+        case _: Exists =>
+          SQLConf.get.getConf(
+            SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_EXISTSSUBQUERIES)
+        case _ =>
+          true
+      }
+
+      // Evaluated lazily: the per-kind setting is only read once the global one is on.
+      def supported: Boolean =
+        SQLConf.get.getConf(SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES) && enabledForKind
+
+      val outerScopeAttrs = expr.getOuterScopeAttrs
+      if (outerScopeAttrs.nonEmpty && !supported) {
+        throw new AnalysisException(
+          errorClass = "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED",
+          messageParameters = Map(
+            "expression" -> outerScopeAttrs.map(_.sql).mkString(","))
+        )
+      }
+    }
+
+    // Check if there are nested correlated subqueries in the plan.
+    checkNestedOuterReferences(expr)
+
     // Check if there is outer attribute that cannot be found from the plan.
     checkOuterReference(plan, expr)
 
     expr match {
-      case ScalarSubquery(query, outerAttrs, _, _, _, _, _) =>
+      case ScalarSubquery(query, outerAttrs, _, _, _, _, _, _) =>
         // Scalar subquery must return one column as output.
         if (query.output.size != 1) {
           throw QueryCompilationErrors.subqueryReturnMoreThanOneColumn(query.output.size,

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala
@@ -85,8 +85,8 @@ case class SQLFunction(
       case (None, Some(Project(expr :: Nil, _: OneRowRelation)))
         if !isTableFunc =>
         (Some(expr), None)
-      case (Some(ScalarSubquery(Project(expr :: Nil, _: OneRowRelation), _, _, _, _, _, _)), None)
-        if !isTableFunc =>
+      case (Some(ScalarSubquery(Project(expr :: Nil, _: OneRowRelation),
+        _, _, _, _, _, _, _)), None) if !isTableFunc =>
         (Some(expr), None)
       case (_, _) =>
         (parsedExpression, parsedQuery)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.catalyst.expressions
 
+import org.apache.spark.SparkException
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
 import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, LogicalPlan}
@@ -29,6 +30,8 @@ trait DynamicPruning extends Predicate
  * The DynamicPruningSubquery expression is only used in join operations to prune one side of the
  * join with a filter from the other side of the join. It is inserted in cases where partition
  * pruning can be applied.
+ * The DynamicPruningSubquery expression should only have a single outer
+ * attribute which is the pruning key and should not have any outer scope attributes.
  *
  * @param pruningKey the filtering key of the plan to be pruned.
  * @param buildQuery the build side of the join.
@@ -47,7 +50,7 @@ case class DynamicPruningSubquery(
     onlyInBroadcast: Boolean,
     exprId: ExprId = NamedExpression.newExprId,
     hint: Option[HintInfo] = None)
-  extends SubqueryExpression(buildQuery, Seq(pruningKey), exprId, Seq.empty, hint)
+  extends SubqueryExpression(buildQuery, Seq(pruningKey), Seq.empty, exprId, Seq.empty, hint)
   with DynamicPruning
   with Unevaluable
   with UnaryLike[Expression] {
@@ -67,6 +70,16 @@ case class DynamicPruningSubquery(
     copy()
   }
 
+  // Outer scope attributes are unsupported here: any non-empty input is an internal error.
+  override def withNewOuterScopeAttrs(
+      outerScopeAttrs: Seq[Expression]): DynamicPruningSubquery = outerScopeAttrs match {
+    case Seq() =>
+      this
+    case _ =>
+      throw SparkException.internalError(
+        "DynamicPruningSubquery should not have outer scope attributes.")
+  }
+
   override def withNewHint(hint: Option[HintInfo]): SubqueryExpression = copy(hint = hint)
 
   override lazy val resolved: Boolean = {

diff --git a/...a/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala b/...a/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.expressions
 
 import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, LogicalPlan, Project, Repartition, RepartitionByExpression, Sort}
-import org.apache.spark.sql.catalyst.trees.TreePattern.{FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION, TreePattern}
+import org.apache.spark.sql.catalyst.trees.TreePattern.{FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION, NESTED_CORRELATED_SUBQUERY, TreePattern}
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.types.DataType
 
@@ -46,6 +46,10 @@ import org.apache.spark.sql.types.DataType
  *             relation or as a more complex logical plan in the event of a table subquery.
  * @param outerAttrs outer references of this subquery plan, generally empty since these table
  *                   arguments do not allow correlated references currently
+ * @param outerScopeAttrs outer references of the subquery plan that cannot be resolved by the
+ *                        query directly containing the subquery. They must be a subset of
+ *                        outerAttrs and are generally empty since these table arguments do not
+ *                        allow correlated references currently
  * @param exprId expression ID of this subquery expression, generally generated afresh each time
  * @param partitionByExpressions if non-empty, the TABLE argument included the PARTITION BY clause
  *                               to indicate that the input relation should be repartitioned by the
@@ -67,30 +71,53 @@ import org.apache.spark.sql.types.DataType
 case class FunctionTableSubqueryArgumentExpression(
     plan: LogicalPlan,
     outerAttrs: Seq[Expression] = Seq.empty,
+    outerScopeAttrs: Seq[Expression] = Seq.empty,
     exprId: ExprId = NamedExpression.newExprId,
     partitionByExpressions: Seq[Expression] = Seq.empty,
     withSinglePartition: Boolean = false,
     orderByExpressions: Seq[SortOrder] = Seq.empty,
     selectedInputExpressions: Seq[PythonUDTFSelectedExpression] = Seq.empty)
-  extends SubqueryExpression(plan, outerAttrs, exprId, Seq.empty, None) with Unevaluable {
+  extends SubqueryExpression(
+      plan,
+      outerAttrs,
+      outerScopeAttrs,
+      exprId,
+      Seq.empty,
+      None
+    ) with Unevaluable {
 
   assert(!(withSinglePartition && partitionByExpressions.nonEmpty),
     "WITH SINGLE PARTITION is mutually exclusive with PARTITION BY")
 
   override def dataType: DataType = plan.schema
+
   override def nullable: Boolean = false
+
   override def withNewPlan(plan: LogicalPlan): FunctionTableSubqueryArgumentExpression =
     copy(plan = plan)
+
   override def withNewOuterAttrs(outerAttrs: Seq[Expression])
   : FunctionTableSubqueryArgumentExpression = copy(outerAttrs = outerAttrs)
+
   override def hint: Option[HintInfo] = None
+
   override def withNewHint(hint: Option[HintInfo]): FunctionTableSubqueryArgumentExpression =
     copy()
+
+  // Validates the replacement attributes first, then returns a copy that carries them.
+  override def withNewOuterScopeAttrs(newOuterScopeAttrs: Seq[Expression])
+  : FunctionTableSubqueryArgumentExpression = {
+    validateOuterScopeAttrs(newOuterScopeAttrs)
+    copy(outerScopeAttrs = newOuterScopeAttrs)
+  }
+
   override def toString: String = s"table-argument#${exprId.id} $conditionString"
+
   override lazy val canonicalized: Expression = {
     FunctionTableSubqueryArgumentExpression(
       plan.canonicalized,
       outerAttrs.map(_.canonicalized),
+      outerScopeAttrs.map(_.canonicalized),
       ExprId(0),
       partitionByExpressions,
       withSinglePartition,
@@ -101,8 +128,13 @@ case class FunctionTableSubqueryArgumentExpression(
       newChildren: IndexedSeq[Expression]): FunctionTableSubqueryArgumentExpression =
     copy(outerAttrs = newChildren)
 
-  final override def nodePatternsInternal(): Seq[TreePattern] =
-    Seq(FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION)
+  // Always reports FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION. While this expression
+  // carries outer scope attributes, NESTED_CORRELATED_SUBQUERY is additionally placed in
+  // front of it.
+  final override def nodePatternsInternal(): Seq[TreePattern] = {
+    val ownPattern = Seq(FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION)
+    if (outerScopeAttrs.nonEmpty) NESTED_CORRELATED_SUBQUERY +: ownPattern else ownPattern
+  }
 
   def hasRepartitioning: Boolean = withSinglePartition || partitionByExpressions.nonEmpty