diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 6591a62151d4d..e622b2fd1904b 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4064,6 +4064,12 @@ ], "sqlState" : "07501" }, + "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED" : { + "message" : [ + "Detected outer scope references in the subquery.This is not supported in the current version." + ], + "sqlState" : "0A000" + }, "NONEXISTENT_FIELD_NAME_IN_LIST" : { "message" : [ "Field(s) do(es) not exist. Available fields: " diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 0c9e537eda6bc..19f7316ed5892 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -28,13 +28,7 @@ import scala.util.{Failure, Random, Success, Try} import org.apache.spark.{SparkException, SparkThrowable, SparkUnsupportedOperationException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst._ -import org.apache.spark.sql.catalyst.analysis.resolver.{ - AnalyzerBridgeState, - HybridAnalyzer, - Resolver => OperatorResolver, - ResolverExtension, - ResolverGuard -} +import org.apache.spark.sql.catalyst.analysis.resolver.{AnalyzerBridgeState, HybridAnalyzer, Resolver => OperatorResolver, ResolverExtension, ResolverGuard} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.encoders.OuterScopes import org.apache.spark.sql.catalyst.expressions._ @@ -156,7 +150,7 @@ case class AnalysisContext( // lookup a temporary function. And export to the view metadata. referredTempFunctionNames: mutable.Set[String] = mutable.Set.empty, referredTempVariableNames: Seq[Seq[String]] = Seq.empty, - outerPlan: Option[LogicalPlan] = None, + outerPlans: Option[Seq[LogicalPlan]] = None, isExecuteImmediate: Boolean = false, collation: Option[String] = None, @@ -234,9 +228,9 @@ object AnalysisContext { try f finally { set(originContext) } } - def withOuterPlan[A](outerPlan: LogicalPlan)(f: => A): A = { + def withOuterPlan[A](outerPlans: Seq[LogicalPlan])(f: => A): A = { val originContext = value.get() - val context = originContext.copy(outerPlan = Some(outerPlan)) + val context = originContext.copy(outerPlans = Some(outerPlans)) set(context) try f finally { set(originContext) } } @@ -1799,17 +1793,30 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor s.expand(plan, resolver) } catch { case e: AnalysisException => - AnalysisContext.get.outerPlan.map { - // Only Project, Aggregate, CollectMetrics can host star expressions. - case u @ (_: Project | _: Aggregate | _: CollectMetrics) => - Try(s.expand(u.children.head, resolver)) match { - case Success(expanded) => expanded.map(wrapOuterReference) - case Failure(_) => throw e - } - // Do not use the outer plan to resolve the star expression - // since the star usage is invalid. - case _ => throw e - }.getOrElse { throw e } + val outerPlans = + if (AnalysisContext.get.outerPlans.isDefined) { + AnalysisContext.get.outerPlans.get + } else { + Seq.empty[LogicalPlan] + } + val success = outerPlans.flatMap { plan => + plan match { + // Only Project, Aggregate, CollectMetrics can host star expressions. 
+ case u @ (_: Project | _: Aggregate | _: CollectMetrics) => + Try(s.expand(u.children.head, resolver)) match { + case Success(expanded) => expanded.map(wrapOuterReference) + case Failure(_) => Seq[NamedExpression]() + } + // Do not use the outer plan to resolve the star expression + // since the star usage is invalid. + case _ => Seq[NamedExpression]() + } + } + if (success.nonEmpty) { + return success + } else { + throw e + } } } } @@ -2309,6 +2316,27 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor * Note: CTEs are handled in CTESubstitution. */ object ResolveSubquery extends Rule[LogicalPlan] { + + /** + * Returns the outer scope attributes referenced in the subquery expressions + * in current plan and the children of the current plan. + */ + private def getOuterAttrsNeedToBePropagated(plan: LogicalPlan): Seq[Expression] = { + plan.expressions.flatMap { + case subExpr: SubqueryExpression => subExpr.getOuterScopeAttrs + case in: InSubquery => in.query.getOuterScopeAttrs + case expr if expr.containsPattern(PLAN_EXPRESSION) => + expr.collect { + case subExpr: SubqueryExpression => subExpr.getOuterScopeAttrs + }.flatten + case _ => Seq.empty + } ++ plan.children.flatMap{ + case p if p.containsPattern(PLAN_EXPRESSION) => + getOuterAttrsNeedToBePropagated(p) + case _ => Seq.empty + } + } + /** * Resolves the subquery plan that is referenced in a subquery expression, by invoking the * entire analyzer recursively. We set outer plan in `AnalysisContext`, so that the analyzer @@ -2320,20 +2348,69 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor e: SubqueryExpression, outer: LogicalPlan)( f: (LogicalPlan, Seq[Expression]) => SubqueryExpression): SubqueryExpression = { - val newSubqueryPlan = AnalysisContext.withOuterPlan(outer) { - executeSameContext(e.plan) + val outerPlanContext = AnalysisContext.get.outerPlans + val newSubqueryPlan = if (outerPlanContext.isDefined && + // We don't allow lateral subquery having nested correlation + !e.isInstanceOf[LateralSubquery] + ) { + // The previous outerPlanContext contains resolved outer scope plans + // and unresolved direct outer plan. Append the current outer plan into + // new outerPlanContext as current outer is guaranteed to be resolved. + val updatedOuterPlan = Seq(outer) ++ outerPlanContext.get + AnalysisContext.withOuterPlan(updatedOuterPlan) { + executeSameContext(e.plan) + } + } else { + AnalysisContext.withOuterPlan(Seq(outer)) { + executeSameContext(e.plan) + } } // If the subquery plan is fully resolved, pull the outer references and record // them as children of SubqueryExpression. if (newSubqueryPlan.resolved) { // Record the outer references as children of subquery expression. - f(newSubqueryPlan, SubExprUtils.getOuterReferences(newSubqueryPlan)) + val outer = SubExprUtils.getOuterReferences(newSubqueryPlan) ++ + getOuterAttrsNeedToBePropagated(newSubqueryPlan) + f(newSubqueryPlan, outer) } else { e.withNewPlan(newSubqueryPlan) } } + /** + * Returns the outer references that are not resolved in the current plan {{p}}. + * These outer references are outer scope references which can be resolved + * in outer scope plans. + * If these references cannot be resolved in the whole query plan, an analysis + * exception will be thrown in checkAnalysis or ColumnResolutionHelper$resolve. 
+ */ + private def getNestedOuterReferences( + s: SubqueryExpression, p: LogicalPlan + ): Seq[Expression] = { + val outerReferencesInSubquery = s.getOuterAttrs + + // return outer references cannot be resolved in current plan + outerReferencesInSubquery.filter( + _ match { + case a: AttributeReference => !p.inputSet.contains(a) + case outer: AggregateExpression => + // For resolveSubquery, we only check if the references of the aggregate expression + // can be resolved in the p.inputSet as p might be changed after resolveAggregate. + // Currently we only allow subqueries in the Having clause + // to have aggregate expressions as outer references. + // So if p does not have Aggregate or the output of Aggregate does not have + // this outer reference, UpdateOuterReference won't trigger and we throw + // UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE. + !p.exists{ + case plan: LogicalPlan if outer.references.subsetOf(plan.inputSet) => true + case _ => false + } + case _ => false + } + ) + } + /** * Resolves the subquery. Apart of resolving the subquery and outer references (if any) * in the subquery plan, the children of subquery expression are updated to record the @@ -2345,18 +2422,46 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor */ private def resolveSubQueries(plan: LogicalPlan, outer: LogicalPlan): LogicalPlan = { plan.transformAllExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION), ruleId) { - case s @ ScalarSubquery(sub, _, exprId, _, _, _, _) if !sub.resolved => - resolveSubQuery(s, outer)(ScalarSubquery(_, _, exprId)) - case e @ Exists(sub, _, exprId, _, _) if !sub.resolved => - resolveSubQuery(e, outer)(Exists(_, _, exprId)) - case InSubquery(values, l @ ListQuery(_, _, exprId, _, _, _)) + // There are four kinds of outer references here: + // 1. Outer references which are newly introduced in the subquery `res` + // which can be resolved in current `plan`. + // It is extracted by `SubExprUtils.getOuterReferences(res.plan)` and + // stored among res.outerAttrs + // 2. Outer references which are newly introduced in the subquery `res` + // which cannot be resolved in current `plan` + // It is extracted by `SubExprUtils.getOuterReferences(res.plan)` with + // `getNestedOuterReferences(res, plan)` filter and stored in + // res.outerScopeAttrs + // 3. Outer references which are introduced by nested subquery within `res.plan` + // which can be resolved in current `plan` + // It is extracted by `getOuterAttrsNeedToBePropagated(res.plan)`, filtered + // by `plan.inputSet.contains(_)`, need to be stored in res.outerAttrs + // 4. 
Outer references which are introduced by nested subquery within `res.plan` + // which cannot be resolved in current `plan` + // It is extracted by `getOuterAttrsNeedToBePropagated(res.plan)`, filtered + // by `!plan.inputSet.contains(_)`, need to be stored in + // res.outerAttrs and res.outerScopeAttrs + case s @ ScalarSubquery(sub, _, _, exprId, _, _, _, _) if !sub.resolved => + val res = resolveSubQuery(s, outer)(ScalarSubquery(_, _, Seq.empty, exprId)) + val nestedOuterReferences = getNestedOuterReferences(res, plan) + res.withNewOuterScopeAttrs(nestedOuterReferences) + case e @ Exists(sub, _, _, exprId, _, _) if !sub.resolved => + val res = resolveSubQuery(e, outer)(Exists(_, _, Seq.empty, exprId)) + val nestedOuterReferences = getNestedOuterReferences(res, plan) + res.withNewOuterScopeAttrs(nestedOuterReferences) + case InSubquery(values, l) if values.forall(_.resolved) && !l.resolved => val expr = resolveSubQuery(l, outer)((plan, exprs) => { - ListQuery(plan, exprs, exprId, plan.output.length) - }) - InSubquery(values, expr.asInstanceOf[ListQuery]) - case s @ LateralSubquery(sub, _, exprId, _, _) if !sub.resolved => - resolveSubQuery(s, outer)(LateralSubquery(_, _, exprId)) + ListQuery(plan, exprs, Seq.empty, l.exprId, plan.output.length) + }).asInstanceOf[ListQuery] + val nestedOuterReferences = getNestedOuterReferences(expr, plan) + val newExpr = expr.withNewOuterScopeAttrs(nestedOuterReferences) + InSubquery(values, newExpr) + case s @ LateralSubquery(sub, _, _, exprId, _, _) if !sub.resolved => + val res = resolveSubQuery(s, outer)(LateralSubquery(_, _, Seq.empty, exprId)) + val nestedOuterReferences = getNestedOuterReferences(res, plan) + assert(nestedOuterReferences.isEmpty) + res case a: FunctionTableSubqueryArgumentExpression if !a.plan.resolved => resolveSubQuery(a, outer)( (plan, outerAttrs) => a.copy(plan = plan, outerAttrs = outerAttrs)) @@ -2806,6 +2911,18 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor * and group by expressions from them. */ object ResolveAggregateFunctions extends Rule[LogicalPlan] { + def updateSubqueryOuterReferences(expression: Expression, aggregate: Aggregate): Expression = { + expression.transformUpWithPruning(_.containsPattern(PLAN_EXPRESSION)) { + case sub: SubqueryExpression if sub.getOuterScopeAttrs.nonEmpty => + val newOuterScopeAttrs = + sub.getOuterAttrs.filter( outerExpr => outerExpr match { + case a: AttributeReference => !aggregate.outputSet.contains(a) + case _ => true + }) + sub.withNewOuterScopeAttrs(newOuterScopeAttrs) + } + } + def apply(plan: LogicalPlan): LogicalPlan = { val collatedPlan = if (conf.getConf(SQLConf.RUN_COLLATION_TYPE_CASTS_BEFORE_ALIAS_ASSIGNMENT)) { @@ -2820,7 +2937,11 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor _.containsPattern(AGGREGATE), ruleId) { case UnresolvedHaving(cond, agg: Aggregate) if agg.resolved && cond.resolved => resolveOperatorWithAggregate(Seq(cond), agg, (newExprs, newChild) => { - val newCond = newExprs.head + // Update the subquery in having clause as the aggregate output may be changed + // after the resolution. Some outer references being marked as outer scope + // references might be removed. 
+ val headCond = newExprs.head + val newCond = updateSubqueryOuterReferences(headCond, newChild) if (newCond.resolved) { Filter(newCond, newChild) } else { @@ -4144,7 +4265,7 @@ object UpdateOuterReferences extends Rule[LogicalPlan] { private def updateOuterReferenceInSubquery( plan: LogicalPlan, refExprs: Seq[Expression]): LogicalPlan = { - plan resolveExpressions { case e => + val newPlan = plan resolveExpressions { case e => val outerAlias = refExprs.find(stripAlias(_).semanticEquals(stripOuterReference(e))) outerAlias match { @@ -4152,19 +4273,53 @@ object UpdateOuterReferences extends Rule[LogicalPlan] { case _ => e } } + // The above step might modify the outerAttrs + // in any SubqueryExpressions in the plan. + // We need to make sure the outerAttrs and the outerScopeAttrs are aligned and + // don't contain any outer wrappers. + newPlan.transformAllExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION)) { + case s: SubqueryExpression if s.getOuterAttrs.exists(containsOuter) => + val newOuterScopeAttrs = s.getOuterScopeAttrs.map { e => + val outerAlias = + refExprs.find(stripAlias(_).semanticEquals(stripOuterReference(e))) + outerAlias match { + case Some(a: Alias) => a.toAttribute + case _ => e + } + } + val newOuterAttrs = s.getOuterAttrs.map(stripOuterReference) + s.withNewOuterAttrs(newOuterAttrs).withNewOuterScopeAttrs(newOuterScopeAttrs) + } + } + + def updateOuterReferenceInAllSubqueries( + s: SubqueryExpression, outerAliases: Seq[Alias]): SubqueryExpression = { + val subPlan = s.plan + val planWithNestedSubqueriesRewritten = + subPlan.transformAllExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION)) { + // Only update the nested subqueries if they have outer scope references + // And we don't collect new outerAliases along s.plan because this rule + // will be fired multiple times for each subquery plan in the Analyzer, + // we only collect outerAliases in the outer plan each time. + case s: SubqueryExpression if s.getOuterScopeAttrs.nonEmpty => + updateOuterReferenceInAllSubqueries(s, outerAliases) + } + val newPlan = + updateOuterReferenceInSubquery(planWithNestedSubqueriesRewritten, outerAliases) + s.withNewPlan(newPlan) } def apply(plan: LogicalPlan): LogicalPlan = { plan.resolveOperatorsWithPruning( _.containsAllPatterns(PLAN_EXPRESSION, FILTER, AGGREGATE), ruleId) { case f @ Filter(_, a: Aggregate) if f.resolved => + val outerAliases = a.aggregateExpressions collect { case a: Alias => a } f.transformExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION), ruleId) { case s: SubqueryExpression if s.children.nonEmpty => // Collect the aliases from output of aggregate. - val outerAliases = a.aggregateExpressions collect { case a: Alias => a } // Update the subquery plan to record the OuterReference to point to outer query plan. 
- s.withNewPlan(updateOuterReferenceInSubquery(s.plan, outerAliases)) - } + updateOuterReferenceInAllSubqueries(s, outerAliases) + } } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 61d4820884614..f66fbd33c8674 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -228,6 +228,68 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString } } + def checkNoNestedOuterReferencesInMainQuery(plan: LogicalPlan): Unit = { + def hasOuterScopeAttrsInSubqueryExpression(expr: Expression): Boolean = { + expr.exists { + case subExpr: SubqueryExpression if subExpr.getOuterScopeAttrs.nonEmpty => true + case _ => false + } + } + + def getOuterScopeAttrsFromSubqueryExpression( + plan: LogicalPlan): Seq[(SubqueryExpression, AttributeSet)] = { + val res = plan.expressions.flatMap { + expr => expr.collect { + case subExpr: SubqueryExpression if subExpr.getOuterScopeAttrs.nonEmpty => + (subExpr, subExpr.getOuterScopeAttrs) + } + } + res.map { + case (subExpr, nestedOuterExprs) => + val attrs = nestedOuterExprs.collect { + case a: AttributeReference => a + } + (subExpr, AttributeSet(attrs)) + } + } + + def findFirstOccurence( + plan: LogicalPlan, + outerScopeAttrs: AttributeSet, + operator: LogicalPlan): (LogicalPlan, AttributeSet) = { + val firstOccuredOperator = operator + plan.foreach { + case p if p.expressions.exists(hasOuterScopeAttrsInSubqueryExpression) => + val res = getOuterScopeAttrsFromSubqueryExpression(p) + res.find(_._2.intersect(outerScopeAttrs).nonEmpty) match { + case Some((subExpr, outerScopeAttrsInP)) => + return findFirstOccurence(subExpr.plan, + outerScopeAttrsInP.intersect(outerScopeAttrs), p) + case None => // Do nothing + } + case _ => // Do nothing + } + (firstOccuredOperator, outerScopeAttrs) + } + def throwUnresolvedColumnErrorForOuterScopeAttrs(plan: LogicalPlan): Unit = { + val (subExpr, outerScopeAttrs) = getOuterScopeAttrsFromSubqueryExpression(plan).head + val (operator, missingInput) = findFirstOccurence(subExpr.plan, outerScopeAttrs, plan) + operator.failAnalysis( + errorClass = "MISSING_ATTRIBUTES.RESOLVED_ATTRIBUTE_MISSING_FROM_INPUT", + messageParameters = Map( + "missingAttributes" -> missingInput.toSeq.map(attr => toSQLExpr(attr)).mkString(", "), + "input" -> operator.inputSet.map(attr => toSQLExpr(attr)).mkString(", "), + "operator" -> operator.simpleString(SQLConf.get.maxToStringFields) + ) + ) + } + plan.foreach { + case p: LogicalPlan if p.expressions.exists(hasOuterScopeAttrsInSubqueryExpression) => + throwUnresolvedColumnErrorForOuterScopeAttrs(p) + case _ => + } + } + def checkAnalysis(plan: LogicalPlan): Unit = { // We should inline all CTE relations to restore the original plan shape, as the analysis check // may need to match certain plan shapes. 
For dangling CTE relations, they will still be kept @@ -241,6 +303,7 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString } preemptedError.clear() try { + checkNoNestedOuterReferencesInMainQuery(inlinedPlan) checkAnalysis0(inlinedPlan) preemptedError.getErrorOpt().foreach(throw _) // throw preempted error if any } catch { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala index b2e068fd990ba..6240c7b95b0c9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala @@ -199,21 +199,29 @@ trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase { // Resolves `UnresolvedAttribute` to `OuterReference`. protected def resolveOuterRef(e: Expression): Expression = { - val outerPlan = AnalysisContext.get.outerPlan - if (outerPlan.isEmpty) return e + val outerPlanContext = AnalysisContext.get.outerPlans + if (outerPlanContext.isEmpty) return e def resolve(nameParts: Seq[String]): Option[Expression] = try { - outerPlan.get match { - // Subqueries in UnresolvedHaving can host grouping expressions and aggregate functions. - // We should resolve columns with `agg.output` and the rule `ResolveAggregateFunctions` will - // push them down to Aggregate later. This is similar to what we do in `resolveColumns`. - case u @ UnresolvedHaving(_, agg: Aggregate) => - agg.resolveChildren(nameParts, conf.resolver) - .orElse(u.resolveChildren(nameParts, conf.resolver)) - .map(wrapOuterReference) - case other => - other.resolveChildren(nameParts, conf.resolver).map(wrapOuterReference) + val outerPlans = outerPlanContext.get + val resolvedExpressions = outerPlans.flatMap { + _ match { + // Subqueries in UnresolvedHaving can host grouping + // expressions and aggregate functions. We should resolve + // columns with `agg.output` and the rule `ResolveAggregateFunctions` will + // push them down to Aggregate later. This is similar to what we do in `resolveColumns`. + case u @ UnresolvedHaving(_, agg: Aggregate) => + agg.resolveChildren(nameParts, conf.resolver) + .orElse(u.resolveChildren(nameParts, conf.resolver)) + .map(wrapOuterReference) + case other => + other.resolveChildren(nameParts, conf.resolver).map(wrapOuterReference) + } } + // We use the first resolved expression here + // as the outerPlans are ordered by their depth and the + // first one is the closest to the subquery scope. 
+ resolvedExpressions.headOption } catch { case ae: AnalysisException => logDebug(ae.getMessage) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ValidateSubqueryExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ValidateSubqueryExpression.scala index d6b7a4dccb907..36efde6e7efce 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ValidateSubqueryExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ValidateSubqueryExpression.scala @@ -18,6 +18,7 @@ package org.apache.spark.sql.catalyst.analysis import org.apache.spark.internal.{Logging, LogKeys, MDC} +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression @@ -111,9 +112,13 @@ object ValidateSubqueryExpression case f: Filter => if (hasOuterReferences(expr.plan)) { expr.plan.expressions.foreach(_.foreachUp { - case o: OuterReference => + case o@OuterReference(a) => p.children.foreach(e => - if (!e.output.exists(_.exprId == o.exprId)) { + if (!e.output.exists(_.exprId == o.exprId) && + !expr.getOuterScopeAttrs.contains(a)) { + // If the outer reference is not found in the children plan, + // it should be a outer scope reference. Otherwise, it is + // invalid. o.failAnalysis( errorClass = "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY." + "CORRELATED_COLUMN_NOT_FOUND", @@ -125,11 +130,53 @@ object ValidateSubqueryExpression case _ => } + def checkNestedOuterReferences(expr: SubqueryExpression): Unit = { + if (expr.getOuterScopeAttrs.nonEmpty) { + if (!SQLConf.get.getConf(SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES)) { + throw new AnalysisException( + errorClass = "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + messageParameters = Map( + "expression" -> expr.getOuterScopeAttrs.map(_.sql).mkString(",")) + ) + } + expr match { + case _: ScalarSubquery if + !SQLConf.get.getConf( + SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_SCALARSUBQUERIES) => + throw new AnalysisException( + errorClass = "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + messageParameters = Map( + "expression" -> expr.getOuterScopeAttrs.map(_.sql).mkString(",")) + ) + case _: ListQuery if + !SQLConf.get.getConf( + SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_INSUBQUERIES) => + throw new AnalysisException( + errorClass = "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + messageParameters = Map( + "expression" -> expr.getOuterScopeAttrs.map(_.sql).mkString(",")) + ) + case _: Exists if + !SQLConf.get.getConf( + SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_EXISTSSUBQUERIES) => + throw new AnalysisException( + errorClass = "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + messageParameters = Map( + "expression" -> expr.getOuterScopeAttrs.map(_.sql).mkString(",")) + ) + case _ => // Do nothing + } + } + } + + // Check if there are nested correlated subqueries in the plan. + checkNestedOuterReferences(expr) + // Check if there is outer attribute that cannot be found from the plan. checkOuterReference(plan, expr) expr match { - case ScalarSubquery(query, outerAttrs, _, _, _, _, _) => + case ScalarSubquery(query, outerAttrs, _, _, _, _, _, _) => // Scalar subquery must return one column as output. 
if (query.output.size != 1) { throw QueryCompilationErrors.subqueryReturnMoreThanOneColumn(query.output.size, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala index 923373c1856a9..d65c90099b62c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SQLFunction.scala @@ -85,8 +85,8 @@ case class SQLFunction( case (None, Some(Project(expr :: Nil, _: OneRowRelation))) if !isTableFunc => (Some(expr), None) - case (Some(ScalarSubquery(Project(expr :: Nil, _: OneRowRelation), _, _, _, _, _, _)), None) - if !isTableFunc => + case (Some(ScalarSubquery(Project(expr :: Nil, _: OneRowRelation), + _, _, _, _, _, _, _)), None) if !isTableFunc => (Some(expr), None) case (_, _) => (parsedExpression, parsedQuery) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala index b65576403e9d8..edc84f73289ca 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.expressions +import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, LogicalPlan} @@ -29,6 +30,8 @@ trait DynamicPruning extends Predicate * The DynamicPruningSubquery expression is only used in join operations to prune one side of the * join with a filter from the other side of the join. It is inserted in cases where partition * pruning can be applied. + * The DynamicPruningSubquery expression should only have a single outer + * attribute which is the pruning key and should not have any outer scope attributes. * * @param pruningKey the filtering key of the plan to be pruned. * @param buildQuery the build side of the join. 
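For illustration of what the new ValidateSubqueryExpression checks above are guarding: a correlated reference that skips a query level becomes an "outer scope" reference and, unless the new flags are on, is rejected with NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED. The sketch below is not part of this patch; the table names, the local SparkSession setup, and the use of SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES.key (assumed to be registered elsewhere in this PR) are assumptions for the example.

    // Illustrative only: a two-level correlated EXISTS. Inside the innermost subquery,
    // t1.a cannot be resolved by its immediate parent (the t2 block), so the analyzer
    // records it as an outer scope reference.
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.internal.SQLConf

    object NestedCorrelationExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("nested-correlation-demo").master("local[1]").getOrCreate()
        spark.range(3).toDF("a").createOrReplaceTempView("t1")
        spark.range(3).toDF("b").createOrReplaceTempView("t2")
        spark.range(3).toDF("c").createOrReplaceTempView("t3")
        // Assumes the SQLConf entry referenced by this patch exists; with it disabled the
        // query below is expected to fail with NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED.
        spark.conf.set(SQLConf.SUPPORT_NESTED_CORRELATED_SUBQUERIES.key, "true")
        spark.sql(
          """SELECT * FROM t1
            |WHERE EXISTS (SELECT 1 FROM t2
            |              WHERE EXISTS (SELECT 1 FROM t3 WHERE t3.c = t1.a))""".stripMargin
        ).show()
      }
    }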
@@ -47,7 +50,7 @@ case class DynamicPruningSubquery( onlyInBroadcast: Boolean, exprId: ExprId = NamedExpression.newExprId, hint: Option[HintInfo] = None) - extends SubqueryExpression(buildQuery, Seq(pruningKey), exprId, Seq.empty, hint) + extends SubqueryExpression(buildQuery, Seq(pruningKey), Seq.empty, exprId, Seq.empty, hint) with DynamicPruning with Unevaluable with UnaryLike[Expression] { @@ -67,6 +70,16 @@ case class DynamicPruningSubquery( copy() } + override def withNewOuterScopeAttrs( + outerScopeAttrs: Seq[Expression] + ): DynamicPruningSubquery = { + if (outerScopeAttrs.nonEmpty) { + throw SparkException.internalError( + "DynamicPruningSubquery should not have outer scope attributes.") + } + this + } + override def withNewHint(hint: Option[HintInfo]): SubqueryExpression = copy(hint = hint) override lazy val resolved: Boolean = { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala index bfd3bc8051dff..87b6f91d0dbb1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/FunctionTableSubqueryArgumentExpression.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, LogicalPlan, Project, Repartition, RepartitionByExpression, Sort} -import org.apache.spark.sql.catalyst.trees.TreePattern.{FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION, TreePattern} +import org.apache.spark.sql.catalyst.trees.TreePattern.{FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION, NESTED_CORRELATED_SUBQUERY, TreePattern} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.types.DataType @@ -46,6 +46,10 @@ import org.apache.spark.sql.types.DataType * relation or as a more complex logical plan in the event of a table subquery. * @param outerAttrs outer references of this subquery plan, generally empty since these table * arguments do not allow correlated references currently + * @param outerScopeAttrs outer references of the subquery plan that cannot be resolved by the + * direct containing query of the subquery. 
They have to be the subset of + * outerAttrs and are generally empty since these table arguments do not + * allow correlated references currently * @param exprId expression ID of this subquery expression, generally generated afresh each time * @param partitionByExpressions if non-empty, the TABLE argument included the PARTITION BY clause * to indicate that the input relation should be repartitioned by the @@ -67,30 +71,53 @@ import org.apache.spark.sql.types.DataType case class FunctionTableSubqueryArgumentExpression( plan: LogicalPlan, outerAttrs: Seq[Expression] = Seq.empty, + outerScopeAttrs: Seq[Expression] = Seq.empty, exprId: ExprId = NamedExpression.newExprId, partitionByExpressions: Seq[Expression] = Seq.empty, withSinglePartition: Boolean = false, orderByExpressions: Seq[SortOrder] = Seq.empty, selectedInputExpressions: Seq[PythonUDTFSelectedExpression] = Seq.empty) - extends SubqueryExpression(plan, outerAttrs, exprId, Seq.empty, None) with Unevaluable { + extends SubqueryExpression( + plan, + outerAttrs, + outerScopeAttrs, + exprId, + Seq.empty, + None + ) with Unevaluable { assert(!(withSinglePartition && partitionByExpressions.nonEmpty), "WITH SINGLE PARTITION is mutually exclusive with PARTITION BY") override def dataType: DataType = plan.schema + override def nullable: Boolean = false + override def withNewPlan(plan: LogicalPlan): FunctionTableSubqueryArgumentExpression = copy(plan = plan) + override def withNewOuterAttrs(outerAttrs: Seq[Expression]) : FunctionTableSubqueryArgumentExpression = copy(outerAttrs = outerAttrs) + override def hint: Option[HintInfo] = None + override def withNewHint(hint: Option[HintInfo]): FunctionTableSubqueryArgumentExpression = copy() + + override def withNewOuterScopeAttrs( + newOuterScopeAttrs: Seq[Expression] + ): FunctionTableSubqueryArgumentExpression = { + validateOuterScopeAttrs(newOuterScopeAttrs) + copy(outerScopeAttrs = newOuterScopeAttrs) + } + override def toString: String = s"table-argument#${exprId.id} $conditionString" + override lazy val canonicalized: Expression = { FunctionTableSubqueryArgumentExpression( plan.canonicalized, outerAttrs.map(_.canonicalized), + outerScopeAttrs.map(_.canonicalized), ExprId(0), partitionByExpressions, withSinglePartition, @@ -101,8 +128,13 @@ case class FunctionTableSubqueryArgumentExpression( newChildren: IndexedSeq[Expression]): FunctionTableSubqueryArgumentExpression = copy(outerAttrs = newChildren) - final override def nodePatternsInternal(): Seq[TreePattern] = - Seq(FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION) + final override def nodePatternsInternal(): Seq[TreePattern] = { + if (outerScopeAttrs.isEmpty) { + Seq(FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION) + } else { + Seq(NESTED_CORRELATED_SUBQUERY, FUNCTION_TABLE_RELATION_ARGUMENT_EXPRESSION) + } + } def hasRepartitioning: Boolean = withSinglePartition || partitionByExpressions.nonEmpty diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala index 2af6a1ba84ec8..f7981c586264b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/namedExpressions.scala @@ -444,6 +444,48 @@ case class OuterReference(e: NamedExpression) final override val nodePatterns: Seq[TreePattern] = Seq(OUTER_REFERENCE) } +/** + * A place holder used to hold outer references in DomainJoins to instruct 
RewriteDomainJoins + * the mapping between domain attributes and outer references. + * We use it instead of OuterReference to avoid treating the already decorrelated subqueries + * as correlated when we rerun PullUpCorrelatedPredicates. + */ +case class OuterReferenceForDomainJoin(e: NamedExpression) + extends LeafExpression with NamedExpression with Unevaluable { + override def dataType: DataType = e.dataType + override def nullable: Boolean = e.nullable + override def prettyName: String = "outer" + + override def sql: String = s"$prettyName(${e.sql})" + override def name: String = e.name + override def qualifier: Seq[String] = e.qualifier + override def exprId: ExprId = e.exprId + override def toAttribute: Attribute = e.toAttribute + override def newInstance(): NamedExpression = OuterReferenceForDomainJoin(e.newInstance()) + final override val nodePatterns: Seq[TreePattern] = Seq(OUTER_REFERENCE_FOR_DOMAIN_JOIN) +} + +/** + * A place holder used to hold attributes need to be propagated up through subqueries. + * This should be only used in PullUpCorrelatedPredicates, RewritePredicateSubquery, + * RewriteLateralSubquery, RewriteCorrelatedScalarSubquery rules. + * It can only be used in the intermediate results in the optimization stage, should not appear + * in the physical plan. + */ +case class InnerReference(e: NamedExpression) + extends LeafExpression with NamedExpression with Unevaluable { + override def dataType: DataType = e.dataType + override def nullable: Boolean = e.nullable + override def prettyName: String = "inner" + + override def sql: String = s"$prettyName(${e.sql})" + override def name: String = e.name + override def qualifier: Seq[String] = e.qualifier + override def exprId: ExprId = e.exprId + override def toAttribute: Attribute = e.toAttribute + override def newInstance(): NamedExpression = InnerReference(e.newInstance()) +} + /** * A placeholder used to hold a [[NamedExpression]] that has been temporarily resolved as the * reference to a lateral column alias. It will be restored back to [[UnresolvedAttribute]] if diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala index 210b7f8fb5306..2bd1af7b6825b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/subquery.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.trees.TreePattern._ -import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.util.collection.BitSet @@ -67,6 +67,8 @@ abstract class PlanExpression[T <: QueryPlan[_]] extends Expression { * * @param plan: the subquery plan * @param outerAttrs: the outer references in the subquery plan + * @param outerScopeAttrs: the outer references in the subquery plan that cannot be resolved + * in its immediate parent plan * @param exprId: ID of the expression * @param joinCond: the join conditions with the outer query. It contains both inner and outer * query references. 
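The hunk that follows threads a new outerScopeAttrs parameter through SubqueryExpression: it must be a subset of outerAttrs and is excluded from references, so the immediate parent is only asked to supply attributes it can actually see. A minimal plain-Scala model of that bookkeeping (not the Spark API; Attr and MiniSubquery are made up for illustration):

    case class Attr(name: String)

    case class MiniSubquery(outerAttrs: Set[Attr], outerScopeAttrs: Set[Attr]) {
      require(outerScopeAttrs.subsetOf(outerAttrs),
        "outerScopeAttrs must be a subset of outerAttrs")
      // Only the attributes the *immediate* parent has to provide.
      def references: Set[Attr] = outerAttrs -- outerScopeAttrs
    }

    object MiniSubqueryDemo extends App {
      val sub = MiniSubquery(
        outerAttrs = Set(Attr("t1.a"), Attr("t2.b")),
        outerScopeAttrs = Set(Attr("t1.a")))
      println(sub.references) // Set(Attr(t2.b)): t1.a is left for an enclosing scope to satisfy
    }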
@@ -76,18 +78,50 @@ abstract class PlanExpression[T <: QueryPlan[_]] extends Expression { abstract class SubqueryExpression( plan: LogicalPlan, outerAttrs: Seq[Expression], + outerScopeAttrs: Seq[Expression], exprId: ExprId, joinCond: Seq[Expression], hint: Option[HintInfo]) extends PlanExpression[LogicalPlan] { + override lazy val resolved: Boolean = childrenResolved && plan.resolved + override lazy val references: AttributeSet = - AttributeSet.fromAttributeSets(outerAttrs.map(_.references)) + AttributeSet.fromAttributeSets(outerAttrs.map(_.references)) -- + AttributeSet.fromAttributeSets(outerScopeAttrs.map(_.references)) + override def children: Seq[Expression] = outerAttrs ++ joinCond + override def withNewPlan(plan: LogicalPlan): SubqueryExpression + def withNewOuterAttrs(outerAttrs: Seq[Expression]): SubqueryExpression + + def withNewOuterScopeAttrs(outerScopeAttrs: Seq[Expression]): SubqueryExpression + + def validateOuterScopeAttrs(newOuterScopeAttrs: Seq[Expression]): Unit = { + assert(newOuterScopeAttrs.toSet.subsetOf(outerAttrs.toSet), + s"outerScopeAttrs must be a subset of outerAttrs, " + + s"but got ${newOuterScopeAttrs.mkString(", ")}") + } + + def getOuterScopeAttrs: Seq[Expression] = outerScopeAttrs + + def getOuterAttrs: Seq[Expression] = outerAttrs + + def getJoinCond: Seq[Expression] = joinCond + def isCorrelated: Boolean = outerAttrs.nonEmpty + def hint: Option[HintInfo] + def withNewHint(hint: Option[HintInfo]): SubqueryExpression + + override def nodePatternsInternal(): Seq[TreePattern] = { + if (outerScopeAttrs.nonEmpty) { + Seq(NESTED_CORRELATED_SUBQUERY) + } else { + Seq() + } + } } object SubqueryExpression { @@ -172,6 +206,165 @@ object SubExprUtils extends PredicateHelper { plan.exists(_.expressions.exists(containsOuter)) } + /** + * Given a logical plan, returns TRUE if it has a SubqueryExpression + * with non empty outer references + */ + def containsCorrelatedSubquery(e: Expression): Boolean = { + e.exists{ + case in: InSubquery => in.query.getOuterAttrs.nonEmpty && in.query.getJoinCond.isEmpty + case s: SubqueryExpression => s.getOuterAttrs.nonEmpty && s.getJoinCond.isEmpty + case _ => false + } + } + + /** + * Given a logical plan, returns TRUE if it has an outer reference or + * correlated subqueries. + */ + def hasOuterReferencesConsideringNestedCorrelation(plan: LogicalPlan): Boolean = { + plan.exists(_.expressions.exists { + expr => containsOuter(expr) || containsCorrelatedSubquery(expr) + }) + } + + /** + * Returns TRUE if the scalar subquery has multiple Aggregates and the lower Aggregate + * is vulnerable to count bug. + * If it returns TRUE, we need to handle the count bug in [[DecorrelateInnerQuery]]. + * If it returns FALSE, the scalar subquery either does not have a count bug or it + * has a count bug but we handle it in [[RewriteCorrelatedScalarSubquery#constructLeftJoins]]. + */ + def scalarSubqueryHasCountBug(sub: LogicalPlan): Boolean = { + def mayHaveCountBugAgg(a: Aggregate): Boolean = { + a.groupingExpressions.isEmpty && a.aggregateExpressions.exists(_.exists { + case a: AggregateExpression => a.aggregateFunction.defaultResult.isDefined + case _ => false + }) + } + + // The below logic controls handling count bug for scalar subqueries in + // [[DecorrelateInnerQuery]], and if we don't handle it here, we handle it in + // [[RewriteCorrelatedScalarSubquery#constructLeftJoins]]. 
Note that handling it in + // [[DecorrelateInnerQuery]] is always correct, and turning it off to handle it in + // constructLeftJoins is an optimization, so that additional, redundant left outer joins are + // not introduced. + val conf = SQLConf.get + conf.decorrelateInnerQueryEnabled && + !conf.getConf(SQLConf.LEGACY_SCALAR_SUBQUERY_COUNT_BUG_HANDLING) && + !(sub match { + // Handle count bug only if there exists lower level Aggs with count bugs. It does not + // matter if the top level agg is count bug vulnerable or not, because: + // 1. If the top level agg is count bug vulnerable, it can be handled in + // constructLeftJoins, unless there are lower aggs that are count bug vulnerable. + // E.g. COUNT(COUNT + COUNT) + // 2. If the top level agg is not count bug vulnerable, it can be count bug vulnerable if + // there are lower aggs that are count bug vulnerable. E.g. SUM(COUNT) + case agg: Aggregate => !agg.child.exists { + case lowerAgg: Aggregate => mayHaveCountBugAgg(lowerAgg) + case _ => false + } + case _ => false + }) + } + + /** Returns true if 'query' is guaranteed to return at most 1 row. */ + private def guaranteedToReturnOneRow(query: LogicalPlan): Boolean = { + if (query.maxRows.exists(_ <= 1)) { + return true + } + val aggNode = query match { + case havingPart@Filter(_, aggPart: Aggregate) => Some(aggPart) + case aggPart: Aggregate => Some(aggPart) + // LIMIT 1 is handled above, this is for all other types of LIMITs + case Limit(_, aggPart: Aggregate) => Some(aggPart) + case Project(_, aggPart: Aggregate) => Some(aggPart) + case _: LogicalPlan => None + } + if (!aggNode.isDefined) { + return false + } + val aggregates = aggNode.get.expressions.flatMap(_.collect { + case a: AggregateExpression => a + }) + if (aggregates.isEmpty) { + return false + } + nonEquivalentGroupbyCols(query, aggNode.get).isEmpty + } + + /** Returns TRUE if the scalarSubquery needs a single join. */ + def scalarSubqueryNeedsSingleJoinAfterDecorrelate( + sub: LogicalPlan, needSingleJoinOld: Option[Boolean]): Boolean = { + if (needSingleJoinOld.isDefined) { + needSingleJoinOld.get + } else { + SQLConf.get.getConf(SQLConf.SCALAR_SUBQUERY_USE_SINGLE_JOIN) && !guaranteedToReturnOneRow(sub) + } + } + + /** + * Split the plan for a scalar subquery into the parts above the innermost query block + * (first part of returned value), the HAVING clause of the innermost query block + * (optional second part) and the Aggregate below the HAVING CLAUSE (optional third part). + * When the third part is empty, it means the subquery is a non-aggregated single-row subquery. + */ + def splitSubquery( + plan: LogicalPlan): (Seq[LogicalPlan], Option[Filter], Option[Aggregate]) = { + val topPart = ArrayBuffer.empty[LogicalPlan] + var bottomPart: LogicalPlan = plan + while (true) { + bottomPart match { + case havingPart @ Filter(_, aggPart: Aggregate) => + return (topPart.toSeq, Option(havingPart), Some(aggPart)) + + case aggPart: Aggregate => + // No HAVING clause + return (topPart.toSeq, None, Some(aggPart)) + + case p @ Project(_, child) => + topPart += p + bottomPart = child + + case s @ SubqueryAlias(_, child) => + topPart += s + bottomPart = child + + case p: LogicalPlan if p.maxRows.exists(_ <= 1) => + // Non-aggregated one row subquery. 
+ return (topPart.toSeq, None, None) + + case Filter(_, op) => + throw QueryExecutionErrors.unexpectedOperatorInCorrelatedSubquery(op, " below filter") + + case op @ _ => throw QueryExecutionErrors.unexpectedOperatorInCorrelatedSubquery(op) + } + } + + throw QueryExecutionErrors.unreachableError() + + } + + def scalarSubqueryMayHaveCountBugAfterDecorrelate( + sub: LogicalPlan, + mayHaveCountBugOld: Option[Boolean], + handleCountBugInDecorrelate: Boolean): Boolean = { + if (mayHaveCountBugOld.isDefined) { + // For idempotency, we must save this variable the first time this rule is run, because + // decorrelation introduces a GROUP BY is if one wasn't already present. + mayHaveCountBugOld.get + } else if (handleCountBugInDecorrelate) { + // Count bug was already handled in the above decorrelate function call. + false + } else { + // Check whether the pre-rewrite subquery had empty groupingExpressions. If yes, it may + // be subject to the COUNT bug. If it has non-empty groupingExpressions, there is + // no COUNT bug. + val (topPart, havingNode, aggNode) = splitSubquery(sub) + (aggNode.isDefined && aggNode.get.groupingExpressions.isEmpty) + } + } + /** * Given an expression, returns the expressions which have outer references. Aggregate * expressions are treated in a special way. If the children of aggregate expression contains an @@ -395,12 +588,15 @@ object SubExprUtils extends PredicateHelper { case class ScalarSubquery( plan: LogicalPlan, outerAttrs: Seq[Expression] = Seq.empty, + outerScopeAttrs: Seq[Expression] = Seq.empty, exprId: ExprId = NamedExpression.newExprId, joinCond: Seq[Expression] = Seq.empty, hint: Option[HintInfo] = None, mayHaveCountBug: Option[Boolean] = None, needSingleJoin: Option[Boolean] = None) - extends SubqueryExpression(plan, outerAttrs, exprId, joinCond, hint) with Unevaluable { + extends SubqueryExpression( + plan, outerAttrs, outerScopeAttrs, exprId, joinCond, hint) with Unevaluable { + override def dataType: DataType = { if (!plan.schema.fields.nonEmpty) { throw QueryCompilationErrors.subqueryReturnMoreThanOneColumn(plan.schema.fields.length, @@ -408,16 +604,30 @@ case class ScalarSubquery( } plan.schema.fields.head.dataType } + override def nullable: Boolean = true + override def withNewPlan(plan: LogicalPlan): ScalarSubquery = copy(plan = plan) + override def withNewOuterAttrs(outerAttrs: Seq[Expression]): ScalarSubquery = copy( outerAttrs = outerAttrs) + + override def withNewOuterScopeAttrs( + newOuterScopeAttrs: Seq[Expression] + ): ScalarSubquery = { + validateOuterScopeAttrs(newOuterScopeAttrs) + copy(outerScopeAttrs = newOuterScopeAttrs) + } + override def withNewHint(hint: Option[HintInfo]): ScalarSubquery = copy(hint = hint) + override def toString: String = s"scalar-subquery#${exprId.id} $conditionString" + override lazy val canonicalized: Expression = { ScalarSubquery( plan.canonicalized, outerAttrs.map(_.canonicalized), + outerScopeAttrs.map(_.canonicalized), ExprId(0), joinCond.map(_.canonicalized)) } @@ -428,7 +638,13 @@ case class ScalarSubquery( outerAttrs = newChildren.take(outerAttrs.size), joinCond = newChildren.drop(outerAttrs.size)) - final override def nodePatternsInternal(): Seq[TreePattern] = Seq(SCALAR_SUBQUERY) + final override def nodePatternsInternal(): Seq[TreePattern] = { + if (outerScopeAttrs.isEmpty) { + Seq(SCALAR_SUBQUERY) + } else { + Seq(NESTED_CORRELATED_SUBQUERY, SCALAR_SUBQUERY) + } + } } object ScalarSubquery { @@ -474,21 +690,38 @@ case class UnresolvedTableArgPlanId( case class LateralSubquery( plan: LogicalPlan, 
outerAttrs: Seq[Expression] = Seq.empty, + outerScopeAttrs: Seq[Expression] = Seq.empty, exprId: ExprId = NamedExpression.newExprId, joinCond: Seq[Expression] = Seq.empty, hint: Option[HintInfo] = None) - extends SubqueryExpression(plan, outerAttrs, exprId, joinCond, hint) with Unevaluable { + extends SubqueryExpression( + plan, outerAttrs, outerScopeAttrs, exprId, joinCond, hint) with Unevaluable { + override def dataType: DataType = plan.output.toStructType + override def nullable: Boolean = true + override def withNewPlan(plan: LogicalPlan): LateralSubquery = copy(plan = plan) + override def withNewOuterAttrs(outerAttrs: Seq[Expression]): LateralSubquery = copy( outerAttrs = outerAttrs) + + override def withNewOuterScopeAttrs( + newOuterScopeAttrs: Seq[Expression] + ): LateralSubquery = { + validateOuterScopeAttrs(newOuterScopeAttrs) + copy(outerScopeAttrs = newOuterScopeAttrs) + } + override def withNewHint(hint: Option[HintInfo]): LateralSubquery = copy(hint = hint) + override def toString: String = s"lateral-subquery#${exprId.id} $conditionString" + override lazy val canonicalized: Expression = { LateralSubquery( plan.canonicalized, outerAttrs.map(_.canonicalized), + outerScopeAttrs.map(_.canonicalized), ExprId(0), joinCond.map(_.canonicalized)) } @@ -499,7 +732,17 @@ case class LateralSubquery( outerAttrs = newChildren.take(outerAttrs.size), joinCond = newChildren.drop(outerAttrs.size)) - final override def nodePatternsInternal(): Seq[TreePattern] = Seq(LATERAL_SUBQUERY) + final override def nodePatternsInternal(): Seq[TreePattern] = { + if (outerScopeAttrs.isEmpty) { + Seq(LATERAL_SUBQUERY) + } else { + // Currently we don't support lateral subqueries with + // nested outer references. + assert(false, "Nested outer references are not supported in lateral subqueries." + + " Please file a bug if you see this error.") + Seq(NESTED_CORRELATED_SUBQUERY, LATERAL_SUBQUERY) + } + } } /** @@ -517,20 +760,26 @@ case class LateralSubquery( case class ListQuery( plan: LogicalPlan, outerAttrs: Seq[Expression] = Seq.empty, + outerScopeAttrs: Seq[Expression] = Seq.empty, exprId: ExprId = NamedExpression.newExprId, // The plan of list query may have more columns after de-correlation, and we need to track the // number of the columns of the original plan, to report the data type properly. numCols: Int = -1, joinCond: Seq[Expression] = Seq.empty, hint: Option[HintInfo] = None) - extends SubqueryExpression(plan, outerAttrs, exprId, joinCond, hint) with Unevaluable { + extends SubqueryExpression( + plan, outerAttrs, outerScopeAttrs, exprId, joinCond, hint) with Unevaluable { + def childOutputs: Seq[Attribute] = plan.output.take(numCols) + override def dataType: DataType = if (numCols > 1) { childOutputs.toStructType } else { plan.output.head.dataType } + override lazy val resolved: Boolean = childrenResolved && plan.resolved && numCols != -1 + override def nullable: Boolean = { // ListQuery can't be executed alone so its nullability is not defined. 
// Consider using ListQuery.childOutputs.exists(_.nullable) @@ -540,15 +789,26 @@ case class ListQuery( } false } + override def withNewPlan(plan: LogicalPlan): ListQuery = copy(plan = plan) + override def withNewOuterAttrs(outerAttrs: Seq[Expression]): ListQuery = copy( outerAttrs = outerAttrs) + + override def withNewOuterScopeAttrs(newOuterScopeAttrs: Seq[Expression]): ListQuery = { + validateOuterScopeAttrs(newOuterScopeAttrs) + copy(outerScopeAttrs = newOuterScopeAttrs) + } + override def withNewHint(hint: Option[HintInfo]): ListQuery = copy(hint = hint) + override def toString: String = s"list#${exprId.id} $conditionString" + override lazy val canonicalized: Expression = { ListQuery( plan.canonicalized, outerAttrs.map(_.canonicalized), + outerScopeAttrs.map(_.canonicalized), ExprId(0), numCols, joinCond.map(_.canonicalized)) @@ -559,7 +819,13 @@ case class ListQuery( outerAttrs = newChildren.take(outerAttrs.size), joinCond = newChildren.drop(outerAttrs.size)) - final override def nodePatternsInternal(): Seq[TreePattern] = Seq(LIST_SUBQUERY) + final override def nodePatternsInternal(): Seq[TreePattern] = { + if (outerScopeAttrs.isEmpty) { + Seq(LIST_SUBQUERY) + } else { + Seq(NESTED_CORRELATED_SUBQUERY, LIST_SUBQUERY) + } + } } /** @@ -591,22 +857,35 @@ case class ListQuery( case class Exists( plan: LogicalPlan, outerAttrs: Seq[Expression] = Seq.empty, + outerScopeAttrs: Seq[Expression] = Seq.empty, exprId: ExprId = NamedExpression.newExprId, joinCond: Seq[Expression] = Seq.empty, hint: Option[HintInfo] = None) - extends SubqueryExpression(plan, outerAttrs, exprId, joinCond, hint) + extends SubqueryExpression(plan, outerAttrs, outerScopeAttrs, exprId, joinCond, hint) with Predicate with Unevaluable { + override def nullable: Boolean = false + override def withNewPlan(plan: LogicalPlan): Exists = copy(plan = plan) + override def withNewOuterAttrs(outerAttrs: Seq[Expression]): Exists = copy( outerAttrs = outerAttrs) + + override def withNewOuterScopeAttrs(newOuterScopeAttrs: Seq[Expression]): Exists = { + validateOuterScopeAttrs(newOuterScopeAttrs) + copy(outerScopeAttrs = newOuterScopeAttrs) + } + override def withNewHint(hint: Option[HintInfo]): Exists = copy(hint = hint) + override def toString: String = s"exists#${exprId.id} $conditionString" + override lazy val canonicalized: Expression = { Exists( plan.canonicalized, outerAttrs.map(_.canonicalized), + outerScopeAttrs.map(_.canonicalized), ExprId(0), joinCond.map(_.canonicalized)) } @@ -616,7 +895,13 @@ case class Exists( outerAttrs = newChildren.take(outerAttrs.size), joinCond = newChildren.drop(outerAttrs.size)) - final override def nodePatternsInternal(): Seq[TreePattern] = Seq(EXISTS_SUBQUERY) + final override def nodePatternsInternal(): Seq[TreePattern] = { + if (outerScopeAttrs.isEmpty) { + Seq(EXISTS_SUBQUERY) + } else { + Seq(NESTED_CORRELATED_SUBQUERY, EXISTS_SUBQUERY) + } + } } case class UnresolvedExistsPlanId(planId: Long) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/DecorrelateInnerQuery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/DecorrelateInnerQuery.scala index 47cee2e789c7c..311db629055b4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/DecorrelateInnerQuery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/DecorrelateInnerQuery.scala @@ -24,7 +24,7 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ import 
org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ -import org.apache.spark.sql.catalyst.trees.TreePattern.OUTER_REFERENCE +import org.apache.spark.sql.catalyst.trees.TreePattern.{NESTED_CORRELATED_SUBQUERY, OUTER_REFERENCE, OUTER_REFERENCE_FOR_DOMAIN_JOIN, PLAN_EXPRESSION} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.collection.Utils @@ -128,6 +128,14 @@ object DecorrelateInnerQuery extends PredicateHelper { AttributeSet(expression.collect { case o: OuterReference => o.toAttribute }) } + /** + * Collect outer references for domain joins + * in an expressions that are in the output attributes of the outer plan. + */ + private def collectOuterReferencesForDomainJoin(expression: Expression): AttributeSet = { + AttributeSet(expression.collect { case o: OuterReferenceForDomainJoin => o.toAttribute }) + } + /** * Collect outer references in a sequence of expressions that are in the output attributes * of the outer plan. @@ -214,7 +222,9 @@ object DecorrelateInnerQuery extends PredicateHelper { def deduplicate( innerPlan: LogicalPlan, conditions: Seq[Expression], - outerOutputSet: AttributeSet): (LogicalPlan, Seq[Expression]) = { + outerOutputSet: AttributeSet, + outerReferenceMap: AttributeMap[Attribute] = AttributeMap.empty[Attribute] + ): (LogicalPlan, Seq[Expression], AttributeMap[Attribute]) = { val duplicates = innerPlan.outputSet.intersect(outerOutputSet) if (duplicates.nonEmpty) { val aliasMap = AttributeMap(duplicates.map { dup => @@ -227,9 +237,12 @@ object DecorrelateInnerQuery extends PredicateHelper { val aliasedConditions = conditions.map(_.transform { case ref: Attribute => aliasMap.getOrElse(ref, ref).toAttribute }) - (aliasedProjection, aliasedConditions) + val aliasedOuterReferenceMap = AttributeMap(outerReferenceMap.map { + case (k, v) => k -> aliasMap.getOrElse(v, v).toAttribute + }) + (aliasedProjection, aliasedConditions, aliasedOuterReferenceMap) } else { - (innerPlan, conditions) + (innerPlan, conditions, outerReferenceMap) } } @@ -383,6 +396,127 @@ object DecorrelateInnerQuery extends PredicateHelper { } } + def groupDomainsByItsTargetOuterPlan( + domainAttrs: Seq[Attribute], + domainAttrMap: AttributeMap[Expression], + possibleOuterPlans: Seq[LogicalPlan] + ): Seq[(Seq[Attribute], AttributeMap[Expression], LogicalPlan)] = { + val reverseIndex = possibleOuterPlans.indices.reverse + val outerPlanIdToAttrMap = domainAttrs.foldLeft(Map.empty[Int, Seq[Attribute]]) { (acc, attr) => + val matchingPlanIndex = reverseIndex + .find { i => + val outerPlanOutputSet = possibleOuterPlans(i).outputSet + val outerReferences = collectOuterReferences(domainAttrMap(attr)) + outerReferences.subsetOf(outerPlanOutputSet) + } + .getOrElse(throw SparkException.internalError("Cannot find outer references")) + + acc.updated(matchingPlanIndex, acc.getOrElse(matchingPlanIndex, Seq.empty) :+ attr) + } + + outerPlanIdToAttrMap.map { case (i, attrs) => + val newDomainAttrMap = AttributeMap(attrs.map(attr => attr -> domainAttrMap(attr))) + (attrs, newDomainAttrMap, possibleOuterPlans(i)) + }.toSeq + } + + def rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans: Seq[LogicalPlan], + innerPlan: LogicalPlan + ): LogicalPlan = innerPlan match { + case d @ DomainJoin(domainAttrs, child, joinType, outerJoinCondition) => + assert(outerJoinCondition.isDefined, + "DomainJoin should always have the join condition defined") + val newChild = 
joinType match { + // Left outer domain joins are used to handle the COUNT bug. + case LeftOuter => + // Replace the attributes in the domain join condition with the actual outer expressions + // and use the new join conditions to rewrite domain joins in its child. For example: + // DomainJoin [c'] LeftOuter (a = c') with domainAttrMap: { c' -> _1 }. + // Then the new conditions to use will be [(a = _1)]. + assert(outerJoinCondition.isDefined, + s"LeftOuter domain join should always have the join condition defined:\n$d") + // Recursively rewrite domain joins using the new conditions. + rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans, child) + case Inner => + // The decorrelation framework adds domain inner joins by traversing down the plan tree + // recursively until it reaches a node that is not correlated with the outer query. + // So the child node of a domain inner join shouldn't contain another domain join. + assert(!child.exists(_.isInstanceOf[DomainJoin]), + s"Child of a domain inner join shouldn't contain another domain join.\n$child") + child + case o => + throw SparkException.internalError(s"Unexpected domain join type $o") + } + + // We only needs the domain join conditions that contain outer references, + // which stores the mapping between the domain attributes and the outer plan attributes. + val conditions = splitConjunctivePredicates(outerJoinCondition.get) + val (conditionsContainingOuterReferenceForDomainJoin, conditionsNotContainingOuter) = + conditions.partition(_.containsPattern(OUTER_REFERENCE_FOR_DOMAIN_JOIN)) + val conditionsContainingOuter = + conditionsContainingOuterReferenceForDomainJoin.map(_.transform { + case OuterReferenceForDomainJoin(a) => OuterReference(a) + }) + val domainAttrMap = buildDomainAttrMap(conditionsContainingOuter, domainAttrs) + assert((joinType == Inner && conditionsNotContainingOuter.isEmpty) + || (joinType == LeftOuter && conditionsNotContainingOuter.nonEmpty), + "LeftOuter domain join should have conditions not containing outer references," + + "and Inner domain join should have all conditions containing outer references.") + val domainJoinCond = conditionsNotContainingOuter.reduceOption(And) + + // We should only rewrite a domain join when all corresponding outer plan attributes + // can be found from the join condition. + if (domainAttrMap.size == domainAttrs.size) { + val domainInfoSeq = + groupDomainsByItsTargetOuterPlan(domainAttrs, + AttributeMap(domainAttrMap), possibleOuterPlans) + val plan = domainInfoSeq.foldLeft(newChild) { + case (newChild, (domainAttrs, domainAttrMap, outerPlan)) => + val groupingExprs = stripOuterReferences(domainAttrs.map(domainAttrMap)) + val aggregateExprs = groupingExprs.zip(domainAttrs).map { + // Rebuild the aliases. + case (inputAttr, outputAttr) => Alias(inputAttr, outputAttr.name)(outputAttr.exprId) + } + // Construct a domain with the outer query plan. + // DomainJoin [a', b'] => Aggregate [a, b] [a AS a', b AS b'] + // +- Relation [a, b] + val domain = Aggregate(groupingExprs, aggregateExprs, outerPlan) + newChild match { + // A special optimization for OneRowRelation. + // TODO: add a more general rule to optimize join with OneRowRelation. + case _: OneRowRelation => domain + // Construct a domain join. 
+ // Join joinType condition + // :- Domain + // +- Inner Query + case _ => Join(domain, newChild, joinType, domainJoinCond, JoinHint.NONE) + } + } + assert(newChild.outputSet.subsetOf(plan.outputSet)) + // rearrange the output attrs to make sure original outputs are in the front + val projectList = newChild.output ++ plan.output.filterNot(newChild.outputSet.contains) + Project(projectList, plan) + } else { + throw SparkException.internalError( + s"Unable to rewrite domain join with conditions: $conditions\n$d.") + } + case s @ (_ : Union | _: SetOperation) => + // Remap the domain attributes for the children of the set op - see comments on the function. + s.mapChildren { child => + rewriteDomainJoinsConsideringNestedCorrelation(possibleOuterPlans, child) + } + case j: Join if j.joinType == LeftSemi || j.joinType == LeftAnti => + // For the INTERSECT/EXCEPT DISTINCT case, the set op is rewritten to a semi/anti join and we + // need to remap the domain attributes for the right child - see comments on the function. + j.mapChildren { child => + rewriteDomainJoinsConsideringNestedCorrelation(possibleOuterPlans, child) + } + case p: LogicalPlan => + p.mapChildren(rewriteDomainJoinsConsideringNestedCorrelation(possibleOuterPlans, _)) + } + /** * Rewrite all [[DomainJoin]]s in the inner query to actual joins with the outer query. */ @@ -461,11 +595,55 @@ object DecorrelateInnerQuery extends PredicateHelper { p.mapChildren(rewriteDomainJoins(outerPlan, _, conditions)) } + def pushDownSubqueriesToProject(agg: Aggregate): LogicalPlan = { + val subqueriesWithUnresolvedOuter = agg.expressions.flatMap { + expr => expr.collect { + case a@Alias(s: SubqueryExpression, _) if s.getOuterScopeAttrs.nonEmpty => + a -> a + case s: SubqueryExpression if s.getOuterScopeAttrs.nonEmpty => + s -> Alias(s, s"subquery${s.exprId}")() // TODO(avery): is there any other formal name? + } + }.toMap + + if (subqueriesWithUnresolvedOuter.isEmpty) { + // If there are no subqueries with unresolved outer attributes in + // the aggregate expressions, no transformation needed. + return agg + } + + val newAgg = + agg.transformExpressionsDownWithPruning(_.containsPattern(PLAN_EXPRESSION)) { + case a@Alias(s: SubqueryExpression, _) if subqueriesWithUnresolvedOuter.contains(a) => + a.toAttribute + case s: SubqueryExpression if subqueriesWithUnresolvedOuter.contains(s) => + subqueriesWithUnresolvedOuter(s).toAttribute + } + + val projectList = subqueriesWithUnresolvedOuter.values.toSeq + val newProject = Project(newAgg.output ++ projectList, newAgg.child) + Aggregate(newAgg.groupingExpressions, newAgg.aggregateExpressions, newProject, newAgg.hint) + } + + def transformPlanWithPotentialNewOutput(plan: LogicalPlan): LogicalPlan = { + plan transformUpWithNewOutput { + case agg: Aggregate => + // If there are subqueries with unresolved outer attrs, + // we need to push down the subqueries to make sure later + // decorrelation is correct. + val newPlan = pushDownSubqueriesToProject(agg) + val attrMapping = agg.output.zip(newPlan.output) + newPlan -> attrMapping + } + } + def apply( innerPlan: LogicalPlan, outerPlan: LogicalPlan, handleCountBug: Boolean = false): (LogicalPlan, Seq[Expression]) = { val outputPlanInputAttrs = outerPlan.inputSet + var innerHandleCountBug = handleCountBug + val containsNestedCorrelations = + innerPlan.containsPattern(NESTED_CORRELATED_SUBQUERY) // The return type of the recursion. // The first parameter is a new logical plan with correlation eliminated. 
@@ -474,6 +652,249 @@ object DecorrelateInnerQuery extends PredicateHelper {
     // expressions from the inner query that is used to replace outer references.
     type ReturnType = (LogicalPlan, Seq[Expression], AttributeMap[Attribute])

+    /**
+     * This function is used as an aid to enforce idempotency of the pullUpCorrelatedPredicate
+     * rule. In the first call to rewriteSubqueries, all the outer references from the subplan
+     * are pulled up and the join predicates are recorded as children of the enclosing subquery
+     * expression. A subsequent call to rewriteSubqueries simply re-records the `children`,
+     * which already contain the pulled-up correlated predicates (from the previous call) in
+     * the enclosing subquery expression.
+     */
+    def getJoinCondition(newCond: Seq[Expression], oldCond: Seq[Expression]): Seq[Expression] = {
+      if (newCond.isEmpty) oldCond else newCond
+    }
+
+    /**
+     * In short, transformSubqueryExpressions calls decorrelate on any subquery expressions
+     * in the current operator.
+     * For each subquery expression, it performs the following steps:
+     * 1. If the subquery expression has unresolvedOuterAttrs,
+     *    set underNestedSubqueries for decorrelate to true, as
+     *    we'll insert domain joins to eliminate the correlation pessimistically.
+     *    If it doesn't, set underNestedSubqueries to false, as we can resolve all
+     *    pulled-up predicates in the current plan.
+     * 2. Save the current innerHandleCountBug flag and recalculate it according to the
+     *    subquery type.
+     * 3. Call decorrelate to pull up predicates or insert domain joins to eliminate
+     *    the correlation.
+     * 4. Collect join conditions and outer reference maps from decorrelating the subqueries;
+     *    these are conditions that can be resolved in the current plan.
+     * 5. Collect join conditions that need to be pulled up; these include references that
+     *    cannot be resolved in the current plan and therefore must be pulled up.
+     * 6. Reset joinCond, hints and other associated information.
+     */
+    def transformSubqueryExpressions(
+        plan: LogicalPlan
+    ): LogicalPlan = {
+
+      // processJoinConds is a helper function to
+      // 1. Strip the OuterReference wrapper from joinConds
+      // 2. Separate joinConds into two parts:
+      //    a. joinConds containing unresolvedOuterAttrs
+      //    b. joinConds not containing unresolvedOuterAttrs
+      //
+      // 3. Apply different processing logic to the two parts:
+      //    a. For a, unresolved outer attrs in the join conds are wrapped
+      //       with OuterReference again and are added to the pulled-up
+      //       joinConds from the outer plan. Other attrs come from the subquery plan
+      //       and are wrapped with InnerReference in the remaining join conds as
+      //       a hint to add them to the output of the subqueries for the rules rewriting
+      //       subqueries to joins.
+      //    b. For b, they do not have nested correlations and later rules can handle
+      //       them normally.
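To make the split described above concrete outside of Catalyst, here is a minimal standalone sketch in plain Scala; the string literals merely stand in for catalyst Expressions, and the attribute names anticipate the t0/t1/t2 example that follows.

    // Outer-scope attributes that the current plan cannot resolve (the "type a" markers).
    val unresolvedOuterAttrs = Set("t0.a", "t0.b")
    val joinConds = Seq("t0.a = t2.a", "t1.a = t2.a", "t2.b > t0.b")
    // Mirrors the partition step of processJoinConds.
    val (needsPullUp, handledHere) =
      joinConds.partition(cond => unresolvedOuterAttrs.exists(a => cond.contains(a)))
    // needsPullUp == Seq("t0.a = t2.a", "t2.b > t0.b"): re-wrapped in OuterReference and pulled up.
    // handledHere == Seq("t1.a = t2.a"): stays as an ordinary join condition of this subquery.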
+      //
+      // E.g.:
+      // Query: SELECT * FROM t0 WHERE t0.b =
+      //   (SELECT SUM(t1.b) FROM t1 WHERE t1.a =
+      //     (SELECT t2.a FROM t2 WHERE t2.a = t0.a AND t2.b > t0.b AND t2.a = t1.a));
+      //
+      // The unprocessed joinConds from the innermost subquery are:
+      // [outer(t0.a) = t2.a, outer(t1.a) = t2.a, t2.b > outer(t0.b)]
+      // Among the three joinConds,
+      // the ones that must be pulled up (type a) are: [outer(t0.a) = t2.a, t2.b > outer(t0.b)]
+      // the one that can be handled normally (type b) is: [outer(t1.a) = t2.a]
+      //
+      // After processing, the pulled-up joinConds are: [outer(t0.a) = t2.a, t2.b > outer(t0.b)]
+      // The remaining joinConds are:
+      // [t1.a = t2.a, InnerReference(t2.a), InnerReference(t2.b)]
+      // After decorrelation the plan should be:
+      // Project [t0.a, t0.b]
+      // +- Filter [t0.b = scalar-subquery#1[t0.a = t2.a AND t0.b = new_attr_b]]
+      //    +- Aggregate [t2.a, new_attr_b][SUM(t1.b), t2.a, new_attr_b]
+      //       +- Filter [t2.b > new_attr_b]
+      //          +- DomainJoin [new_attr_b][outer(t0.b) = new_attr_b]
+      //             +- Filter [t1.a =
+      //                  scalar-subquery#2 [t1.a = t2.a,
+      //                    InnerReference(t2.a), InnerReference(t2.b)]]
+      //                +- Project [t2.a, t2.b]
+      //                   +- Scan [t2]
+      //             +- Scan [t1]
+      //    +- Scan [t0]
+      //
+      // The pulled-up joinConds are transformed into a filter predicate.
+      // For joinConds that can be pulled up over Aggregate and Union, the filter does not
+      // remain in the plan after decorrelation (it is further pulled up as joinConds in the
+      // enclosing subquery expression). For other joinConds, the proper place to insert the
+      // filter is between the outer plan operator and the subquery expressions.
+      // That's why we run pushDownSubqueriesToProject before decorrelation: it makes sure
+      // the aggregate operator and the subquery expressions are split, so we can insert a
+      // filter between them. For Union, there can't be correlated subquery expressions in
+      // union.expressions, so we don't do any preprocessing for it.
+      def processJoinConds(
+          joinConds: Seq[Expression],
+          unresolvedOuterAttrs: Seq[Expression]
+      ): (Seq[Expression], Seq[Expression]) = {
+        val joinCondsWithoutOuterWrappers = stripOuterReferences(joinConds)
+        val (containUnresolvedOuterAttrs, notContainUnresolvedOuterAttrs) =
+          joinCondsWithoutOuterWrappers.partition(
+            _.exists(expr => unresolvedOuterAttrs.contains(expr)))
+
+        val pulledUpJoinConds = containUnresolvedOuterAttrs.map {
+          _.transform {
+            case a: Attribute if unresolvedOuterAttrs.contains(a) => OuterReference(a)
+          }
+        }
+
+        val remainedJoinConds = containUnresolvedOuterAttrs.flatMap {
+          expr => expr.collect {
+            case a: Attribute if !unresolvedOuterAttrs.contains(a) => InnerReference(a)
+          }
+        } ++ notContainUnresolvedOuterAttrs
+        assert(remainedJoinConds.nonEmpty, "empty join conds break the idempotency of the rule")
+        (remainedJoinConds, pulledUpJoinConds)
+      }
+
+      // transformSubquery is a helper function to transform subquery expressions.
+      // It does the following:
+      // 1. calculate the new handleCountBug
+      // 2. call decorrelate
+      // 3. call deduplicate
+      // 4. restore the previous handleCountBug
+      // 5. process joinConds
+      def transformSubquery(
+          sub: LogicalPlan,
+          outputPlanInputAttrs: AttributeSet,
+          unresolvedOuterAttrs: Seq[Expression],
+          handleCountBugInDecorrelate: Option[Boolean] = None
+      ): (LogicalPlan, Seq[Expression], Seq[Expression]) = {
+        // 1. calculate handleCountBug
+        val previousHandleCountBug = innerHandleCountBug
+        innerHandleCountBug = if (handleCountBugInDecorrelate.isDefined) {
+          handleCountBugInDecorrelate.get
+        } else {
+          true
+        }
+        // 2.
call decorrelate + val (newSub, joinCondsFromSubqueries, outerReferenceMapFromSubqueries) = + decorrelate(BooleanSimplification(sub), AttributeSet.empty, + aggregated = false, underSetOp = false) + // 3. call deduplicate + val (newPlan, newJoinConds, newOuterReferenceMap) = + deduplicate(newSub, joinCondsFromSubqueries, + outputPlanInputAttrs, outerReferenceMapFromSubqueries) + // 4. restore previousHandleCountBug + innerHandleCountBug = previousHandleCountBug + // 5. process joinConds + val (remainedJoinConds, pulledUpJoinConds) = + processJoinConds(newJoinConds, unresolvedOuterAttrs) + (newPlan, remainedJoinConds, pulledUpJoinConds) + } + + val outputPlanInputAttrs = plan.inputSet + var totalPulledUpJoinConds = Seq.empty[Expression] + val newPlan = plan.transformExpressionsDownWithPruning(_.containsPattern(PLAN_EXPRESSION)) { + case in@InSubquery(_, + ListQuery(sub, outerAttrs, unresolvedOuterAttrs, exprId, numCols, joinCond, hint) + ) if outerAttrs.nonEmpty && joinCond.isEmpty => + val (newPlan, remainedJoinConds, pulledUpJoinConds) = + transformSubquery(sub, outputPlanInputAttrs, unresolvedOuterAttrs) + totalPulledUpJoinConds ++= pulledUpJoinConds + val newListQuery = ListQuery(newPlan, outerAttrs, unresolvedOuterAttrs, exprId, numCols, + getJoinCondition( + remainedJoinConds, joinCond), hint) + in.copy(query = newListQuery) + case ScalarSubquery( + sub, outerAttrs, unresolvedOuterAttrs, exprId, + joinCond, hint, mayHaveCountBugOld, needSingleJoinOld) + if outerAttrs.nonEmpty && joinCond.isEmpty => + val handleCountBugInDecorrelate = scalarSubqueryHasCountBug(sub) + val (newPlan, remainedJoinConds, pulledUpJoinConds) + = transformSubquery( + sub, outputPlanInputAttrs, unresolvedOuterAttrs, Some(handleCountBugInDecorrelate)) + totalPulledUpJoinConds ++= pulledUpJoinConds + val mayHaveCountBug = scalarSubqueryMayHaveCountBugAfterDecorrelate( + sub, mayHaveCountBugOld, handleCountBugInDecorrelate) + val needSingleJoin = scalarSubqueryNeedsSingleJoinAfterDecorrelate( + sub, needSingleJoinOld) + ScalarSubquery(newPlan, outerAttrs, unresolvedOuterAttrs, exprId, + getJoinCondition(remainedJoinConds, joinCond), + hint, + Some(mayHaveCountBug), + Some(needSingleJoin)) + case Exists(sub, outerAttrs, unresolvedOuterAttrs, exprId, joinCond, hint) + if outerAttrs.nonEmpty && joinCond.isEmpty => + val (newPlan, remainedJoinConds, pulledUpJoinConds) = + transformSubquery(sub, outputPlanInputAttrs, unresolvedOuterAttrs) + totalPulledUpJoinConds ++= pulledUpJoinConds + Exists(newPlan, outerAttrs, unresolvedOuterAttrs, exprId, + getJoinCondition( + remainedJoinConds, joinCond), hint) + case LateralSubquery(sub, outerAttrs, unresolvedOuterAttrs, exprId, joinCond, hint) + if outerAttrs.nonEmpty && joinCond.isEmpty => + val (newPlan, remainedJoinConds, pulledUpJoinConds) = + transformSubquery(sub, outputPlanInputAttrs, unresolvedOuterAttrs) + totalPulledUpJoinConds ++= pulledUpJoinConds + LateralSubquery(newPlan, outerAttrs, unresolvedOuterAttrs, exprId, + getJoinCondition( + remainedJoinConds, joinCond), hint) + } + if (totalPulledUpJoinConds.nonEmpty) { + Filter(totalPulledUpJoinConds.reduce(And), newPlan) + } else { + newPlan + } + } + + def insertDomainJoin( + plan: LogicalPlan, + attributes: Seq[Attribute] + ): (LogicalPlan, Seq[Expression], AttributeMap[Attribute]) = { + val domains = attributes.map(_.newInstance()) + // A placeholder to be rewritten into domain join. + val outerReferenceMap = Utils.toMap(attributes, domains) + // Build join conditions between domain attributes and outer references. 
+    // EqualNullSafe is used to make sure null key can be joined together. Note
+    // outer referenced attributes can be changed during the outer query optimization.
+    // The equality conditions will also serve as an attribute mapping between new
+    // outer references and domain attributes when rewriting the domain joins.
+    // E.g. if the attribute a is changed to a1, the join condition a' <=> outer(a)
+    // will become a' <=> a1, and we can construct the aliases based on the condition:
+    //   DomainJoin [a']          Join Inner
+    //   +- InnerQuery     =>     :- InnerQuery
+    //                            +- Aggregate [a1] [a1 AS a']
+    //                               +- OuterQuery
+    val conditions = outerReferenceMap.map {
+      case (o, a) =>
+        val cond = EqualNullSafe(a, OuterReference(o))
+        // SPARK-40615: Certain data types (e.g. MapType) do not support ordering, so
+        // the EqualNullSafe join condition can become unresolved.
+        if (!cond.resolved) {
+          if (!RowOrdering.isOrderable(a.dataType)) {
+            throw QueryCompilationErrors.unsupportedCorrelatedReferenceDataTypeError(
+              o, a.dataType, plan.origin)
+          } else {
+            throw SparkException.internalError(s"Unable to decorrelate subquery: " +
+              s"join condition '${cond.sql}' cannot be resolved.")
+          }
+        }
+        cond
+    }
+    val joinConditions: Seq[Expression] = conditions.toSeq
+    val domainJoinCondition = joinConditions.map {
+      _.transform {
+        case OuterReference(a) => OuterReferenceForDomainJoin(a)
+      }
+    }
+    val domainJoin = DomainJoin(domains, plan, Inner, Some(domainJoinCondition.reduce(And)))
+    (domainJoin, joinConditions, AttributeMap(outerReferenceMap))
+  }
+
   // Decorrelate the input plan.
   // parentOuterReferences: a set of parent outer references. As we recurse down we collect the
   // set of outer references that are part of the Domain, and use it to construct the DomainJoins
@@ -498,7 +919,16 @@ object DecorrelateInnerQuery extends PredicateHelper {
       aggregated: Boolean = false,
       underSetOp: Boolean = false
   ): ReturnType = {
-    val isCorrelated = hasOuterReferences(plan)
+    // We can't directly use the NESTED_CORRELATED_SUBQUERY pattern to check whether the plan
+    // is correlated, because this rule might be run multiple times: if the outer references
+    // or the correlated subqueries were already rewritten in previous runs, the plan should
+    // be treated as non-correlated.
+    val isCorrelated =
+      if (containsNestedCorrelations) {
+        hasOuterReferencesConsideringNestedCorrelation(plan)
+      } else {
+        hasOuterReferences(plan)
+      }
     if (!isCorrelated) {
       // We have reached a plan without correlation to the outer plan.
       if (parentOuterReferences.isEmpty) {
@@ -509,41 +939,57 @@ object DecorrelateInnerQuery extends PredicateHelper {
       } else {
         // Build the domain join with the parent outer references.
         val attributes = parentOuterReferences.toSeq
-        val domains = attributes.map(_.newInstance())
-        // A placeholder to be rewritten into domain join.
-        val domainJoin = DomainJoin(domains, plan)
-        val outerReferenceMap = Utils.toMap(attributes, domains)
-        // Build join conditions between domain attributes and outer references.
-        // EqualNullSafe is used to make sure null key can be joined together. Note
-        // outer referenced attributes can be changed during the outer query optimization.
-        // The equality conditions will also serve as an attribute mapping between new
-        // outer references and domain attributes when rewriting the domain joins.
-        // E.g.
if the attribute a is changed to a1, the join condition a' <=> outer(a) - // will become a' <=> a1, and we can construct the aliases based on the condition: - // DomainJoin [a'] Join Inner - // +- InnerQuery => :- InnerQuery - // +- Aggregate [a1] [a1 AS a'] - // +- OuterQuery - val conditions = outerReferenceMap.map { - case (o, a) => - val cond = EqualNullSafe(a, OuterReference(o)) - // SPARK-40615: Certain data types (e.g. MapType) do not support ordering, so - // the EqualNullSafe join condition can become unresolved. - if (!cond.resolved) { - if (!RowOrdering.isOrderable(a.dataType)) { - throw QueryCompilationErrors.unsupportedCorrelatedReferenceDataTypeError( - o, a.dataType, plan.origin) - } else { - throw SparkException.internalError(s"Unable to decorrelate subquery: " + - s"join condition '${cond.sql}' cannot be resolved.") + if (containsNestedCorrelations) { + val (domainJoin, joinConds, outerReferenceMap) = insertDomainJoin(plan, attributes) + (domainJoin, joinConds, outerReferenceMap) + } else { + val domains = attributes.map(_.newInstance()) + // A placeholder to be rewritten into domain join. + val domainJoin = DomainJoin(domains, plan) + val outerReferenceMap = Utils.toMap(attributes, domains) + // Build join conditions between domain attributes and outer references. + // EqualNullSafe is used to make sure null key can be joined together. Note + // outer referenced attributes can be changed during the outer query optimization. + // The equality conditions will also serve as an attribute mapping between new + // outer references and domain attributes when rewriting the domain joins. + // E.g. if the attribute a is changed to a1, the join condition a' <=> outer(a) + // will become a' <=> a1, and we can construct the aliases based on the condition: + // DomainJoin [a'] Join Inner + // +- InnerQuery => :- InnerQuery + // +- Aggregate [a1] [a1 AS a'] + // +- OuterQuery + val conditions = outerReferenceMap.map { + case (o, a) => + val cond = EqualNullSafe(a, OuterReference(o)) + // SPARK-40615: Certain data types (e.g. MapType) do not support ordering, so + // the EqualNullSafe join condition can become unresolved. + if (!cond.resolved) { + if (!RowOrdering.isOrderable(a.dataType)) { + throw QueryCompilationErrors.unsupportedCorrelatedReferenceDataTypeError( + o, + a.dataType, + plan.origin + ) + } else { + throw SparkException.internalError( + s"Unable to decorrelate subquery: " + + s"join condition '${cond.sql}' cannot be resolved." + ) + } } - } - cond + cond + } + (domainJoin, conditions.toSeq, AttributeMap(outerReferenceMap)) } - (domainJoin, conditions.toSeq, AttributeMap(outerReferenceMap)) } } else { - plan match { + val planWithSubqueriesProcessed = + if (containsNestedCorrelations) { + transformSubqueryExpressions(plan) + } else { + plan + } + planWithSubqueriesProcessed match { case Filter(condition, child) => val conditions = splitConjunctivePredicates(condition) val (correlated, uncorrelated) = conditions.partition(containsOuter) @@ -838,7 +1284,7 @@ object DecorrelateInnerQuery extends PredicateHelper { // | 0 | 2 | true | 2 | // | 0 | null | null | 0 | <--- correct result // +---+------+------------+--------------------------------+ - if (groupingExpressions.isEmpty && handleCountBug) { + if (groupingExpressions.isEmpty && innerHandleCountBug) { // Evaluate the aggregate expressions with zero tuples. 
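As a side note for readers less familiar with the COUNT bug that the flag above guards, here is a hedged end-to-end illustration; the table and column names (outer_t, inner_t, id, ref) and the SparkSession value `spark` are invented for the example, not part of this patch.

    // For an outer row with no matching inner rows, the correlated COUNT must evaluate
    // to 0, not NULL. A naive rewrite into "left outer join + aggregate" yields NULL for
    // such rows, which is exactly the case handleCountBug / innerHandleCountBug handles.
    spark.sql("""
      SELECT o.id,
             (SELECT COUNT(*) FROM inner_t i WHERE i.ref = o.id) AS cnt
      FROM outer_t o
    """).show()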
val resultMap = RewriteCorrelatedScalarSubquery.evalAggregateOnZeroTups(newAggregate) val alwaysTrue = Alias(Literal.TrueLiteral, "alwaysTrue")() @@ -904,8 +1350,31 @@ object DecorrelateInnerQuery extends PredicateHelper { // Use the current join conditions returned from the recursive call as the join // conditions for the left outer join. All outer references in the join // conditions are replaced by the newly created domain attributes. - val condition = replaceOuterReferences(joinCond, mapping).reduceOption(And) - val domainJoin = DomainJoin(domainAttrs, agg, LeftOuter, condition) + val domainJoin = + if (containsNestedCorrelations) { + val conds: Seq[Expression] = mapping.map { + case (o, a) => + val cond = EqualNullSafe(a, OuterReferenceForDomainJoin(o)) + // SPARK-40615: Certain data types (e.g. MapType) do not support ordering, so + // the EqualNullSafe join condition can become unresolved. + if (!cond.resolved) { + if (!RowOrdering.isOrderable(a.dataType)) { + throw QueryCompilationErrors.unsupportedCorrelatedReferenceDataTypeError( + o, a.dataType, plan.origin) + } else { + throw SparkException.internalError(s"Unable to decorrelate subquery: " + + s"join condition '${cond.sql}' cannot be resolved.") + } + } + cond + }.toSeq + val condition = replaceOuterReferences(joinCond, mapping).reduce(And) + val addedConditions = (Seq(condition) ++ conds).reduceOption(And) + DomainJoin(domainAttrs, agg, LeftOuter, addedConditions) + } else { + val condition = replaceOuterReferences(joinCond, mapping).reduceOption(And) + DomainJoin(domainAttrs, agg, LeftOuter, condition) + } // Original domain attributes preserved through Aggregate are no longer needed. val newProjectList = projectList.filter(!referencesToAdd.contains(_)) val project = Project(newProjectList ++ domainAttrs, domainJoin) @@ -991,10 +1460,14 @@ object DecorrelateInnerQuery extends PredicateHelper { parentOuterReferences ++ outerReferences -- equivalences.keySet var shouldPushToLeft = joinType match { case LeftOuter | LeftSemiOrAnti(_) | FullOuter => true + case _ if containsNestedCorrelations => + hasOuterReferencesConsideringNestedCorrelation(left) case _ => hasOuterReferences(left) } val shouldPushToRight = joinType match { case RightOuter | FullOuter => true + case _ if containsNestedCorrelations => + hasOuterReferencesConsideringNestedCorrelation(right) case _ => hasOuterReferences(right) } if (shouldDecorrelatePredicates && !shouldPushToLeft && !shouldPushToRight @@ -1107,8 +1580,14 @@ object DecorrelateInnerQuery extends PredicateHelper { } } } - val (newChild, joinCond, _) = decorrelate(BooleanSimplification(innerPlan), AttributeSet.empty) - val (plan, conditions) = deduplicate(newChild, joinCond, outputPlanInputAttrs) + val updatedInnerPlan = if (containsNestedCorrelations) { + transformPlanWithPotentialNewOutput(innerPlan) + } else { + innerPlan + } + val (newChild, joinCond, _) = + decorrelate(BooleanSimplification(updatedInnerPlan), AttributeSet.empty) + val (plan, conditions, _) = deduplicate(newChild, joinCond, outputPlanInputAttrs) (plan, stripOuterReferences(conditions)) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 7b437c302b145..54352d1597ebb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -205,6 +205,13 @@ abstract class 
Optimizer(catalogManager: CatalogManager) OptimizeOneRowRelationSubquery, PullOutNestedDataOuterRefExpressions, PullupCorrelatedPredicates), + // This batch rewrites all correlated subqueries along with any domain joins inside. + // Each rule in the batch is only effective when there are nested correlated subqueries + // in the plan. + Batch("Rewrite Nested Correlated Subqueries", Once, + RewriteDomainJoinsInOnePass, + RewriteCorrelatedSubqueriesInOnePass + ), // Subquery batch applies the optimizer rules recursively. Therefore, it makes no sense // to enforce idempotence on it and we change this batch from Once to FixedPoint(1). Batch("Subquery", FixedPoint(1), @@ -295,6 +302,7 @@ abstract class Optimizer(catalogManager: CatalogManager) RewriteIntersectAll.ruleName, ReplaceDistinctWithAggregate.ruleName, PullupCorrelatedPredicates.ruleName, + RewriteDomainJoinsInOnePass.ruleName, RewriteCorrelatedScalarSubquery.ruleName, RewritePredicateSubquery.ruleName, NormalizeFloatingNumbers.ruleName, @@ -364,7 +372,7 @@ abstract class Optimizer(catalogManager: CatalogManager) case d: DynamicPruningSubquery => d case s @ ScalarSubquery( PhysicalOperation(projections, predicates, a @ Aggregate(group, _, child, _)), - _, _, _, _, mayHaveCountBug, _) + _, _, _, _, _, mayHaveCountBug, _) if conf.getConf(SQLConf.DECORRELATE_SUBQUERY_PREVENT_CONSTANT_FOLDING_FOR_COUNT_BUG) && mayHaveCountBug.nonEmpty && mayHaveCountBug.get => // This is a subquery with an aggregate that may suffer from a COUNT bug. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala index b3236bbfa3755..dbc710ba09afb 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala @@ -91,7 +91,7 @@ object ConstantFolding extends Rule[LogicalPlan] { } // Don't replace ScalarSubquery if its plan is an aggregate that may suffer from a COUNT bug. 
- case s @ ScalarSubquery(_, _, _, _, _, mayHaveCountBug, _) + case s @ ScalarSubquery(_, _, _, _, _, _, mayHaveCountBug, _) if conf.getConf(SQLConf.DECORRELATE_SUBQUERY_PREVENT_CONSTANT_FOLDING_FOR_COUNT_BUG) && mayHaveCountBug.nonEmpty && mayHaveCountBug.get => s @@ -892,7 +892,7 @@ object NullPropagation extends Rule[LogicalPlan] { case InSubquery(Seq(Literal(null, _)), _) if SQLConf.get.legacyNullInEmptyBehavior => Literal.create(null, BooleanType) - case InSubquery(Seq(Literal(null, _)), ListQuery(sub, _, _, _, conditions, _)) + case InSubquery(Seq(Literal(null, _)), ListQuery(sub, _, _, _, _, conditions, _)) if !SQLConf.get.legacyNullInEmptyBehavior && conditions.isEmpty => If(Exists(sub), Literal(null, BooleanType), FalseLiteral) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala index 378081221c8c1..5c49fdaa8cf16 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/subquery.scala @@ -26,16 +26,14 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.ScalarSubquery._ import org.apache.spark.sql.catalyst.expressions.SubExprUtils._ import org.apache.spark.sql.catalyst.expressions.aggregate._ -import org.apache.spark.sql.catalyst.optimizer.RewriteCorrelatedScalarSubquery.splitSubquery import org.apache.spark.sql.catalyst.planning.PhysicalAggregation import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules._ -import org.apache.spark.sql.catalyst.trees.TreePattern.{EXISTS_SUBQUERY, IN_SUBQUERY, LATERAL_JOIN, LIST_SUBQUERY, PLAN_EXPRESSION, SCALAR_SUBQUERY} +import org.apache.spark.sql.catalyst.trees.TreePattern.{EXISTS_SUBQUERY, IN_SUBQUERY, LATERAL_JOIN, LIST_SUBQUERY, NESTED_CORRELATED_SUBQUERY, PLAN_EXPRESSION, SCALAR_SUBQUERY} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.SQLConf.{DECORRELATE_PREDICATE_SUBQUERIES_IN_JOIN_CONDITION, OPTIMIZE_UNCORRELATED_IN_SUBQUERIES_IN_JOIN_CONDITION, - WRAP_EXISTS_IN_AGGREGATE_FUNCTION} +import org.apache.spark.sql.internal.SQLConf.{DECORRELATE_PREDICATE_SUBQUERIES_IN_JOIN_CONDITION, OPTIMIZE_UNCORRELATED_IN_SUBQUERIES_IN_JOIN_CONDITION, WRAP_EXISTS_IN_AGGREGATE_FUNCTION} import org.apache.spark.sql.types._ import org.apache.spark.util.Utils @@ -152,17 +150,17 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { // Filter the plan by applying left semi and left anti joins. 
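The pattern-match edits running through the rest of this file are mechanical: the SubqueryExpression constructors gained an extra outerScopeAttrs field, so every positional extractor needs one more wildcard. A standalone sketch of the same kind of change, with purely hypothetical names, for readers skimming the diff:

    // Hypothetical stand-in for a subquery expression that gains a new field.
    case class Sub(
        plan: String,
        outerAttrs: Seq[String],
        outerScopeAttrs: Seq[String], // newly inserted field
        conditions: Seq[String])

    def joinCondOf(s: Sub): Seq[String] = s match {
      // The pre-change pattern Sub(plan, _, conditions) no longer matches the arity;
      // the extra `_` skips the inserted outerScopeAttrs field.
      case Sub(_, _, _, conditions) => conditions
    }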
withSubquery.foldLeft(newFilter) { - case (p, Exists(sub, _, _, conditions, subHint)) => + case (p, Exists(sub, _, _, _, conditions, subHint)) => val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) val join = buildJoin(outerPlan, rewriteDomainJoinsIfPresent(outerPlan, sub, joinCond), LeftSemi, joinCond, subHint) Project(p.output, join) - case (p, Not(Exists(sub, _, _, conditions, subHint))) => + case (p, Not(Exists(sub, _, _, _, conditions, subHint))) => val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) val join = buildJoin(outerPlan, rewriteDomainJoinsIfPresent(outerPlan, sub, joinCond), LeftAnti, joinCond, subHint) Project(p.output, join) - case (p, InSubquery(values, ListQuery(sub, _, _, _, conditions, subHint))) => + case (p, InSubquery(values, ListQuery(sub, _, _, _, _, conditions, subHint))) => // Deduplicate conflicting attributes if any. val newSub = dedupSubqueryOnSelfJoin(p, sub, Some(values)) val inConditions = values.zip(newSub.output).map(EqualTo.tupled) @@ -170,7 +168,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { val join = Join(outerPlan, rewriteDomainJoinsIfPresent(outerPlan, newSub, joinCond), LeftSemi, joinCond, JoinHint(None, subHint)) Project(p.output, join) - case (p, Not(InSubquery(values, ListQuery(sub, _, _, _, conditions, subHint)))) => + case (p, Not(InSubquery(values, ListQuery(sub, _, _, _, _, conditions, subHint)))) => // This is a NULL-aware (left) anti join (NAAJ) e.g. col NOT IN expr // Construct the condition. A NULL in one of the conditions is regarded as a positive // result; such a row will be filtered out by the Anti-Join operator. @@ -400,7 +398,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { val introducedAttrs = ArrayBuffer.empty[Attribute] val newExprs = exprs.map { e => e.transformDownWithPruning(_.containsAnyPattern(EXISTS_SUBQUERY, IN_SUBQUERY)) { - case Exists(sub, _, _, conditions, subHint) => + case Exists(sub, _, _, _, conditions, subHint) => val exists = AttributeReference("exists", BooleanType, nullable = false)() val existenceJoin = ExistenceJoin(exists) val newCondition = conditions.reduceLeftOption(And) @@ -409,7 +407,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { existenceJoin, newCondition, subHint) introducedAttrs += exists exists - case Not(InSubquery(values, ListQuery(sub, _, _, _, conditions, subHint))) => + case Not(InSubquery(values, ListQuery(sub, _, _, _, _, conditions, subHint))) => val exists = AttributeReference("exists", BooleanType, nullable = false)() // Deduplicate conflicting attributes if any. val newSub = dedupSubqueryOnSelfJoin(newPlan, sub, Some(values)) @@ -434,7 +432,7 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { ExistenceJoin(exists), Some(finalJoinCond), joinHint) introducedAttrs += exists Not(exists) - case InSubquery(values, ListQuery(sub, _, _, _, conditions, subHint)) => + case InSubquery(values, ListQuery(sub, _, _, _, _, conditions, subHint)) => val exists = AttributeReference("exists", BooleanType, nullable = false)() // Deduplicate conflicting attributes if any. 
val newSub = dedupSubqueryOnSelfJoin(newPlan, sub, Some(values)) @@ -528,7 +526,7 @@ object PullupCorrelatedPredicates extends Rule[LogicalPlan] with PredicateHelper val baseConditions = predicateMap.values.flatten.toSeq val outerPlanInputAttrs = outer.inputSet val (newPlan, newCond) = if (outerPlanInputAttrs.nonEmpty) { - val (plan, deDuplicatedConditions) = + val (plan, deDuplicatedConditions, _) = DecorrelateInnerQuery.deduplicate(transformed, baseConditions, outerPlanInputAttrs) (plan, stripOuterReferences(deDuplicatedConditions)) } else { @@ -537,31 +535,6 @@ object PullupCorrelatedPredicates extends Rule[LogicalPlan] with PredicateHelper (newPlan, newCond) } - // Returns true if 'query' is guaranteed to return at most 1 row. - private def guaranteedToReturnOneRow(query: LogicalPlan): Boolean = { - if (query.maxRows.exists(_ <= 1)) { - return true - } - val aggNode = query match { - case havingPart@Filter(_, aggPart: Aggregate) => Some(aggPart) - case aggPart: Aggregate => Some(aggPart) - // LIMIT 1 is handled above, this is for all other types of LIMITs - case Limit(_, aggPart: Aggregate) => Some(aggPart) - case Project(_, aggPart: Aggregate) => Some(aggPart) - case _: LogicalPlan => None - } - if (!aggNode.isDefined) { - return false - } - val aggregates = aggNode.get.expressions.flatMap(_.collect { - case a: AggregateExpression => a - }) - if (aggregates.isEmpty) { - return false - } - nonEquivalentGroupbyCols(query, aggNode.get).isEmpty - } - private def rewriteSubQueries(plan: LogicalPlan): LogicalPlan = { /** * This function is used as a aid to enforce idempotency of pullUpCorrelatedPredicate rule. @@ -587,78 +560,37 @@ object PullupCorrelatedPredicates extends Rule[LogicalPlan] with PredicateHelper } plan.transformExpressionsWithPruning(_.containsPattern(PLAN_EXPRESSION)) { - case ScalarSubquery(sub, children, exprId, conditions, hint, + case ScalarSubquery(sub, children, outerScopeAttrs, exprId, conditions, hint, mayHaveCountBugOld, needSingleJoinOld) if children.nonEmpty => + val handleCountBugInDecorrelate = scalarSubqueryHasCountBug(sub) + val (newPlan, newCond) = decorrelate(sub, plan, handleCountBugInDecorrelate) - def mayHaveCountBugAgg(a: Aggregate): Boolean = { - a.groupingExpressions.isEmpty && a.aggregateExpressions.exists(_.exists { - case a: AggregateExpression => a.aggregateFunction.defaultResult.isDefined - case _ => false - }) - } + val mayHaveCountBug = scalarSubqueryMayHaveCountBugAfterDecorrelate( + sub, mayHaveCountBugOld, handleCountBugInDecorrelate) + val needSingleJoin = scalarSubqueryNeedsSingleJoinAfterDecorrelate( + sub, needSingleJoinOld) - // The below logic controls handling count bug for scalar subqueries in - // [[DecorrelateInnerQuery]], and if we don't handle it here, we handle it in - // [[RewriteCorrelatedScalarSubquery#constructLeftJoins]]. Note that handling it in - // [[DecorrelateInnerQuery]] is always correct, and turning it off to handle it in - // constructLeftJoins is an optimization, so that additional, redundant left outer joins are - // not introduced. - val handleCountBugInDecorrelate = SQLConf.get.decorrelateInnerQueryEnabled && - !conf.getConf(SQLConf.LEGACY_SCALAR_SUBQUERY_COUNT_BUG_HANDLING) && !(sub match { - // Handle count bug only if there exists lower level Aggs with count bugs. It does not - // matter if the top level agg is count bug vulnerable or not, because: - // 1. If the top level agg is count bug vulnerable, it can be handled in - // constructLeftJoins, unless there are lower aggs that are count bug vulnerable. 
- // E.g. COUNT(COUNT + COUNT) - // 2. If the top level agg is not count bug vulnerable, it can be count bug vulnerable if - // there are lower aggs that are count bug vulnerable. E.g. SUM(COUNT) - case agg: Aggregate => !agg.child.exists { - case lowerAgg: Aggregate => mayHaveCountBugAgg(lowerAgg) - case _ => false - } - case _ => false - }) - val (newPlan, newCond) = decorrelate(sub, plan, handleCountBugInDecorrelate) - val mayHaveCountBug = if (mayHaveCountBugOld.isDefined) { - // For idempotency, we must save this variable the first time this rule is run, because - // decorrelation introduces a GROUP BY is if one wasn't already present. - mayHaveCountBugOld.get - } else if (handleCountBugInDecorrelate) { - // Count bug was already handled in the above decorrelate function call. - false - } else { - // Check whether the pre-rewrite subquery had empty groupingExpressions. If yes, it may - // be subject to the COUNT bug. If it has non-empty groupingExpressions, there is - // no COUNT bug. - val (topPart, havingNode, aggNode) = splitSubquery(sub) - (aggNode.isDefined && aggNode.get.groupingExpressions.isEmpty) - } - val needSingleJoin = if (needSingleJoinOld.isDefined) { - needSingleJoinOld.get - } else { - conf.getConf(SQLConf.SCALAR_SUBQUERY_USE_SINGLE_JOIN) && !guaranteedToReturnOneRow(sub) - } - ScalarSubquery(newPlan, children, exprId, getJoinCondition(newCond, conditions), + ScalarSubquery(newPlan, children, outerScopeAttrs, exprId, getJoinCondition(newCond, conditions), hint, Some(mayHaveCountBug), Some(needSingleJoin)) - case Exists(sub, children, exprId, conditions, hint) if children.nonEmpty => + case Exists(sub, children, outerScopeAttrs, exprId, conditions, hint) if children.nonEmpty => val (newPlan, newCond) = if (SQLConf.get.decorrelateInnerQueryEnabledForExistsIn) { decorrelate(sub, plan, handleCountBug = true) } else { pullOutCorrelatedPredicates(sub, plan) } - Exists(newPlan, children, exprId, getJoinCondition(newCond, conditions), hint) - case ListQuery(sub, children, exprId, numCols, conditions, hint) if children.nonEmpty => + Exists(newPlan, children, outerScopeAttrs, exprId, getJoinCondition(newCond, conditions), hint) + case ListQuery(sub, children, outerScopeAttrs, exprId, numCols, conditions, hint) if children.nonEmpty => val (newPlan, newCond) = if (SQLConf.get.decorrelateInnerQueryEnabledForExistsIn) { decorrelate(sub, plan, handleCountBug = true) } else { pullOutCorrelatedPredicates(sub, plan) } val joinCond = getJoinCondition(newCond, conditions) - ListQuery(newPlan, children, exprId, numCols, joinCond, hint) - case LateralSubquery(sub, children, exprId, conditions, hint) if children.nonEmpty => + ListQuery(newPlan, children, outerScopeAttrs, exprId, numCols, joinCond, hint) + case LateralSubquery(sub, children, outerScopeAttrs, exprId, conditions, hint) if children.nonEmpty => val (newPlan, newCond) = decorrelate(sub, plan, handleCountBug = true) - LateralSubquery(newPlan, children, exprId, getJoinCondition(newCond, conditions), hint) + LateralSubquery(newPlan, children, outerScopeAttrs, exprId, getJoinCondition(newCond, conditions), hint) } } @@ -695,7 +627,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe * Extract all correlated scalar subqueries from an expression. The subqueries are collected using * the given collector. The expression is rewritten and returned. 
*/ - private def extractCorrelatedScalarSubqueries[E <: Expression]( + def extractCorrelatedScalarSubqueries[E <: Expression]( expression: E, subqueries: ArrayBuffer[ScalarSubquery]): E = { val newExpression = expression.transformWithPruning(_.containsPattern(SCALAR_SUBQUERY)) { @@ -842,48 +774,6 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe } } - /** - * Split the plan for a scalar subquery into the parts above the innermost query block - * (first part of returned value), the HAVING clause of the innermost query block - * (optional second part) and the Aggregate below the HAVING CLAUSE (optional third part). - * When the third part is empty, it means the subquery is a non-aggregated single-row subquery. - */ - def splitSubquery( - plan: LogicalPlan): (Seq[LogicalPlan], Option[Filter], Option[Aggregate]) = { - val topPart = ArrayBuffer.empty[LogicalPlan] - var bottomPart: LogicalPlan = plan - while (true) { - bottomPart match { - case havingPart @ Filter(_, aggPart: Aggregate) => - return (topPart.toSeq, Option(havingPart), Some(aggPart)) - - case aggPart: Aggregate => - // No HAVING clause - return (topPart.toSeq, None, Some(aggPart)) - - case p @ Project(_, child) => - topPart += p - bottomPart = child - - case s @ SubqueryAlias(_, child) => - topPart += s - bottomPart = child - - case p: LogicalPlan if p.maxRows.exists(_ <= 1) => - // Non-aggregated one row subquery. - return (topPart.toSeq, None, None) - - case Filter(_, op) => - throw QueryExecutionErrors.unexpectedOperatorInCorrelatedSubquery(op, " below filter") - - case op @ _ => throw QueryExecutionErrors.unexpectedOperatorInCorrelatedSubquery(op) - } - } - - throw QueryExecutionErrors.unreachableError() - - } - // Name of generated column used in rewrite below val ALWAYS_TRUE_COLNAME = "alwaysTrue" @@ -898,8 +788,16 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe subqueries: ArrayBuffer[ScalarSubquery]): (LogicalPlan, AttributeMap[Attribute]) = { val subqueryAttrMapping = ArrayBuffer[(Attribute, Attribute)]() val newChild = subqueries.foldLeft(child) { - case (currentChild, ScalarSubquery(sub, _, _, conditions, subHint, mayHaveCountBug, + case (currentChild, ScalarSubquery(sub, _, outerScopeAttrs, _, rawConditions, subHint, mayHaveCountBug, needSingleJoin)) => + val (conditionsContainInnerReferences, conditions) = + rawConditions.partition(_.exists(_.isInstanceOf[InnerReference])) + val neededInnerAttrs = conditionsContainInnerReferences.flatMap(_.collect { + case InnerReference(a) => a + }) + assert(neededInnerAttrs.isEmpty || outerScopeAttrs.nonEmpty, + "Inner references are not allowed for subqueries without nested correlations") + val query = DecorrelateInnerQuery.rewriteDomainJoins(currentChild, sub, conditions) val origOutput = query.output.head // The subquery appears on the right side of the join, hence add its hint to the right @@ -912,7 +810,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe case _ => LeftOuter } lazy val planWithoutCountBug = Project( - currentChild.output :+ origOutput, + (currentChild.output :+ origOutput) ++ neededInnerAttrs, Join(currentChild, query, joinType, conditions.reduceOption(And), joinHint)) if (Utils.isTesting) { @@ -959,7 +857,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe aggValRef), origOutput.name)() subqueryAttrMapping += ((origOutput, subqueryResultExpr.toAttribute)) Project( - currentChild.output :+ subqueryResultExpr, + 
(currentChild.output :+ subqueryResultExpr) ++ neededInnerAttrs, Join(currentChild, Project(query.output :+ alwaysTrueExpr, query), joinType, conditions.reduceOption(And), joinHint)) @@ -991,7 +889,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe subqueryAttrMapping += ((origOutput, caseExpr.toAttribute)) Project( - currentChild.output :+ caseExpr, + (currentChild.output :+ caseExpr) ++ neededInnerAttrs, Join(currentChild, Project(subqueryRoot.output :+ alwaysTrueExpr, subqueryRoot), joinType, conditions.reduceOption(And), joinHint)) @@ -1028,6 +926,19 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe } } + def extractInnerReferenceFromCorrelatedSubqueries( + subqueries: Seq[SubqueryExpression] + ): AttributeSet = { + val innerAttrs = subqueries.flatMap { + sub => sub.getJoinCond.flatMap { + expr => expr.collect { + case InnerReference(a) => a + } + } + } + AttributeSet(innerAttrs) + } + /** * Rewrite [[Filter]], [[Project]] and [[Aggregate]] plans containing correlated scalar * subqueries. @@ -1045,8 +956,10 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe } val (newChild, subqueryAttrMapping) = constructLeftJoins(child, subqueries) val newExprs = updateAttrs(rewriteExprs, subqueryAttrMapping) - val newAgg = Aggregate(newGrouping, newExprs, newChild) - val attrMapping = a.output.zip(newAgg.output) + val innerAttrs = extractInnerReferenceFromCorrelatedSubqueries(subqueries.toSeq) + val newAgg = Aggregate( + newGrouping ++ innerAttrs.toSeq, newExprs ++ innerAttrs.toSeq, newChild) + val attrMapping = a.output.zip(newAgg.output.take(a.output.size)) checkScalarSubqueryInAgg(newAgg) newAgg -> attrMapping } else { @@ -1057,9 +970,10 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe val rewriteExprs = expressions.map(extractCorrelatedScalarSubqueries(_, subqueries)) if (subqueries.nonEmpty) { val (newChild, subqueryAttrMapping) = constructLeftJoins(child, subqueries) + val innerAttrs = extractInnerReferenceFromCorrelatedSubqueries(subqueries.toSeq) val newExprs = updateAttrs(rewriteExprs, subqueryAttrMapping) - val newProj = Project(newExprs, newChild) - val attrMapping = p.output.zip(newProj.output) + val newProj = Project(newExprs ++ innerAttrs.toSeq, newChild) + val attrMapping = p.output.zip(newProj.output.take(p.output.size)) newProj -> attrMapping } else { p -> Nil @@ -1069,9 +983,10 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe val rewriteCondition = extractCorrelatedScalarSubqueries(condition, subqueries) if (subqueries.nonEmpty) { val (newChild, subqueryAttrMapping) = constructLeftJoins(child, subqueries) + val innerAttrs = extractInnerReferenceFromCorrelatedSubqueries(subqueries.toSeq) val newCondition = updateAttrs(Seq(rewriteCondition), subqueryAttrMapping).head - val newProj = Project(f.output, Filter(newCondition, newChild)) - val attrMapping = f.output.zip(newProj.output) + val newProj = Project(f.output ++ innerAttrs.toSeq, Filter(newCondition, newChild)) + val attrMapping = f.output.zip(newProj.output.take(f.output.size)) newProj -> attrMapping } else { f -> Nil @@ -1085,7 +1000,7 @@ object RewriteCorrelatedScalarSubquery extends Rule[LogicalPlan] with AliasHelpe object RewriteLateralSubquery extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan.transformUpWithPruning( _.containsPattern(LATERAL_JOIN)) { - case LateralJoin(left, LateralSubquery(sub, _, _, joinCond, subHint), 
joinType, condition) => + case LateralJoin(left, LateralSubquery(sub, _, _, _, joinCond, subHint), joinType, condition) => val newRight = DecorrelateInnerQuery.rewriteDomainJoins(left, sub, joinCond) val newCond = (condition ++ joinCond).reduceOption(And) // The subquery appears on the right side of the join, hence add the hint to the right side @@ -1093,11 +1008,64 @@ object RewriteLateralSubquery extends Rule[LogicalPlan] { } } +/** + * Recalculate outerAttrs and outerScopeAttrs in SubqueryExpressions. + */ +object RecalculateOuterAttrsAndOuterScopeAttrs extends Rule[LogicalPlan] { + /** + * Returns the outer scope attributes referenced in the subquery expressions + * in current plan and the children of the current plan. + */ + private def getOuterAttrsNeedToBePropagated(plan: LogicalPlan): Seq[Expression] = { + plan.expressions.flatMap { + case subExpr: SubqueryExpression => subExpr.getOuterScopeAttrs + case in: InSubquery => in.query.getOuterScopeAttrs + case expr if expr.containsPattern(PLAN_EXPRESSION) => + expr.collect { + case subExpr: SubqueryExpression => subExpr.getOuterScopeAttrs + }.flatten + case _ => Seq.empty + } ++ plan.children.flatMap{ + case p if p.containsPattern(PLAN_EXPRESSION) => + getOuterAttrsNeedToBePropagated(p) + case _ => Seq.empty + } + } + + private def getNestedOuterReferences( + outerAttrs: Seq[Expression], p: LogicalPlan + ): Seq[Expression] = { + outerAttrs.filter { + _ match { + case a: AttributeReference => !p.inputSet.contains(a) + case n: NamedExpression => !p.inputSet.contains(n.toAttribute) + case _ => false + } + } + } + + def apply0(plan: LogicalPlan): LogicalPlan = plan.transformExpressions { + case s: SubqueryExpression if s.children.nonEmpty && s.getJoinCond.isEmpty => + val newSubPlan = apply(s.plan) + val allOuterAttrs = getOuterReferences(newSubPlan) ++ + getOuterAttrsNeedToBePropagated(newSubPlan) + val nestedOuterAttrs = getNestedOuterReferences(allOuterAttrs, plan) + s.withNewOuterAttrs(allOuterAttrs).withNewOuterScopeAttrs(nestedOuterAttrs) + } + + def apply(plan: LogicalPlan): LogicalPlan = plan.transformUpWithPruning(_.containsPattern(PLAN_EXPRESSION)) { + case p: LogicalPlan if p.expressions.exists(SubqueryExpression.hasCorrelatedSubquery) => + apply0(p) + } +} + /** * This rule optimizes subqueries with OneRowRelation as leaf nodes. 
*/ object OptimizeOneRowRelationSubquery extends Rule[LogicalPlan] { + var needToRecalculateOuterScopeAttrs = false + object OneRowSubquery { def unapply(plan: LogicalPlan): Option[UnaryNode] = { // SPARK-40800: always inline expressions to support a broader range of correlated @@ -1122,7 +1090,7 @@ object OptimizeOneRowRelationSubquery extends Rule[LogicalPlan] { */ private def rewrite(plan: LogicalPlan): LogicalPlan = plan.transformUpWithSubqueries { case LateralJoin( - left, right @ LateralSubquery(OneRowSubquery(plan), _, _, _, _), _, None) + left, right @ LateralSubquery(OneRowSubquery(plan), _, _, _, _, _), _, None) if !hasCorrelatedSubquery(right.plan) && right.joinCond.isEmpty => plan match { case Project(projectList, _: OneRowRelation) => @@ -1145,10 +1113,18 @@ object OptimizeOneRowRelationSubquery extends Rule[LogicalPlan] { case p: LogicalPlan => p.transformExpressionsUpWithPruning( _.containsPattern(SCALAR_SUBQUERY)) { - case s @ ScalarSubquery(OneRowSubquery(p @ Project(_, _: OneRowRelation)), _, _, _, _, _, _) + case s @ ScalarSubquery(OneRowSubquery(p @ Project(_, _: OneRowRelation)), outerAttrs, outerScopeAttrs, _, _, _, _, _) if !hasCorrelatedSubquery(s.plan) && s.joinCond.isEmpty => assert(p.projectList.size == 1) - stripOuterReferences(p.projectList).head + needToRecalculateOuterScopeAttrs = true + val originalOutput = p.projectList.head + // If the outer reference is a outerScopeAttr, even if current subquery + // is eliminated to one or multiple expressions, we can't strip its outer references. + // After the rule is applied, the outerAttrs and the outerScopeAttrs need to be reevaluated. + originalOutput.transform { + case OuterReference(a) if !outerScopeAttrs.contains(a) => + a + } } } @@ -1156,7 +1132,261 @@ object OptimizeOneRowRelationSubquery extends Rule[LogicalPlan] { if (!conf.getConf(SQLConf.OPTIMIZE_ONE_ROW_RELATION_SUBQUERY)) { plan } else { - rewrite(plan) + needToRecalculateOuterScopeAttrs = false + val newPlan = rewrite(plan) + if (needToRecalculateOuterScopeAttrs) { + RecalculateOuterAttrsAndOuterScopeAttrs(newPlan) + } else { + newPlan + } + } + } +} + +/** + * This rule rewrites domain joins created by PullUpCorrelatedPredicates, + * It rewrites all the domain joins within the main query and nested subqueries + * in a top down manner. 
+ */ +object RewriteDomainJoinsInOnePass extends Rule[LogicalPlan] { + private def containsCorrelatedSubquery(expr: Expression): Boolean = { + expr exists { + case s: SubqueryExpression => s.children.nonEmpty + case _ => false + } + } + + private def rewriteDomainJoinsUnderJoin( + j: Join, + possibleOuterPlans: Seq[LogicalPlan] + ): (LogicalPlan, Seq[(Attribute, Attribute)]) = { + val relevantSubqueries = j.condition.get.collect { + case i: InSubquery if i.query.isCorrelated => i + case e: Exists if e.isCorrelated => e + } + if (relevantSubqueries.isEmpty) { + j -> j.output.zip(j.output) + } else { + // `subqueriesWithJoinInputReferenceInfo`is of type Seq[(Expression, Boolean, Boolean)] + // (1): Expression, the join predicate containing some predicate subquery we are interested + // in re-writing + // (2): Boolean, whether (1) references the left join input + // (3): Boolean, whether (1) references the right join input + val subqueriesWithJoinInputReferenceInfo = relevantSubqueries.map { e => + val referenceLeft = e.references.intersect(j.left.outputSet).nonEmpty + val referenceRight = e.references.intersect(j.right.outputSet).nonEmpty + (e, referenceLeft, referenceRight) + } + val subqueriesReferencingBothJoinInputs = subqueriesWithJoinInputReferenceInfo + .filter(i => i._2 && i._3) + + // Currently do not support correlated subqueries in the join predicate that reference both + // join inputs + if (subqueriesReferencingBothJoinInputs.nonEmpty) { + throw QueryCompilationErrors.unsupportedCorrelatedSubqueryInJoinConditionError( + subqueriesReferencingBothJoinInputs.map(_._1) + ) + } + val subqueriesReferencingLeft = subqueriesWithJoinInputReferenceInfo.filter(_._2).map(_._1) + val subqueriesReferencingRight = subqueriesWithJoinInputReferenceInfo.filter(_._3).map(_._1) + if (subqueriesReferencingLeft.isEmpty && subqueriesReferencingRight.isEmpty) { + j -> j.output.zip(j.output) + } else { + var newCondition = j.condition.get + subqueriesReferencingLeft.foldLeft(j.left) { + case (p, e) => + val newSubExpr = e match { + case i: InSubquery => + val subExpr = i.query + val newPlan = + DecorrelateInnerQuery.rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans ++ Seq(p), subExpr.plan) + val newQuery = subExpr.withNewPlan(newPlan) + i.copy(query = newQuery) + case ex: Exists => + val newPlan = + DecorrelateInnerQuery.rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans ++ Seq(p), ex.plan) + ex.copy(plan = newPlan) + } + // Update the join condition to rewrite the subquery expression + newCondition = newCondition.transform { + case expr if expr.fastEquals(e) => newSubExpr + } + p + } + subqueriesReferencingRight.foldLeft(j.right) { + case (p, e) => + val newSubExpr = e match { + case i: InSubquery => + val subExpr = i.query + val newPlan = + DecorrelateInnerQuery.rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans ++ Seq(p), subExpr.plan) + val newQuery = subExpr.withNewPlan(newPlan) + i.copy(query = newQuery) + case ex: Exists => + val newPlan = + DecorrelateInnerQuery.rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans ++ Seq(p), ex.plan) + ex.copy(plan = newPlan) + } + // Update the join condition to rewrite the subquery expression + newCondition = newCondition.transform { + case expr if expr.fastEquals(e) => newSubExpr + } + p + } + val newJ = j.copy(condition = Some(newCondition)) + (newJ, j.output.zip(newJ.output)) + } + } + } + + private def rewriteDomainJoinsUnderUnaryNode( + plan: UnaryNode, + possibleOuterPlans: 
Seq[LogicalPlan] + ): (LogicalPlan, Seq[(Attribute, Attribute)]) = { + val newPlan = plan.transformExpressionsWithPruning(_.containsPattern(SCALAR_SUBQUERY)) { + case s: ScalarSubquery if s.children.nonEmpty => + // rewriteDomainJoins in pre order traversal to + // ensure outer domain joins are rewritten first + val rewrittenSub = DecorrelateInnerQuery.rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans ++ Seq(plan.child), s.plan) + s.copy(plan = rewrittenSub) + case e: Exists + if e.children.nonEmpty && SQLConf.get.decorrelateInnerQueryEnabledForExistsIn => + // TODO(avery): This is different from original rewritePredicateSubquery, check + // if there are any plan changes + val rewrittenSub = DecorrelateInnerQuery.rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans ++ Seq(plan.child), e.plan) + e.copy(plan = rewrittenSub) + case l: ListQuery + if l.children.nonEmpty && SQLConf.get.decorrelateInnerQueryEnabledForExistsIn => + val rewrittenSub = DecorrelateInnerQuery.rewriteDomainJoinsConsideringNestedCorrelation( + possibleOuterPlans ++ Seq(plan.child), l.plan) + l.copy(plan = rewrittenSub) + } + // TODO(avery): I assume there are no changed outputs, check if this is true + assert(plan.sameOutput(newPlan)) + assert(plan.output.size == newPlan.output.size) + val attrMapping = plan.output.zip(newPlan.output) + (newPlan, attrMapping) + } + + private def rewriteDomainJoinsInOneLayer( + plan: LogicalPlan, possibleOuterPlans: Seq[LogicalPlan] + ): LogicalPlan = { + plan transformUpWithNewOutput { + case u: UnaryNode if u.expressions.exists(containsCorrelatedSubquery) => + val (newU, attrMapping) = rewriteDomainJoinsUnderUnaryNode(u, possibleOuterPlans) + newU -> attrMapping + case j: Join if j.condition.exists(cond => + SubqueryExpression.hasInOrCorrelatedExistsSubquery(cond)) && + conf.getConf(DECORRELATE_PREDICATE_SUBQUERIES_IN_JOIN_CONDITION) => + val (newJ, attrMapping) = rewriteDomainJoinsUnderJoin(j, possibleOuterPlans) + newJ -> attrMapping + case l: LateralJoin => + val ls = l.right + val newRightPlan = DecorrelateInnerQuery.rewriteDomainJoins(l.left, ls.plan, ls.getJoinCond) + val newLateralSubquery = ls.copy(plan = newRightPlan) + val newLateralJoin = l.copy(right = newLateralSubquery) + newLateralJoin -> l.output.zip(newLateralJoin.output) + } + } + + private def apply0( + plan: LogicalPlan, + possibleOuterPlans: Seq[LogicalPlan] = Seq.empty[LogicalPlan] + ): LogicalPlan = { + val updatedPlan = rewriteDomainJoinsInOneLayer(plan, possibleOuterPlans) + val res = updatedPlan.transformDownWithPruning(_.containsPattern(PLAN_EXPRESSION)) { + case u: UnaryNode if u.expressions.exists(containsCorrelatedSubquery) => + u.transformAllExpressionsWithPruning( + _.containsPattern(PLAN_EXPRESSION)) { + case s: SubqueryExpression if s.children.nonEmpty => + val newPlan = apply0(s.plan, possibleOuterPlans ++ Seq(u.child)) + s.withNewPlan(newPlan) + } + case j: Join if j.condition.exists(cond => + SubqueryExpression.hasInOrCorrelatedExistsSubquery(cond)) && + conf.getConf(DECORRELATE_PREDICATE_SUBQUERIES_IN_JOIN_CONDITION) => + j.transformAllExpressionsWithPruning( + _.containsPattern(PLAN_EXPRESSION)) { + case s: SubqueryExpression if s.children.nonEmpty => + // TODO(avery): (not sure if all the children should be included) + val newPlan = apply0(s.plan, possibleOuterPlans ++ j.children) + s.withNewPlan(newPlan) + } + case l: LateralJoin => + l.transformAllExpressionsWithPruning( + _.containsPattern(PLAN_EXPRESSION)) { + case s: SubqueryExpression if 
s.children.nonEmpty => + val newPlan = apply0(s.plan, possibleOuterPlans ++ Seq(l.left)) + s.withNewPlan(newPlan) + } + } + res + } + + /** + * Rewrite domain joins in any correlated subquery plan. + */ + def apply(plan: LogicalPlan): LogicalPlan = { + if (plan.containsPattern(NESTED_CORRELATED_SUBQUERY)) { + apply0(plan) + } else { + plan + } + } +} + +/** + * This rule rewrites all correlated subqueries into joins. + * It rewrites the subqueries in a bottom up manner. + * Now it only works for correlated scalar subqueries as currently + * only scalar subqueries are allowed to have nested outer references. + */ +object RewriteCorrelatedSubqueriesInOnePass extends Rule[LogicalPlan] { + private def containsCorrelatedScalarSubquery(e: Expression): Boolean = { + e.exists { + case s: ScalarSubquery if s.children.nonEmpty => true + case _ => false + } + } + + private def apply0(plan: LogicalPlan): LogicalPlan = { + val newPlan = plan.transformUpWithPruning(_.containsPattern(SCALAR_SUBQUERY)) { + case p: LogicalPlan if p.expressions.exists(containsCorrelatedScalarSubquery) => + val subqueries = ArrayBuffer.empty[ScalarSubquery] + p.expressions.map( + RewriteCorrelatedScalarSubquery.extractCorrelatedScalarSubqueries(_, subqueries)) + val newSubqueries = subqueries.map { + case sub => + val newPlan = apply0(sub.plan) + sub.withNewPlan(newPlan) + } + val replaceMap = subqueries.zip(newSubqueries).map { + case (oldSub, newSub) => oldSub -> newSub + }.toMap + val newP = p.transformExpressionsUpWithPruning(_.containsPattern(SCALAR_SUBQUERY)) { + case s: ScalarSubquery => replaceMap.applyOrElse(s, (s: ScalarSubquery) => s) + } + newP + } + RewriteCorrelatedScalarSubquery(newPlan) + } + + def apply(plan: LogicalPlan): LogicalPlan = { + if (plan.containsPattern(NESTED_CORRELATED_SUBQUERY)) { + val newPlan = apply0(plan) + // After rewrite, there should be no nested correlated subqueries left. 
+ assert(!newPlan.containsPattern(NESTED_CORRELATED_SUBQUERY), + "There should be no nested correlated subqueries left after " + + "RewriteCorrelatedSubqueriesInOnePass.") + newPlan + } else { + plan } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala index 3ea32f3cc464f..ba833e58a07e1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreePatterns.scala @@ -69,6 +69,7 @@ object TreePattern extends Enumeration { val LITERAL: Value = Value val MAP_OBJECTS: Value = Value val MULTI_ALIAS: Value = Value + val NESTED_CORRELATED_SUBQUERY: Value = Value val NEW_INSTANCE: Value = Value val NOT: Value = Value val NULL_CHECK: Value = Value @@ -77,6 +78,7 @@ object TreePattern extends Enumeration { val SERIALIZE_FROM_OBJECT: Value = Value val OR: Value = Value val OUTER_REFERENCE: Value = Value + val OUTER_REFERENCE_FOR_DOMAIN_JOIN: Value = Value val PARAMETER: Value = Value val PARAMETERIZED_QUERY: Value = Value val PIPE_EXPRESSION: Value = Value diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 56969eba08cb9..fd601f76d3ffc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -4237,6 +4237,51 @@ object SQLConf { .booleanConf .createWithDefault(true) + // TODO: remove this flag when the nested correlations handling is stable. + val SUPPORT_NESTED_CORRELATED_SUBQUERIES = + buildConf("spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled") + .internal() + .doc("If enabled, support nested correlated subqueries. This is a temporary flag " + + "to enable the new implementation of nested correlated subqueries. " + + "This flag should be set to true when the new implementation is stable " + + "in both analyzer and optimizer. Otherwise, it should be set to false by default.") + .version("4.1.0") + .booleanConf + .createWithDefault(false) + + val SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_SCALARSUBQUERIES = + buildConf("spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled") + .internal() + .doc("If enabled, support nested correlated scalar subqueries. This is a temporary flag " + + "to enable the new implementation of nested correlated subqueries. " + + "This flag should be set to true when the new implementation is stable " + + "in both analyzer and optimizer. Otherwise, it should be set to false by default.") + .version("4.1.0") + .booleanConf + .createWithDefault(false) + + val SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_INSUBQUERIES = + buildConf("spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled") + .internal() + .doc("If enabled, support nested correlated IN subqueries. This is a temporary flag " + + "to enable the new implementation of nested correlated subqueries. " + + "This flag should be set to true when the new implementation is stable " + + "in both analyzer and optimizer. 
Otherwise, it should be set to false by default.") + .version("4.1.0") + .booleanConf + .createWithDefault(false) + + val SUPPORT_NESTED_CORRELATED_SUBQUERIES_FOR_EXISTSSUBQUERIES = + buildConf("spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled") + .internal() + .doc("If enabled, support nested correlated EXISTS subqueries. This is a temporary flag " + + "to enable the new implementation of nested correlated subqueries." + + "This flag should be set to true when the new implementation is stable " + + "in both analyzer and optimizer. Otherwise, it should be set to false by default.") + .version("4.1.0") + .booleanConf + .createWithDefault(false) + val PULL_OUT_NESTED_DATA_OUTER_REF_EXPRESSIONS_ENABLED = buildConf("spark.sql.optimizer.pullOutNestedDataOuterRefExpressions.enabled") .internal() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala index 5f2638655c37c..dd38c1e94e481 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveSubqueries.scala @@ -30,11 +30,11 @@ case class PlanAdaptiveSubqueries( def apply(plan: SparkPlan): SparkPlan = { plan.transformAllExpressionsWithPruning( _.containsAnyPattern(SCALAR_SUBQUERY, IN_SUBQUERY, DYNAMIC_PRUNING_SUBQUERY)) { - case expressions.ScalarSubquery(_, _, exprId, _, _, _, _) => + case expressions.ScalarSubquery(_, _, _, exprId, _, _, _, _) => val subquery = SubqueryExec.createForScalarSubquery( s"subquery#${exprId.id}", subqueryMap(exprId.id)) execution.ScalarSubquery(subquery, exprId) - case expressions.InSubquery(values, ListQuery(_, _, exprId, _, _, _)) => + case expressions.InSubquery(values, ListQuery(_, _, _, exprId, _, _, _)) => val expr = if (values.length == 1) { values.head } else { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala index d5f258a8084be..e07dcc1b5e46d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/subquery.scala @@ -187,7 +187,7 @@ case class PlanSubqueries(sparkSession: SparkSession) extends Rule[SparkPlan] { SubqueryExec.createForScalarSubquery( s"scalar-subquery#${subquery.exprId.id}", executedPlan), subquery.exprId) - case expressions.InSubquery(values, ListQuery(query, _, exprId, _, _, _)) => + case expressions.InSubquery(values, ListQuery(query, _, _, exprId, _, _, _)) => val expr = if (values.length == 1) { values.head } else { diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out index 390fcf5e30152..f8e88a57bf426 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/join-lateral.sql.out @@ -739,19 +739,11 @@ SELECT * FROM t1, LATERAL (SELECT c1, (SELECT SUM(c2) FROM t2 WHERE c1 = t1.c1)) -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`t1`.`c1`", - "proposal" : 
"`t2`.`c1`, `t2`.`c2`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 74, - "stopIndex" : 78, - "fragment" : "t1.c1" - } ] + "expression" : "spark_catalog.default.t1.c1" + } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out index 0577d73ea6a3c..f6683cbca32dc 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/aggregates_part1.sql.out @@ -505,17 +505,9 @@ from tenk1 o -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`o`.`unique1`", - "proposal" : "`i`.`unique1`, `i`.`unique2`, `i`.`even`, `i`.`four`, `i`.`odd`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 71, - "stopIndex" : 79, - "fragment" : "o.unique1" - } ] + "expression" : "o.unique1" + } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-udf.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-udf.sql.out index 8a110190bb020..08a19b10fd35d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/sql-udf.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/sql-udf.sql.out @@ -408,18 +408,11 @@ CREATE FUNCTION foo2_2c(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT a)) -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`a`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 21, - "stopIndex" : 21, - "fragment" : "a" - } ] + "expression" : "foo2_2c.a" + } } @@ -428,18 +421,11 @@ CREATE FUNCTION foo2_2d(a INT) RETURNS INT RETURN 1 + (SELECT (SELECT (SELECT (S -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`a`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 37, - "stopIndex" : 37, - "fragment" : "a" - } ] + "expression" : "foo2_2d.a" + } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out index 95b38e1c7e0f5..1f5db69fdb06b 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/negative-cases/invalid-correlation.sql.out @@ -81,7 +81,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "messageParameters" : { "input" : "\"min(t2a)\", \"t2c\"", "missingAttributes" : "\"t2b\"", - "operator" : "!Filter t2c#x IN (list#x [t2b#x])" + "operator" : "Filter t2c#x IN (list#x [t2b#x])" }, "queryContext" : [ { "objectType" : "", diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/nestedcorrelation/scalar-subquery.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/nestedcorrelation/scalar-subquery.sql.out new file mode 100644 index 0000000000000..52d5e286c2add --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/subquery/nestedcorrelation/scalar-subquery.sql.out @@ -0,0 +1,760 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +DROP TABLE IF EXISTS myt1 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.myt1 + + +-- !query +DROP TABLE IF EXISTS myt2 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.myt2 + + +-- !query +DROP TABLE IF EXISTS myt3 +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.myt3 + + +-- !query +CREATE TABLE myt1(a INT, b INT, c INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`myt1`, false + + +-- !query +CREATE TABLE myt2(a INT, b INT, c INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`myt2`, false + + +-- !query +CREATE TABLE myt3(a INT, b INT, c INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`myt3`, false + + +-- !query +INSERT INTO myt1 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/myt1, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/myt1], Append, `spark_catalog`.`default`.`myt1`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/myt1), [a, b, c] ++- Project [cast(col1#x as int) AS a#x, cast(col2#x as int) AS b#x, cast(col3#x as int) AS c#x] + +- LocalRelation [col1#x, col2#x, col3#x] + + +-- !query +INSERT INTO myt2 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/myt2, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/myt2], Append, `spark_catalog`.`default`.`myt2`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/myt2), [a, b, c] ++- Project [cast(col1#x as int) AS a#x, cast(col2#x as int) AS b#x, cast(col3#x as int) AS c#x] + +- LocalRelation [col1#x, col2#x, col3#x] + + +-- !query +INSERT INTO myt3 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/myt3, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/myt3], Append, `spark_catalog`.`default`.`myt3`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/myt3), [a, b, c] ++- Project [cast(col1#x as int) AS a#x, cast(col2#x as int) AS b#x, cast(col3#x as int) AS c#x] + 
+- LocalRelation [col1#x, col2#x, col3#x] + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query analysis +Project [a#x, b#x, c#x] ++- Filter (a#x = scalar-subquery#x [b#x && c#x]) + : +- Aggregate [max(a#x) AS max(a)#x] + : +- Filter ((a#x = scalar-subquery#x [b#x && c#x]) AND (b#x > outer(b#x))) + : : +- Aggregate [max(a#x) AS max(a)#x] + : : +- Filter ((b#x > outer(b#x)) AND (c#x > outer(c#x))) + : : +- SubqueryAlias spark_catalog.default.myt3 + : : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet + : +- SubqueryAlias spark_catalog.default.myt2 + : +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) AND myt2.b = myt1.b +) +-- !query analysis +Project [a#x, b#x, c#x] ++- Filter (a#x = scalar-subquery#x [b#x && c#x]) + : +- Aggregate [max(a#x) AS max(a)#x] + : +- Filter ((a#x = scalar-subquery#x [b#x && c#x]) AND (b#x = outer(b#x))) + : : +- Aggregate [max(a#x) AS max(a)#x] + : : +- Filter ((b#x = outer(b#x)) AND (c#x = outer(c#x))) + : : +- SubqueryAlias spark_catalog.default.myt3 + : : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet + : +- SubqueryAlias spark_catalog.default.myt2 + : +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query analysis +Project [a#x, b#x, c#x] ++- Filter (cast(a#x as bigint) = scalar-subquery#x [b#x && c#x]) + : +- Aggregate [count(a#x) AS count(a)#xL] + : +- Filter ((cast(a#x as bigint) = scalar-subquery#x [b#x && c#x]) AND (b#x > outer(b#x))) + : : +- Aggregate [count(a#x) AS count(a)#xL] + : : +- Filter ((b#x > outer(b#x)) AND (c#x > outer(c#x))) + : : +- SubqueryAlias spark_catalog.default.myt3 + : : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet + : +- SubqueryAlias spark_catalog.default.myt2 + : +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) AND myt2.b = myt1.b +) +-- !query analysis +Project [a#x, b#x, c#x] ++- Filter (cast(a#x as bigint) = scalar-subquery#x [b#x && c#x]) + : +- Aggregate [count(a#x) AS count(a)#xL] + : +- Filter ((cast(a#x as bigint) = scalar-subquery#x [b#x && c#x]) AND (b#x = outer(b#x))) + : : +- Aggregate [count(a#x) AS count(a)#xL] + : : +- Filter ((b#x = outer(b#x)) AND (c#x = outer(c#x))) + : : +- SubqueryAlias spark_catalog.default.myt3 + : : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet + : +- SubqueryAlias spark_catalog.default.myt2 + : +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation 
spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query analysis +Project [a#x, scalar-subquery#x [c#x] AS scalarsubquery(c)#x] +: +- Project [scalar-subquery#x [b#x && c#x] AS scalarsubquery(b, c)#x] +: : +- Aggregate [max(a#x) AS max(a)#x] +: : +- Filter ((b#x > outer(b#x)) AND (c#x > outer(c#x))) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query analysis +Project [a#x, scalar-subquery#x [c#x] AS scalarsubquery(c)#x] +: +- Project [scalar-subquery#x [b#x && c#x] AS scalarsubquery(b, c)#x] +: : +- Aggregate [max(a#x) AS max(a)#x] +: : +- Filter ((b#x = outer(b#x)) AND (c#x = outer(c#x))) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query analysis +Project [a#x, scalar-subquery#x [c#x] AS scalarsubquery(c)#xL] +: +- Project [scalar-subquery#x [b#x && c#x] AS scalarsubquery(b, c)#xL] +: : +- Aggregate [count(a#x) AS count(a)#xL] +: : +- Filter ((b#x > outer(b#x)) AND (c#x > outer(c#x))) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query analysis +Project [a#x, scalar-subquery#x [c#x] AS scalarsubquery(c)#xL] +: +- Project [scalar-subquery#x [b#x && c#x] AS scalarsubquery(b, c)#xL] +: : +- Aggregate [count(a#x) AS count(a)#xL] +: : +- Filter ((b#x = outer(b#x)) AND (c#x = outer(c#x))) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT MIN( + SELECT MAX( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'MAX'", + "hint" : "" + } +} + + +-- !query +SELECT MIN( + SELECT MAX( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query analysis 
+org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'MAX'", + "hint" : "" + } +} + + +-- !query +SELECT COUNT( + SELECT COUNT( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'COUNT'", + "hint" : "" + } +} + + +-- !query +SELECT COUNT( + SELECT COUNT( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'COUNT'", + "hint" : "" + } +} + + +-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.a > MAX(myt1.a) + ) AND myt2.b > myt1.b +) +-- !query analysis +Filter cast(scalar-subquery#x [b#x && max(a)#x] as boolean) +: +- Aggregate [max(a#x) AS max(a)#x] +: +- Filter ((a#x = scalar-subquery#x [max(a)#x]) AND (b#x > outer(b#x))) +: : +- Aggregate [max(a#x) AS max(a)#x] +: : +- Filter (a#x > outer(max(a)#x)) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- Aggregate [b#x], [b#x, max(a#x) AS max(a)#x] + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.a = MAX(myt1.a) + ) AND myt2.b = myt1.b +) +-- !query analysis +Filter cast(scalar-subquery#x [b#x && max(a)#x] as boolean) +: +- Aggregate [max(a#x) AS max(a)#x] +: +- Filter ((a#x = scalar-subquery#x [max(a)#x]) AND (b#x = outer(b#x))) +: : +- Aggregate [max(a#x) AS max(a)#x] +: : +- Filter (a#x = outer(max(a)#x)) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- Aggregate [b#x], [b#x, max(a#x) AS max(a)#x] + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.a > MAX(myt1.a) + ) AND myt2.b > myt1.b +) +-- !query analysis +Filter cast(scalar-subquery#x [b#x && max(a)#x] as boolean) +: +- Aggregate [count(a#x) AS count(a)#xL] +: +- Filter ((cast(a#x as bigint) = scalar-subquery#x [max(a)#x]) AND (b#x > outer(b#x))) +: : +- Aggregate [count(a#x) AS count(a)#xL] +: : +- Filter (a#x > outer(max(a)#x)) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- Aggregate [b#x], [b#x, max(a#x) AS max(a)#x] + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + 
+-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.a = MAX(myt1.a) + ) AND myt2.b = myt1.b +) +-- !query analysis +Filter cast(scalar-subquery#x [b#x && max(a)#x] as boolean) +: +- Aggregate [count(a#x) AS count(a)#xL] +: +- Filter ((cast(a#x as bigint) = scalar-subquery#x [max(a)#x]) AND (b#x = outer(b#x))) +: : +- Aggregate [count(a#x) AS count(a)#xL] +: : +- Filter (a#x = outer(max(a)#x)) +: : +- SubqueryAlias spark_catalog.default.myt3 +: : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet +: +- SubqueryAlias spark_catalog.default.myt2 +: +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet ++- Aggregate [b#x], [b#x, max(a#x) AS max(a)#x] + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT myt1.a +FROM myt1 +WHERE EXISTS ( + SELECT 1 + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query analysis +Project [a#x] ++- Filter exists#x [b#x && c#x] + : +- Project [1 AS 1#x] + : +- Filter ((a#x = scalar-subquery#x [b#x && c#x]) AND (b#x > outer(b#x))) + : : +- Aggregate [max(a#x) AS max(a)#x] + : : +- Filter ((b#x > outer(b#x)) AND (c#x > outer(c#x))) + : : +- SubqueryAlias spark_catalog.default.myt3 + : : +- Relation spark_catalog.default.myt3[a#x,b#x,c#x] parquet + : +- SubqueryAlias spark_catalog.default.myt2 + : +- Relation spark_catalog.default.myt2[a#x,b#x,c#x] parquet + +- SubqueryAlias spark_catalog.default.myt1 + +- Relation spark_catalog.default.myt1[a#x,b#x,c#x] parquet + + +-- !query +SELECT myt1.a +FROM myt1 +WHERE myt1.b = ( + SELECT myt2.b + FROM myt2 + WHERE EXISTS ( + SELECT 1 + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", + "messageParameters" : { + "expression" : "spark_catalog.default.myt1.c" + } +} + + +-- !query +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT c0)) = 1 +-- !query analysis +Project [1 AS 1#x] ++- Filter (scalar-subquery#x [c0#x] = 1) + : +- Project [scalar-subquery#x [c0#x] AS scalarsubquery(c0)#x] + : : +- Project [outer(c0#x)] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias t0 + +- Project [1#x AS c0#x] + +- Project [1 AS 1#x] + +- OneRowRelation + + +-- !query +DROP TABLE IF EXISTS table_integers +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.table_integers + + +-- !query +CREATE TABLE table_integers(i INTEGER) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`table_integers`, false + + +-- !query +INSERT INTO table_integers VALUES (1), (2), (3), (NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/table_integers, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/table_integers], Append, `spark_catalog`.`default`.`table_integers`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/table_integers), [i] ++- Project [cast(col1#x as int) AS i#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT i, (SELECT (SELECT 42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS 
FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x] AS j#x] + : +- Project [((scalar-subquery#x [i#x] + 42) + outer(i#x)) AS ((scalarsubquery(i) + 42) + outer(i1.i))#x] + : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT 42+i1.i)++i1.i)+42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x && i#x && i#x] AS j#x] + : +- Project [((scalar-subquery#x [i#x && i#x && i#x] + 42) + outer(i#x)) AS ((scalarsubquery(i, i, i) + 42) + outer(i1.i))#x] + : : +- Project [((scalar-subquery#x [i#x && i#x] + 42) + outer(i#x)) AS ((scalarsubquery(i, i) + 42) + outer(i1.i))#x] + : : : +- Project [(scalar-subquery#x [i#x] + positive(outer(i#x))) AS (scalarsubquery(i) + (+ outer(i1.i)))#x] + : : : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : : : +- OneRowRelation + : : : +- OneRowRelation + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i)))) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS j#x] + : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i)#x] + : : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i)#x] + : : : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i)#x] + : : : : +- Project [((((outer(i#x) + outer(i#x)) + outer(i#x)) + outer(i#x)) + outer(i#x)) AS ((((outer(i1.i) + outer(i1.i)) + outer(i1.i)) + outer(i1.i)) + outer(i1.i))#x] + : : : : +- OneRowRelation + : : : +- OneRowRelation + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i+i2.i) FROM table_integers i2 WHERE i2.i=i1.i))) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [scalar-subquery#x [i#x] AS scalarsubquery(i)#x] + : : +- Project [scalar-subquery#x [i#x] AS scalarsubquery(i)#x] + : : : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i, i)#x] + : : : : +- Project [(((((outer(i#x) + outer(i#x)) + outer(i#x)) + outer(i#x)) + outer(i#x)) + outer(i#x)) AS (((((outer(i1.i) + outer(i1.i)) + outer(i1.i)) + outer(i1.i)) + outer(i1.i)) + outer(i2.i))#x] + : : : : +- OneRowRelation + : : : +- Filter (i#x = outer(i#x)) + : : : +- SubqueryAlias i2 + : : : +- SubqueryAlias spark_catalog.default.table_integers + : : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM 
table_integers WHERE i=i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias s1 + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s2 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i<>i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias s1 + : : +- Project [i#x] + : : +- Filter NOT (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s2 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter (i#x = outer(i#x)) + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT * FROM (SELECT (SELECT 42+i1.i)) s1) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [scalarsubquery(i)#x] + : +- SubqueryAlias s1 + : +- Project [scalar-subquery#x [i#x] AS scalarsubquery(i)#x] + : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1, (SELECT (SELECT 42+i1.i) AS k) s2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x] AS j#x] + : +- Project [(k#x + k#x) AS (k + k)#x] + : +- Join Inner + : :- SubqueryAlias s1 + : : +- Project [scalar-subquery#x [i#x] AS k#x] + : : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : : +- OneRowRelation + : : +- OneRowRelation + : +- SubqueryAlias s2 + : +- Project [scalar-subquery#x [i#x] AS k#x] + : : +- Project [(42 + outer(i#x)) AS (42 + 
outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1 LEFT OUTER JOIN (SELECT (SELECT 42+i1.i) AS k) s2 ON s1.k=s2.k) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x] AS j#x] + : +- Project [(k#x + k#x) AS (k + k)#x] + : +- Join LeftOuter, (k#x = k#x) + : :- SubqueryAlias s1 + : : +- Project [scalar-subquery#x [i#x] AS k#x] + : : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : : +- OneRowRelation + : : +- OneRowRelation + : +- SubqueryAlias s2 + : +- Project [scalar-subquery#x [i#x] AS k#x] + : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT i1.i IN (1, 2, 3, 4, 5, 6, 7, 8)) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [outer(i#x) IN (1,2,3,4,5,6,7,8) AS (outer(i1.i) IN (1, 2, 3, 4, 5, 6, 7, 8))#x] + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/to_from_avro.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/to_from_avro.sql.out index 8275e4f1c0ff0..c9bad43cabd02 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/to_from_avro.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/to_from_avro.sql.out @@ -1,4 +1,11 @@ -- Automatically generated by SQLQueryTestSuite +-- !query +drop table if exists t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + -- !query create table t as select named_struct('u', named_struct('member0', member0, 'member1', member1)) as s diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out index ec5fb2058447e..eafdf4c2ae70d 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/postgreSQL/udf-aggregates_part1.sql.out @@ -496,17 +496,9 @@ from tenk1 o -- !query analysis org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`o`.`unique1`", - "proposal" : "`i`.`unique1`, `i`.`unique2`, `i`.`even`, `i`.`four`, `i`.`odd`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 75, - "stopIndex" : 83, - "fragment" : "o.unique1" - } ] + "expression" : "o.unique1" + } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/combined-subquery.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/combined-subquery.sql new file 
mode 100644 index 0000000000000..450b3421eb037 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/combined-subquery.sql @@ -0,0 +1,15 @@ +--ONLY_IF spark +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true; + +DROP TABLE IF EXISTS tbl; +CREATE TABLE tbl(a TINYINT, b SMALLINT, c INTEGER, d BIGINT, e VARCHAR(1), f DATE, g TIMESTAMP); + +INSERT INTO tbl VALUES (1, 2, 3, 4, '5', DATE '1992-01-01', TIMESTAMP '1992-01-01 00:00:00'); + +SELECT t1.c+(SELECT t1.b FROM tbl t2 WHERE EXISTS(SELECT t1.b+t2.a)) FROM tbl t1; + +-- non deterministic due to the timestamp type, the query itself is supported. +SELECT 1 FROM tbl t1 JOIN tbl t2 ON (t1.d=t2.d) WHERE EXISTS(SELECT t1.c FROM tbl t3 WHERE t1.d+t3.c<100 AND EXISTS(SELECT 1 FROM tbl t4 WHERE t2.f < DATE '2000-01-01')); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/exists-subquery.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/exists-subquery.sql new file mode 100644 index 0000000000000..163f3823bf831 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/exists-subquery.sql @@ -0,0 +1,100 @@ +--ONLY_IF spark +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true; + +DROP TABLE IF EXISTS table_integers; +CREATE TABLE table_integers(i INTEGER); +INSERT INTO table_integers VALUES (1), (2), (3), (NULL); + +SELECT + i, + ( + SELECT SUM(ss1.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i > t2.i + ) + ) ss1 + ) AS j +FROM table_integers i1 +ORDER BY i; + +SELECT + i, + ( + SELECT SUM(ss2.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE s1.i = i1.i + AND EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE t2.i = s1.i + ) + ) ss2 + ) AS j +FROM table_integers i1 +ORDER BY i; + +SELECT + i, + ( + SELECT SUM(ss1.i) + SUM(ss2.i) + FROM ( + -- First derived table: values greater than at least one other + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i > t2.i + ) + ) ss1 + LEFT OUTER JOIN ( + -- Second derived table: values equal to at least one other + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i = t2.i + ) + ) ss2 + ON ss1.i = ss2.i + ) AS j +FROM table_integers i1 +ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE CASE WHEN (i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) THEN true ELSE false END) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE (SELECT i FROM 
table_integers WHERE i=s1.i) = 1) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i OR i=ANY(SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE CASE WHEN (i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) THEN true ELSE false END) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss1.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i>i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i; + +DROP TABLE IF EXISTS tbl_ProductSales; +DROP TABLE IF EXISTS another_T; +CREATE TABLE tbl_ProductSales (ColID int, Product_Category varchar(64), Product_Name varchar(64), TotalSales int); +CREATE TABLE another_T (col1 INT, col2 INT, col3 INT, col4 INT, col5 INT, col6 INT, col7 INT, col8 INT); +INSERT INTO tbl_ProductSales VALUES (1,'Game','Mobo Game',200),(2,'Game','PKO Game',400),(3,'Fashion','Shirt',500),(4,'Fashion','Shorts',100); +INSERT INTO another_T VALUES (1,2,3,4,5,6,7,8), (11,22,33,44,55,66,77,88), (111,222,333,444,555,666,777,888), (1111,2222,3333,4444,5555,6666,7777,8888); + +SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON EXISTS (SELECT MAX(t1.col1 + t3.col4) AS mymax FROM another_T t3 HAVING t1.col7 <> mymax)) FROM another_T t1; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/lateral-subquery.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/lateral-subquery.sql new file mode 100644 index 0000000000000..53a0b13e23af7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/lateral-subquery.sql @@ -0,0 +1,26 @@ +--ONLY_IF spark +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(ps_supplycost INT, n_name INT); + +-- These two queries will fail analysis because +-- currently we don't support nested correlations in lateral subqueries. 
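+-- In both queries the offending reference is ref_2.n_name inside the innermost
+-- LATERAL subquery: ref_2 is defined in the outermost FROM clause, so the reference
+-- crosses more than one query level, while ref_5.ps_supplycost is an ordinary
+-- single-level lateral reference.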
+SELECT NULL +FROM + t AS ref_2, + LATERAL (SELECT (SELECT NULL + FROM (SELECT * FROM t AS ref_5, + LATERAL (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1) AS alias1) AS alias2) AS alias3) AS alias4; +SELECT * +FROM + t AS ref_2, + LATERAL (SELECT (SELECT NULL + FROM (SELECT * FROM t AS ref_5, + LATERAL (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1) AS alias1) AS alias2) AS alias3) AS alias4; + diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/scalar-subquery.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/scalar-subquery.sql new file mode 100644 index 0000000000000..0000a5e35d2cd --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/scalar-subquery.sql @@ -0,0 +1,33 @@ +--ONLY_IF spark +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true; + +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT c0)) = 1; + +DROP TABLE IF EXISTS table_integers; +CREATE TABLE table_integers(i INTEGER); +INSERT INTO table_integers VALUES (1), (2), (3), (NULL); + +SELECT i, (SELECT (SELECT 42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT (SELECT (SELECT (SELECT 42+i1.i)++i1.i)+42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i)))) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i+i2.i) FROM table_integers i2 WHERE i2.i=i1.i))) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i=i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i<>i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT * FROM (SELECT (SELECT 42+i1.i)) s1) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1, (SELECT (SELECT 42+i1.i) AS k) s2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1 LEFT OUTER JOIN (SELECT (SELECT 42+i1.i) AS k) s2 ON s1.k=s2.k) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT i1.i IN (1, 2, 3, 4, 5, 6, 7, 8)) AS j FROM table_integers i1 ORDER BY i; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/subquery-not-supported.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/subquery-not-supported.sql new file mode 100644 index 0000000000000..790405a088427 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation-analyzer-only/subquery-not-supported.sql @@ -0,0 +1,82 @@ +--ONLY_IF spark +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true; +set 
spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true; + +-- Spark SQL does not support correlations in the ORDER BY clause. +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT 1 ORDER BY c0)) = 1; + +-- Spark SQL does not support correlations in the LIMIT/OFFSET clause. +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT 1 LIMIT c0)) = 1; + +DROP TABLE IF EXISTS t; +CREATE TABLE t(ps_supplycost INT, n_name INT); + +-- Spark SQL does not support correlated subqueries in the FROM clause without +-- an explicit LATERAL keyword. +SELECT NULL +FROM + t AS ref_2, + (SELECT (SELECT NULL + FROM (FROM t AS ref_5, + (SELECT ref_2.n_name AS c1)))); + +-- Spark SQL does not support correlated subqueries in the FROM clause without +-- an explicit LATERAL keyword. +SELECT NULL +FROM + t AS ref_2, + (SELECT (SELECT NULL + FROM (FROM t AS ref_5, + (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1)))); + +DROP TABLE IF EXISTS table_integers; +CREATE TABLE table_integers(i INTEGER); +INSERT INTO table_integers VALUES (1), (2), (3), (NULL); + +-- Spark SQL only allows Project/Join/Filter to contain outer references. +-- Any subqueries containing outer references with aggregate expressions must +-- be in the HAVING clause. +SELECT i, (SELECT (SELECT i1.i+SUM(i2.i)) FROM table_integers i2) AS j FROM table_integers i1 ORDER BY i; +SELECT i, (SELECT ((SELECT ((SELECT ((SELECT SUM(i)+SUM(i4.i)+SUM(i3.i)+SUM(i2.i)+SUM(i1.i) FROM table_integers i5)) FROM table_integers i4)) FROM table_integers i3)) FROM table_integers i2) AS j FROM table_integers i1 GROUP BY i ORDER BY i; +SELECT (SELECT (SELECT SUM(i1.i)+SUM(i2.i)+SUM(i3.i) FROM table_integers i3) FROM table_integers i2) FROM table_integers i1 ORDER BY 1; +SELECT i, SUM(i), (SELECT (SELECT SUM(i)+SUM(i1.i)+SUM(i2.i) FROM table_integers) FROM table_integers i2) FROM table_integers i1 GROUP BY i ORDER BY i; + +-- A ScalarSubquery cannot be used in the groupBy/aggregate expressions. +SELECT i, (SELECT SUM(i)+(SELECT 42+i1.i) FROM table_integers) AS j FROM table_integers i1 ORDER BY i; + +-- Correlated subqueries are not allowed in the join condition. +SELECT i, (SELECT SUM(s1.i) FROM table_integers s1 INNER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)) AS j FROM table_integers i1 ORDER BY i; +SELECT i, (SELECT SUM(s1.i) FROM table_integers s1 LEFT OUTER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)) AS j FROM table_integers i1 ORDER BY i; + +-- Spark SQL does not allow mixing outer references and local references in the same aggregate expression. +SELECT (SELECT (SELECT COVAR_POP(i2.i, i3.i) FROM table_integers i3) FROM table_integers i2 ORDER BY i NULLS LAST LIMIT 1) FROM table_integers i1 ORDER BY 1; +SELECT (SELECT (SELECT COVAR_POP(i1.i, i3.i) FROM table_integers i3) FROM table_integers i2 LIMIT 1) FROM table_integers i1 ORDER BY 1; + +-- Spark SQL does not allow correlations in the right child of a left outer join. 
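+-- For contrast, correlating only the left child of a left outer join is exercised as a
+-- supported case in nestedcorrelation/scalar-subquery.sql, for example:
+--   SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i=i1.i) s1
+--              LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j
+--   FROM table_integers i1 ORDER BY i;
+-- The queries below correlate the right child of the join and are therefore rejected.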
+SELECT i, (SELECT SUM(ss1.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i; +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i; +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i; +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i>i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i<>i1.i OR EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i; +SELECT i, (SELECT SUM(s2.i) FROM table_integers s1 LEFT OUTER JOIN (SELECT i FROM table_integers WHERE i=i1.i) s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i; +SELECT i, (SELECT SUM(s2.i) FROM table_integers s1 LEFT OUTER JOIN (SELECT i FROM table_integers WHERE i<>i1.i) s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i; + +DROP TABLE IF EXISTS tbl_ProductSales; +DROP TABLE IF EXISTS another_T; +CREATE TABLE tbl_ProductSales (ColID int, Product_Category varchar(64), Product_Name varchar(64), TotalSales int); +CREATE TABLE another_T (col1 INT, col2 INT, col3 INT, col4 INT, col5 INT, col6 INT, col7 INT, col8 INT); +INSERT INTO tbl_ProductSales VALUES (1,'Game','Mobo Game',200),(2,'Game','PKO Game',400),(3,'Fashion','Shirt',500),(4,'Fashion','Shorts',100); +INSERT INTO another_T VALUES (1,2,3,4,5,6,7,8), (11,22,33,44,55,66,77,88), (111,222,333,444,555,666,777,888), (1111,2222,3333,4444,5555,6666,7777,8888); + +-- Spark sql does not allow mixing outer references and local references in one aggregates. +SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t1.col7 <> (SELECT MAX(t1.col1 + t3.col4) FROM another_T t3)) FROM another_T t1; + +-- Spark SQL only allow Project/Join/Filter to contain outer references. +-- Any subqueries containing outer references with aggregate expressions must +-- be on the having clause. 
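+-- The HAVING-clause form of such queries is expected to analyze successfully; see
+-- queries 13-16 in nestedcorrelation/scalar-subquery.sql, for example:
+--   SELECT b, MAX(myt1.a) FROM myt1 GROUP BY b
+--   HAVING (SELECT MAX(myt2.a) FROM myt2
+--           WHERE myt2.a = (SELECT MAX(myt3.a) FROM myt3 WHERE myt3.a > MAX(myt1.a))
+--             AND myt2.b > myt1.b);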
+SELECT CASE WHEN 1 IN (SELECT (SELECT MAX(col7))) THEN 2 ELSE NULL END FROM another_T t1; +SELECT CASE WHEN 1 IN (SELECT (SELECT MAX(col7)) UNION ALL (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t2.col5 = t2.col1)) THEN 2 ELSE NULL END FROM another_T t1; +SELECT CASE WHEN 1 IN (SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t2.col5 = t2.col1) UNION ALL (SELECT MAX(col7))) THEN 2 ELSE NULL END FROM another_T t1; +SELECT CASE WHEN NOT col1 NOT IN (SELECT (SELECT MAX(col7)) UNION (SELECT MIN(ColID) FROM tbl_ProductSales LEFT JOIN another_T t2 ON t2.col5 = t1.col1)) THEN 1 ELSE 2 END FROM another_T t1 GROUP BY col1 ORDER BY 1; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation/scalar-subquery.sql b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation/scalar-subquery.sql new file mode 100644 index 0000000000000..ff3a217b5d92e --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/subquery/nestedcorrelation/scalar-subquery.sql @@ -0,0 +1,286 @@ +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true; +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true; + +-- One test case for each following type: +-- --------------------------------------------------------------- +-- | SubqueryPosition | VulnerableToCountBug | TriggerDomainJoin| +-- --------------------------------------------------------------- +-- 1 | Filter | False | True | +-- --------------------------------------------------------------- +-- 2 | Filter | False | False | +-- --------------------------------------------------------------- +-- 3 | Filter | True | True | +-- --------------------------------------------------------------- +-- 4 | Filter | True | False | +-- --------------------------------------------------------------- +-- 5 | Project | False | True | +-- --------------------------------------------------------------- +-- 6 | Project | False | False | +-- --------------------------------------------------------------- +-- 7 | Project | True | True | +-- --------------------------------------------------------------- +-- 8 | Project | True | False | +-- --------------------------------------------------------------- +-- 9 | Aggregate | False | True | +-- --------------------------------------------------------------- +-- 10| Aggregate | False | False | +-- --------------------------------------------------------------- +-- 11| Aggregate | True | True | +-- --------------------------------------------------------------- +-- 12| Aggregate | True | False | +-- --------------------------------------------------------------- +-- 13| Filter(havingClause) | False | True | +-- --------------------------------------------------------------- +-- 14| Filter(havingClause) | False | False | +-- --------------------------------------------------------------- +-- 15| Filter(havingClause) | True | True | +-- --------------------------------------------------------------- +-- 16| Filter(havingClause) | True | False | +-- --------------------------------------------------------------- + +DROP TABLE IF EXISTS myt1; +DROP TABLE IF EXISTS myt2; +DROP TABLE IF EXISTS myt3; +CREATE TABLE myt1(a INT, b INT, c INT); +CREATE TABLE myt2(a INT, b INT, c INT); +CREATE TABLE myt3(a INT, b INT, c INT); +INSERT INTO myt1 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL); +INSERT INTO myt2 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, 
NULL, NULL); +INSERT INTO myt3 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL); + +-- query 1 +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +); +-- query 2 +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) AND myt2.b = myt1.b +); +-- query 3 +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +); +-- query 4 +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) AND myt2.b = myt1.b +); +-- query 5 +SELECT myt1.a, ( + SELECT ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 +) +FROM myt1; +-- query 6 +SELECT myt1.a, ( + SELECT ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 +) +FROM myt1; +-- query 7 +SELECT myt1.a, ( + SELECT ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 +) +FROM myt1; +-- query 8 +SELECT myt1.a, ( + SELECT ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 +) +FROM myt1; +-- query 9 +SELECT MIN( + SELECT MAX( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 + ) +FROM myt1; +-- query 10 +SELECT MIN( + SELECT MAX( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 + ) +FROM myt1; +-- query 11 +SELECT COUNT( + SELECT COUNT( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 + ) +FROM myt1; +-- query 12 +SELECT COUNT( + SELECT COUNT( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 + ) +FROM myt1; +-- query 13 +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.a > MAX(myt1.a) + ) AND myt2.b > myt1.b +); +-- query 14 +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.a = MAX(myt1.a) + ) AND myt2.b = myt1.b +); +-- query 15 +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.a > MAX(myt1.a) + ) AND myt2.b > myt1.b +); +-- query 16 +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.a = MAX(myt1.a) + ) AND myt2.b = myt1.b +); + +-- test that queries containing both nested correlated scalar subqueries +-- and other types of subqueries will be blocked by the analyzer when +-- we only support nested correlated scalar subqueries. 
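+-- The first query below wraps a nested correlated scalar subquery inside an
+-- EXISTS predicate; the second wraps a correlated EXISTS inside a scalar
+-- subquery. Both mix subquery kinds, so they are expected to be rejected.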
+SELECT myt1.a +FROM myt1 +WHERE EXISTS ( + SELECT 1 + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +); + +SELECT myt1.a +FROM myt1 +WHERE myt1.b = ( + SELECT myt2.b + FROM myt2 + WHERE EXISTS ( + SELECT 1 + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +); + +-- testcases extracted from DUCKDB +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT c0)) = 1; + +DROP TABLE IF EXISTS table_integers; +CREATE TABLE table_integers(i INTEGER); +INSERT INTO table_integers VALUES (1), (2), (3), (NULL); + +SELECT i, (SELECT (SELECT 42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT (SELECT (SELECT (SELECT 42+i1.i)++i1.i)+42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i)))) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i+i2.i) FROM table_integers i2 WHERE i2.i=i1.i))) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i=i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i<>i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i) ss2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT * FROM (SELECT (SELECT 42+i1.i)) s1) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1, (SELECT (SELECT 42+i1.i) AS k) s2) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1 LEFT OUTER JOIN (SELECT (SELECT 42+i1.i) AS k) s2 ON s1.k=s2.k) AS j FROM table_integers i1 ORDER BY i; + +SELECT i, (SELECT i1.i IN (1, 2, 3, 4, 5, 6, 7, 8)) AS j FROM table_integers i1 ORDER BY i; \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/inputs/to_from_avro.sql b/sql/core/src/test/resources/sql-tests/inputs/to_from_avro.sql index 12541ff26e24e..8d6325906df5c 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/to_from_avro.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/to_from_avro.sql @@ -1,4 +1,5 @@ -- Create some temporary test data. 
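+-- Drop any existing copy of t before recreating it below.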
+drop table if exists t; create table t as select named_struct('u', named_struct('member0', member0, 'member1', member1)) as s from values (1, null), (null, 'a') tab(member0, member1); diff --git a/sql/core/src/test/resources/sql-tests/results/join-lateral.sql.out b/sql/core/src/test/resources/sql-tests/results/join-lateral.sql.out index 11bafb2cf63c9..f8074eb805b4f 100644 --- a/sql/core/src/test/resources/sql-tests/results/join-lateral.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/join-lateral.sql.out @@ -541,19 +541,11 @@ struct<> -- !query output org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`t1`.`c1`", - "proposal" : "`t2`.`c1`, `t2`.`c2`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 74, - "stopIndex" : 78, - "fragment" : "t1.c1" - } ] + "expression" : "spark_catalog.default.t1.c1" + } } diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out index 42cf942add486..ec25271e6b54e 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/aggregates_part1.sql.out @@ -509,17 +509,9 @@ struct<> -- !query output org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`o`.`unique1`", - "proposal" : "`i`.`unique1`, `i`.`unique2`, `i`.`even`, `i`.`four`, `i`.`odd`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 71, - "stopIndex" : 79, - "fragment" : "o.unique1" - } ] + "expression" : "o.unique1" + } } diff --git a/sql/core/src/test/resources/sql-tests/results/sql-udf.sql.out b/sql/core/src/test/resources/sql-tests/results/sql-udf.sql.out index 184489190d0e5..79d612d972ac0 100644 --- a/sql/core/src/test/resources/sql-tests/results/sql-udf.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/sql-udf.sql.out @@ -327,18 +327,11 @@ struct<> -- !query output org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`a`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 21, - "stopIndex" : 21, - "fragment" : "a" - } ] + "expression" : "foo2_2c.a" + } } @@ -349,18 +342,11 @@ struct<> -- !query output org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`a`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 37, - "stopIndex" : 37, - "fragment" : "a" - } ] + "expression" : "foo2_2d.a" + } } diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out 
b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out index 93c60fd49c582..77bcf7dd13d8e 100644 --- a/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/subquery/negative-cases/invalid-correlation.sql.out @@ -76,7 +76,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "messageParameters" : { "input" : "\"min(t2a)\", \"t2c\"", "missingAttributes" : "\"t2b\"", - "operator" : "!Filter t2c#x IN (list#x [t2b#x])" + "operator" : "Filter t2c#x IN (list#x [t2b#x])" }, "queryContext" : [ { "objectType" : "", diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/combined-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/combined-subquery.sql.out new file mode 100644 index 0000000000000..eb712556160d6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/combined-subquery.sql.out @@ -0,0 +1,66 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +DROP TABLE IF EXISTS tbl +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.tbl + + +-- !query +CREATE TABLE tbl(a TINYINT, b SMALLINT, c INTEGER, d BIGINT, e VARCHAR(1), f DATE, g TIMESTAMP) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`tbl`, false + + +-- !query +INSERT INTO tbl VALUES (1, 2, 3, 4, '5', DATE '1992-01-01', TIMESTAMP '1992-01-01 00:00:00') +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/tbl, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/tbl], Append, `spark_catalog`.`default`.`tbl`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/tbl), [a, b, c, d, e, f, g] ++- Project [cast(col1#x as tinyint) AS a#x, cast(col2#x as smallint) AS b#x, cast(col3#x as int) AS c#x, cast(col4#x as bigint) AS d#xL, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col5#x as string), 1)) AS e#x, cast(col6#x as date) AS f#x, cast(col7#x as timestamp) AS g#x] + +- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x, col6#x, col7#x] + + +-- !query +SELECT t1.c+(SELECT t1.b FROM tbl t2 WHERE EXISTS(SELECT t1.b+t2.a)) FROM tbl t1 +-- !query analysis +Project [(c#x + cast(scalar-subquery#x [b#x] as int)) AS (c + scalarsubquery(b))#x] +: +- Project [outer(b#x)] +: +- Filter exists#x [b#x && a#x] +: : +- Project [(outer(b#x) + cast(outer(a#x) as 
smallint)) AS (outer(t1.b) + outer(t2.a))#x] +: : +- OneRowRelation +: +- SubqueryAlias t2 +: +- SubqueryAlias spark_catalog.default.tbl +: +- Relation spark_catalog.default.tbl[a#x,b#x,c#x,d#xL,e#x,f#x,g#x] parquet ++- SubqueryAlias t1 + +- SubqueryAlias spark_catalog.default.tbl + +- Relation spark_catalog.default.tbl[a#x,b#x,c#x,d#xL,e#x,f#x,g#x] parquet + + +-- !query +SELECT 1 FROM tbl t1 JOIN tbl t2 ON (t1.d=t2.d) WHERE EXISTS(SELECT t1.c FROM tbl t3 WHERE t1.d+t3.c<100 AND EXISTS(SELECT 1 FROM tbl t4 WHERE t2.f < DATE '2000-01-01')) +-- !query analysis +[Analyzer test output redacted due to nondeterminism] diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/exists-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/exists-subquery.sql.out new file mode 100644 index 0000000000000..c47867443fde7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/exists-subquery.sql.out @@ -0,0 +1,430 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +DROP TABLE IF EXISTS table_integers +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.table_integers + + +-- !query +CREATE TABLE table_integers(i INTEGER) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`table_integers`, false + + +-- !query +INSERT INTO table_integers VALUES (1), (2), (3), (NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/table_integers, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/table_integers], Append, `spark_catalog`.`default`.`table_integers`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/table_integers), [i] ++- Project [cast(col1#x as int) AS i#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT + i, + ( + SELECT SUM(ss1.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i > t2.i + ) + ) ss1 + ) AS j +FROM table_integers i1 +ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss1 + : +- Project [i#x] + : +- Filter exists#x [i#x] + : : +- Project [1 AS 1#x] + : : +- Filter (outer(i#x) > i#x) + : : +- SubqueryAlias t2 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- 
SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT + i, + ( + SELECT SUM(ss2.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE s1.i = i1.i + AND EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE t2.i = s1.i + ) + ) ss2 + ) AS j +FROM table_integers i1 +ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [1 AS 1#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias t2 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT + i, + ( + SELECT SUM(ss1.i) + SUM(ss2.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i > t2.i + ) + ) ss1 + LEFT OUTER JOIN ( + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i = t2.i + ) + ) ss2 + ON ss1.i = ss2.i + ) AS j +FROM table_integers i1 +ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [(sum(i#x) + sum(i#x)) AS (sum(i) + sum(i))#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias ss1 + : : +- Project [i#x] + : : +- Filter exists#x [i#x] + : : : +- Project [1 AS 1#x] + : : : +- Filter (outer(i#x) > i#x) + : : : +- SubqueryAlias t2 + : : : +- SubqueryAlias spark_catalog.default.table_integers + : : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : : +- SubqueryAlias s1 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter exists#x [i#x] + : : +- Project [1 AS 1#x] + : : +- Filter (outer(i#x) = i#x) + : : +- SubqueryAlias t2 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE CASE WHEN (i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) THEN true ELSE false END) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter CASE WHEN ((i#x = outer(i#x)) AND exists#x [i#x]) THEN true ELSE false END + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- 
Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE (SELECT i FROM table_integers WHERE i=s1.i) = 1) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter (scalar-subquery#x [i#x] = 1) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i OR i=ANY(SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'('", + "hint" : "" + } +} + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE CASE WHEN (i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) THEN true ELSE 
false END) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter CASE WHEN ((i#x = outer(i#x)) AND exists#x [i#x]) THEN true ELSE false END + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss1.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss1 + : +- Project [i#x] + : +- Filter exists#x [i#x && i#x] + : : +- Project [i#x] + : : +- Filter (NOT (i#x = outer(i#x)) AND (outer(i#x) > i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i>i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [(sum(i#x) + sum(i#x)) AS (sum(i) + sum(i))#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias ss1 + : : +- Project [i#x] + : : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x && i#x]) + : : : +- Project [i#x] + : : : +- Filter (NOT (i#x = outer(i#x)) AND (outer(i#x) > i#x)) + : : : +- SubqueryAlias spark_catalog.default.table_integers + : : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : : +- 
SubqueryAlias s1 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter exists#x [i#x] + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +DROP TABLE IF EXISTS tbl_ProductSales +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.tbl_ProductSales + + +-- !query +DROP TABLE IF EXISTS another_T +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.another_T + + +-- !query +CREATE TABLE tbl_ProductSales (ColID int, Product_Category varchar(64), Product_Name varchar(64), TotalSales int) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`tbl_ProductSales`, false + + +-- !query +CREATE TABLE another_T (col1 INT, col2 INT, col3 INT, col4 INT, col5 INT, col6 INT, col7 INT, col8 INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`another_T`, false + + +-- !query +INSERT INTO tbl_ProductSales VALUES (1,'Game','Mobo Game',200),(2,'Game','PKO Game',400),(3,'Fashion','Shirt',500),(4,'Fashion','Shorts',100) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/tbl_productsales, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/tbl_productsales], Append, `spark_catalog`.`default`.`tbl_productsales`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/tbl_productsales), [ColID, Product_Category, Product_Name, TotalSales] ++- Project [cast(col1#x as int) AS ColID#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col2#x as string), 64)) AS Product_Category#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col3#x as string), 64)) AS Product_Name#x, cast(col4#x as int) AS TotalSales#x] + +- LocalRelation [col1#x, col2#x, col3#x, col4#x] + + +-- !query +INSERT INTO another_T VALUES (1,2,3,4,5,6,7,8), (11,22,33,44,55,66,77,88), (111,222,333,444,555,666,777,888), (1111,2222,3333,4444,5555,6666,7777,8888) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/another_t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/another_t], Append, `spark_catalog`.`default`.`another_t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/another_t), [col1, col2, col3, col4, col5, col6, col7, col8] ++- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x, col6#x, col7#x, col8#x] + + +-- !query +SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON EXISTS (SELECT MAX(t1.col1 + t3.col4) AS mymax FROM another_T t3 HAVING t1.col7 <> mymax)) FROM another_T t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES", + "sqlState" : "0A000", + "messageParameters" : { + 
"function" : "max((outer(t1.col1) + t3.col4))" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 91, + "stopIndex" : 112, + "fragment" : "MAX(t1.col1 + t3.col4)" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/lateral-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/lateral-subquery.sql.out new file mode 100644 index 0000000000000..9cedbe2e28847 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/lateral-subquery.sql.out @@ -0,0 +1,88 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(ps_supplycost INT, n_name INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT NULL +FROM + t AS ref_2, + LATERAL (SELECT (SELECT NULL + FROM (SELECT * FROM t AS ref_5, + LATERAL (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1) AS alias1) AS alias2) AS alias3) AS alias4 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_2`.`n_name`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 187, + "stopIndex" : 198, + "fragment" : "ref_2.n_name" + } ] +} + + +-- !query +SELECT * +FROM + t AS ref_2, + LATERAL (SELECT (SELECT NULL + FROM (SELECT * FROM t AS ref_5, + LATERAL (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1) AS alias1) AS alias2) AS alias3) AS alias4 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_2`.`n_name`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 184, + "stopIndex" : 195, + "fragment" : "ref_2.n_name" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/scalar-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/scalar-subquery.sql.out new file mode 100644 index 0000000000000..76805b07c6851 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/scalar-subquery.sql.out @@ -0,0 +1,259 @@ +-- Automatically generated by 
SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT c0)) = 1 +-- !query analysis +Project [1 AS 1#x] ++- Filter (scalar-subquery#x [c0#x] = 1) + : +- Project [scalar-subquery#x [c0#x] AS scalarsubquery(c0)#x] + : : +- Project [outer(c0#x)] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias t0 + +- Project [1#x AS c0#x] + +- Project [1 AS 1#x] + +- OneRowRelation + + +-- !query +DROP TABLE IF EXISTS table_integers +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.table_integers + + +-- !query +CREATE TABLE table_integers(i INTEGER) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`table_integers`, false + + +-- !query +INSERT INTO table_integers VALUES (1), (2), (3), (NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/table_integers, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/table_integers], Append, `spark_catalog`.`default`.`table_integers`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/table_integers), [i] ++- Project [cast(col1#x as int) AS i#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT i, (SELECT (SELECT 42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x] AS j#x] + : +- Project [((scalar-subquery#x [i#x] + 42) + outer(i#x)) AS ((scalarsubquery(i) + 42) + outer(i1.i))#x] + : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT 42+i1.i)++i1.i)+42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x && i#x && i#x] AS j#x] + : +- Project [((scalar-subquery#x [i#x && i#x && i#x] + 42) + outer(i#x)) AS ((scalarsubquery(i, i, i) + 42) + outer(i1.i))#x] + : : +- Project [((scalar-subquery#x [i#x && i#x] + 42) + outer(i#x)) AS ((scalarsubquery(i, i) + 42) + outer(i1.i))#x] + : : : +- Project [(scalar-subquery#x [i#x] + positive(outer(i#x))) AS (scalarsubquery(i) + (+ outer(i1.i)))#x] + : : : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : : : +- OneRowRelation + : : : +- OneRowRelation + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias 
spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i)))) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS j#x] + : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i)#x] + : : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i)#x] + : : : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i)#x] + : : : : +- Project [((((outer(i#x) + outer(i#x)) + outer(i#x)) + outer(i#x)) + outer(i#x)) AS ((((outer(i1.i) + outer(i1.i)) + outer(i1.i)) + outer(i1.i)) + outer(i1.i))#x] + : : : : +- OneRowRelation + : : : +- OneRowRelation + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i+i2.i) FROM table_integers i2 WHERE i2.i=i1.i))) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [scalar-subquery#x [i#x] AS scalarsubquery(i)#x] + : : +- Project [scalar-subquery#x [i#x] AS scalarsubquery(i)#x] + : : : +- Project [scalar-subquery#x [i#x && i#x && i#x && i#x && i#x && i#x] AS scalarsubquery(i, i, i, i, i, i)#x] + : : : : +- Project [(((((outer(i#x) + outer(i#x)) + outer(i#x)) + outer(i#x)) + outer(i#x)) + outer(i#x)) AS (((((outer(i1.i) + outer(i1.i)) + outer(i1.i)) + outer(i1.i)) + outer(i1.i)) + outer(i2.i))#x] + : : : : +- OneRowRelation + : : : +- Filter (i#x = outer(i#x)) + : : : +- SubqueryAlias i2 + : : : +- SubqueryAlias spark_catalog.default.table_integers + : : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i=i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias s1 + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s2 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i<>i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias s1 + : : +- Project [i#x] + : : +- Filter NOT (i#x = outer(i#x)) + : 
: +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s2 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter (i#x = outer(i#x)) + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT * FROM (SELECT (SELECT 42+i1.i)) s1) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [scalarsubquery(i)#x] + : +- SubqueryAlias s1 + : +- Project [scalar-subquery#x [i#x] AS scalarsubquery(i)#x] + : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1, (SELECT (SELECT 42+i1.i) AS k) s2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x] AS j#x] + : +- Project [(k#x + k#x) AS (k + k)#x] + : +- Join Inner + : :- SubqueryAlias s1 + : : +- Project [scalar-subquery#x [i#x] AS k#x] + : : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : : +- OneRowRelation + : : +- OneRowRelation + : +- SubqueryAlias s2 + : +- Project [scalar-subquery#x [i#x] AS k#x] + : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1 LEFT OUTER JOIN (SELECT (SELECT 42+i1.i) AS k) s2 ON s1.k=s2.k) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x && i#x] AS j#x] + : +- Project [(k#x + k#x) AS (k + k)#x] + : +- Join LeftOuter, (k#x = k#x) + : :- SubqueryAlias s1 + : : +- Project [scalar-subquery#x [i#x] AS k#x] + : : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : : +- OneRowRelation + : : +- OneRowRelation + : +- SubqueryAlias s2 + : +- Project [scalar-subquery#x [i#x] AS k#x] + : : +- Project [(42 + outer(i#x)) AS (42 + outer(i1.i))#x] + : : +- OneRowRelation + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT i1.i IN (1, 2, 3, 4, 5, 6, 7, 8)) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project 
[i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [outer(i#x) IN (1,2,3,4,5,6,7,8) AS (outer(i1.i) IN (1, 2, 3, 4, 5, 6, 7, 8))#x] + : +- OneRowRelation + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/subquery-not-supported.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/subquery-not-supported.sql.out new file mode 100644 index 0000000000000..c56057defa31c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation-analyzer-only/subquery-not-supported.sql.out @@ -0,0 +1,566 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT 1 ORDER BY c0)) = 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"c0 ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 57, + "stopIndex" : 67, + "fragment" : "ORDER BY c0" + } ] +} + + +-- !query +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT 1 LIMIT c0)) = 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_LIMIT_LIKE_EXPRESSION.IS_UNFOLDABLE", + "sqlState" : "42K0E", + "messageParameters" : { + "expr" : "\"outer(t0.c0)\"", + "name" : "limit" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 63, + "stopIndex" : 64, + "fragment" : "c0" + } ] +} + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(ps_supplycost INT, n_name INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT NULL +FROM + t AS ref_2, + (SELECT (SELECT NULL + FROM (FROM t AS ref_5, + (SELECT ref_2.n_name AS c1)))) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_2`.`n_name`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 113, + "stopIndex" : 124, + "fragment" : "ref_2.n_name" + } ] +} + + +-- !query +SELECT NULL +FROM + t AS ref_2, + (SELECT (SELECT NULL + FROM (FROM t AS ref_5, + 
(SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1)))) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_5`.`ps_supplycost`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 113, + "stopIndex" : 131, + "fragment" : "ref_5.ps_supplycost" + } ] +} + + +-- !query +DROP TABLE IF EXISTS table_integers +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.table_integers + + +-- !query +CREATE TABLE table_integers(i INTEGER) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`table_integers`, false + + +-- !query +INSERT INTO table_integers VALUES (1), (2), (3), (NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/table_integers, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/table_integers], Append, `spark_catalog`.`default`.`table_integers`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/table_integers), [i] ++- Project [cast(col1#x as int) AS i#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT i, (SELECT (SELECT i1.i+SUM(i2.i)) FROM table_integers i2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"(i + sum(i)) AS `(outer(i1.i) + sum(outer(i2.i)))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 20, + "stopIndex" : 40, + "fragment" : "SELECT i1.i+SUM(i2.i)" + } ] +} + + +-- !query +SELECT i, (SELECT ((SELECT ((SELECT ((SELECT SUM(i)+SUM(i4.i)+SUM(i3.i)+SUM(i2.i)+SUM(i1.i) FROM table_integers i5)) FROM table_integers i4)) FROM table_integers i3)) FROM table_integers i2) AS j FROM table_integers i1 GROUP BY i ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"((((sum(i) + sum(i)) + sum(i)) + sum(i)) + sum(i)) AS `((((sum(i) + sum(outer(i4.i))) + sum(outer(i3.i))) + sum(outer(i2.i))) + sum(outer(i1.i)))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 39, + "stopIndex" : 114, + "fragment" : "SELECT SUM(i)+SUM(i4.i)+SUM(i3.i)+SUM(i2.i)+SUM(i1.i) FROM table_integers i5" + } ] +} + + +-- !query +SELECT (SELECT (SELECT SUM(i1.i)+SUM(i2.i)+SUM(i3.i) FROM table_integers i3) FROM table_integers i2) FROM table_integers i1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"((sum(i) + sum(i)) + sum(i)) AS `((sum(outer(i1.i)) + sum(outer(i2.i))) + sum(i))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 17, + "stopIndex" : 75, + "fragment" : "SELECT SUM(i1.i)+SUM(i2.i)+SUM(i3.i) FROM table_integers i3" + } ] +} + + +-- !query +SELECT i, SUM(i), (SELECT (SELECT SUM(i)+SUM(i1.i)+SUM(i2.i) FROM table_integers) FROM table_integers i2) FROM table_integers i1 GROUP BY i ORDER 
BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"((sum(i) + sum(i)) + sum(i)) AS `((sum(i) + sum(outer(i1.i))) + sum(outer(i2.i)))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 28, + "stopIndex" : 80, + "fragment" : "SELECT SUM(i)+SUM(i1.i)+SUM(i2.i) FROM table_integers" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(i)+(SELECT 42+i1.i) FROM table_integers) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "SCALAR_SUBQUERY_IS_IN_GROUP_BY_OR_AGGREGATE_FUNCTION", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExpr" : "\"scalarsubquery(i)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 41, + "fragment" : "(SELECT 42+i1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM table_integers s1 INNER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_CORRELATED_SCALAR_SUBQUERY", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Join Inner, (scalar-subquery#x [i#x && i#x] = scalar-subquery#x [i#x && i#x])\n: :- Project [(outer(i#x) + outer(i#x)) AS (outer(i1.i) + outer(s1.i))#x]\n: : +- OneRowRelation\n: +- Project [(outer(i#x) + outer(i#x)) AS (outer(i1.i) + outer(s2.i))#x]\n: +- OneRowRelation\n:- SubqueryAlias s1\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s2\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 52, + "stopIndex" : 120, + "fragment" : "INNER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM table_integers s1 LEFT OUTER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)) AS j FROM table_integers i1 ORDER BY i + +SELECT (SELECT (SELECT COVAR_POP(i2.i, i3.i) FROM table_integers i3) FROM table_integers i2 ORDER BY i NULLS LAST LIMIT 1) FROM table_integers i1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'SELECT'", + "hint" : "" + } +} + + +-- !query +SELECT (SELECT (SELECT COVAR_POP(i1.i, i3.i) FROM table_integers i3) FROM table_integers i2 LIMIT 1) FROM table_integers i1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES", + "sqlState" : "0A000", + "messageParameters" : { + "function" : "covar_pop(CAST(outer(i1.i) AS DOUBLE), CAST(i3.i AS DOUBLE))" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 24, + "stopIndex" : 44, + "fragment" : "COVAR_POP(i1.i, i3.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss1.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN 
(SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter ((i#x = outer(i#x)) AND exists#x [i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 160, + "stopIndex" : 257, + "fragment" : "SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter ((i#x = outer(i#x)) AND exists#x [i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 160, + "stopIndex" : 257, + "fragment" : "SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter ((i#x = outer(i#x)) AND exists#x [i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 171, + "stopIndex" : 268, + "fragment" : "SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE 
i<>s1.i AND s1.i>i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i<>i1.i OR EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter (NOT (i#x = outer(i#x)) OR exists#x [i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 180, + "stopIndex" : 277, + "fragment" : "SELECT i FROM table_integers s1 WHERE i<>i1.i OR EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s2.i) FROM table_integers s1 LEFT OUTER JOIN (SELECT i FROM table_integers WHERE i=i1.i) s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter (i#x = outer(i#x))\n+- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 69, + "stopIndex" : 109, + "fragment" : "SELECT i FROM table_integers WHERE i=i1.i" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s2.i) FROM table_integers s1 LEFT OUTER JOIN (SELECT i FROM table_integers WHERE i<>i1.i) s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter NOT (i#x = outer(i#x))\n+- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 69, + "stopIndex" : 110, + "fragment" : "SELECT i FROM table_integers WHERE i<>i1.i" + } ] +} + + +-- !query +DROP TABLE IF EXISTS tbl_ProductSales +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.tbl_ProductSales + + +-- !query +DROP TABLE IF EXISTS another_T +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.another_T + + +-- !query +CREATE TABLE tbl_ProductSales (ColID int, Product_Category varchar(64), Product_Name varchar(64), TotalSales int) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`tbl_ProductSales`, false + + +-- !query +CREATE TABLE another_T (col1 INT, col2 INT, col3 INT, col4 INT, col5 INT, col6 INT, col7 INT, col8 INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`another_T`, false + + +-- !query +INSERT INTO tbl_ProductSales VALUES (1,'Game','Mobo Game',200),(2,'Game','PKO Game',400),(3,'Fashion','Shirt',500),(4,'Fashion','Shorts',100) +-- !query analysis 
+InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/tbl_productsales, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/tbl_productsales], Append, `spark_catalog`.`default`.`tbl_productsales`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/tbl_productsales), [ColID, Product_Category, Product_Name, TotalSales] ++- Project [cast(col1#x as int) AS ColID#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col2#x as string), 64)) AS Product_Category#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col3#x as string), 64)) AS Product_Name#x, cast(col4#x as int) AS TotalSales#x] + +- LocalRelation [col1#x, col2#x, col3#x, col4#x] + + +-- !query +INSERT INTO another_T VALUES (1,2,3,4,5,6,7,8), (11,22,33,44,55,66,77,88), (111,222,333,444,555,666,777,888), (1111,2222,3333,4444,5555,6666,7777,8888) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/another_t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/another_t], Append, `spark_catalog`.`default`.`another_t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/another_t), [col1, col2, col3, col4, col5, col6, col7, col8] ++- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x, col6#x, col7#x, col8#x] + + +-- !query +SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t1.col7 <> (SELECT MAX(t1.col1 + t3.col4) FROM another_T t3)) FROM another_T t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES", + "sqlState" : "0A000", + "messageParameters" : { + "function" : "max((outer(t1.col1) + t3.col4))" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 95, + "stopIndex" : 116, + "fragment" : "MAX(t1.col1 + t3.col4)" + } ] +} + + +-- !query +SELECT CASE WHEN 1 IN (SELECT (SELECT MAX(col7))) THEN 2 ELSE NULL END FROM another_T t1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 32, + "stopIndex" : 47, + "fragment" : "SELECT MAX(col7)" + } ] +} + + +-- !query +SELECT CASE WHEN 1 IN (SELECT (SELECT MAX(col7)) UNION ALL (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t2.col5 = t2.col1)) THEN 2 ELSE NULL END FROM another_T t1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 32, + "stopIndex" : 47, + "fragment" : "SELECT MAX(col7)" + } ] +} + + +-- !query +SELECT CASE WHEN 1 IN (SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t2.col5 = t2.col1) UNION ALL (SELECT MAX(col7))) THEN 2 ELSE NULL END FROM another_T t1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : 
"UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 129, + "stopIndex" : 144, + "fragment" : "SELECT MAX(col7)" + } ] +} + + +-- !query +SELECT CASE WHEN NOT col1 NOT IN (SELECT (SELECT MAX(col7)) UNION (SELECT MIN(ColID) FROM tbl_ProductSales LEFT JOIN another_T t2 ON t2.col5 = t1.col1)) THEN 1 ELSE 2 END FROM another_T t1 GROUP BY col1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 43, + "stopIndex" : 58, + "fragment" : "SELECT MAX(col7)" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/combined-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/combined-subquery.sql.out new file mode 100644 index 0000000000000..eb712556160d6 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/combined-subquery.sql.out @@ -0,0 +1,66 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +DROP TABLE IF EXISTS tbl +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.tbl + + +-- !query +CREATE TABLE tbl(a TINYINT, b SMALLINT, c INTEGER, d BIGINT, e VARCHAR(1), f DATE, g TIMESTAMP) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`tbl`, false + + +-- !query +INSERT INTO tbl VALUES (1, 2, 3, 4, '5', DATE '1992-01-01', TIMESTAMP '1992-01-01 00:00:00') +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/tbl, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/tbl], Append, `spark_catalog`.`default`.`tbl`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/tbl), [a, b, c, d, e, f, g] ++- Project [cast(col1#x as tinyint) AS a#x, cast(col2#x as smallint) AS b#x, cast(col3#x as int) AS c#x, cast(col4#x as bigint) AS d#xL, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col5#x as string), 1)) AS e#x, cast(col6#x as date) AS f#x, cast(col7#x as timestamp) AS g#x] + +- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x, col6#x, col7#x] + + +-- !query +SELECT t1.c+(SELECT 
t1.b FROM tbl t2 WHERE EXISTS(SELECT t1.b+t2.a)) FROM tbl t1 +-- !query analysis +Project [(c#x + cast(scalar-subquery#x [b#x] as int)) AS (c + scalarsubquery(b))#x] +: +- Project [outer(b#x)] +: +- Filter exists#x [b#x && a#x] +: : +- Project [(outer(b#x) + cast(outer(a#x) as smallint)) AS (outer(t1.b) + outer(t2.a))#x] +: : +- OneRowRelation +: +- SubqueryAlias t2 +: +- SubqueryAlias spark_catalog.default.tbl +: +- Relation spark_catalog.default.tbl[a#x,b#x,c#x,d#xL,e#x,f#x,g#x] parquet ++- SubqueryAlias t1 + +- SubqueryAlias spark_catalog.default.tbl + +- Relation spark_catalog.default.tbl[a#x,b#x,c#x,d#xL,e#x,f#x,g#x] parquet + + +-- !query +SELECT 1 FROM tbl t1 JOIN tbl t2 ON (t1.d=t2.d) WHERE EXISTS(SELECT t1.c FROM tbl t3 WHERE t1.d+t3.c<100 AND EXISTS(SELECT 1 FROM tbl t4 WHERE t2.f < DATE '2000-01-01')) +-- !query analysis +[Analyzer test output redacted due to nondeterminism] diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/exists-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/exists-subquery.sql.out new file mode 100644 index 0000000000000..c47867443fde7 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/exists-subquery.sql.out @@ -0,0 +1,430 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +DROP TABLE IF EXISTS table_integers +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.table_integers + + +-- !query +CREATE TABLE table_integers(i INTEGER) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`table_integers`, false + + +-- !query +INSERT INTO table_integers VALUES (1), (2), (3), (NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/table_integers, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/table_integers], Append, `spark_catalog`.`default`.`table_integers`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/table_integers), [i] ++- Project [cast(col1#x as int) AS i#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT + i, + ( + SELECT SUM(ss1.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i > t2.i + ) + ) ss1 + ) AS j +FROM table_integers i1 +ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss1 + : +- Project [i#x] + : +- Filter exists#x [i#x] + : : +- Project 
[1 AS 1#x] + : : +- Filter (outer(i#x) > i#x) + : : +- SubqueryAlias t2 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT + i, + ( + SELECT SUM(ss2.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE s1.i = i1.i + AND EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE t2.i = s1.i + ) + ) ss2 + ) AS j +FROM table_integers i1 +ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [1 AS 1#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias t2 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT + i, + ( + SELECT SUM(ss1.i) + SUM(ss2.i) + FROM ( + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i > t2.i + ) + ) ss1 + LEFT OUTER JOIN ( + SELECT s1.i + FROM table_integers s1 + WHERE EXISTS ( + SELECT 1 + FROM table_integers t2 + WHERE s1.i = t2.i + ) + ) ss2 + ON ss1.i = ss2.i + ) AS j +FROM table_integers i1 +ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [(sum(i#x) + sum(i#x)) AS (sum(i) + sum(i))#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias ss1 + : : +- Project [i#x] + : : +- Filter exists#x [i#x] + : : : +- Project [1 AS 1#x] + : : : +- Filter (outer(i#x) > i#x) + : : : +- SubqueryAlias t2 + : : : +- SubqueryAlias spark_catalog.default.table_integers + : : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : : +- SubqueryAlias s1 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter exists#x [i#x] + : : +- Project [1 AS 1#x] + : : +- Filter (outer(i#x) = i#x) + : : +- SubqueryAlias t2 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE CASE WHEN (i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) THEN true ELSE false END) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- 
Project [i#x] + : +- Filter CASE WHEN ((i#x = outer(i#x)) AND exists#x [i#x]) THEN true ELSE false END + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE (SELECT i FROM table_integers WHERE i=s1.i) = 1) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter (scalar-subquery#x [i#x] = 1) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS (SELECT i FROM table_integers WHERE i=s1.i)) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#x] + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i OR i=ANY(SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + 
"messageParameters" : { + "error" : "'('", + "hint" : "" + } +} + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE CASE WHEN (i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) THEN true ELSE false END) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter CASE WHEN ((i#x = outer(i#x)) AND exists#x [i#x]) THEN true ELSE false END + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x]) + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss1.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [] AS j#xL] + : +- Aggregate [sum(i#x) AS sum(i)#xL] + : +- SubqueryAlias ss1 + : +- Project [i#x] + : +- Filter exists#x [i#x && i#x] + : : +- Project [i#x] + : : +- Filter (NOT (i#x = outer(i#x)) AND (outer(i#x) > i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i>i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +Sort [i#x ASC NULLS FIRST], true ++- Project [i#x, scalar-subquery#x [i#x] AS j#xL] + : +- Aggregate [(sum(i#x) + sum(i#x)) AS (sum(i) + sum(i))#xL] + : +- Join LeftOuter, (i#x = i#x) + : :- SubqueryAlias ss1 + : : +- Project [i#x] + : : +- Filter ((i#x = outer(i#x)) AND exists#x [i#x && 
i#x]) + : : : +- Project [i#x] + : : : +- Filter (NOT (i#x = outer(i#x)) AND (outer(i#x) > i#x)) + : : : +- SubqueryAlias spark_catalog.default.table_integers + : : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : : +- SubqueryAlias s1 + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias ss2 + : +- Project [i#x] + : +- Filter exists#x [i#x] + : : +- Project [i#x] + : : +- Filter (i#x = outer(i#x)) + : : +- SubqueryAlias spark_catalog.default.table_integers + : : +- Relation spark_catalog.default.table_integers[i#x] parquet + : +- SubqueryAlias s1 + : +- SubqueryAlias spark_catalog.default.table_integers + : +- Relation spark_catalog.default.table_integers[i#x] parquet + +- SubqueryAlias i1 + +- SubqueryAlias spark_catalog.default.table_integers + +- Relation spark_catalog.default.table_integers[i#x] parquet + + +-- !query +DROP TABLE IF EXISTS tbl_ProductSales +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.tbl_ProductSales + + +-- !query +DROP TABLE IF EXISTS another_T +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.another_T + + +-- !query +CREATE TABLE tbl_ProductSales (ColID int, Product_Category varchar(64), Product_Name varchar(64), TotalSales int) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`tbl_ProductSales`, false + + +-- !query +CREATE TABLE another_T (col1 INT, col2 INT, col3 INT, col4 INT, col5 INT, col6 INT, col7 INT, col8 INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`another_T`, false + + +-- !query +INSERT INTO tbl_ProductSales VALUES (1,'Game','Mobo Game',200),(2,'Game','PKO Game',400),(3,'Fashion','Shirt',500),(4,'Fashion','Shorts',100) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/tbl_productsales, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/tbl_productsales], Append, `spark_catalog`.`default`.`tbl_productsales`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/tbl_productsales), [ColID, Product_Category, Product_Name, TotalSales] ++- Project [cast(col1#x as int) AS ColID#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col2#x as string), 64)) AS Product_Category#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col3#x as string), 64)) AS Product_Name#x, cast(col4#x as int) AS TotalSales#x] + +- LocalRelation [col1#x, col2#x, col3#x, col4#x] + + +-- !query +INSERT INTO another_T VALUES (1,2,3,4,5,6,7,8), (11,22,33,44,55,66,77,88), (111,222,333,444,555,666,777,888), (1111,2222,3333,4444,5555,6666,7777,8888) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/another_t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/another_t], Append, `spark_catalog`.`default`.`another_t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/another_t), [col1, col2, col3, col4, col5, col6, col7, col8] ++- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x, col6#x, col7#x, col8#x] + + +-- !query +SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON EXISTS (SELECT MAX(t1.col1 + t3.col4) AS mymax FROM another_T t3 HAVING t1.col7 <> mymax)) FROM 
another_T t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES", + "sqlState" : "0A000", + "messageParameters" : { + "function" : "max((outer(t1.col1) + t3.col4))" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 91, + "stopIndex" : 112, + "fragment" : "MAX(t1.col1 + t3.col4)" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/lateral-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/lateral-subquery.sql.out new file mode 100644 index 0000000000000..9cedbe2e28847 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/lateral-subquery.sql.out @@ -0,0 +1,88 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(ps_supplycost INT, n_name INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT NULL +FROM + t AS ref_2, + LATERAL (SELECT (SELECT NULL + FROM (SELECT * FROM t AS ref_5, + LATERAL (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1) AS alias1) AS alias2) AS alias3) AS alias4 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_2`.`n_name`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 187, + "stopIndex" : 198, + "fragment" : "ref_2.n_name" + } ] +} + + +-- !query +SELECT * +FROM + t AS ref_2, + LATERAL (SELECT (SELECT NULL + FROM (SELECT * FROM t AS ref_5, + LATERAL (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1) AS alias1) AS alias2) AS alias3) AS alias4 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_2`.`n_name`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 184, + "stopIndex" : 195, + "fragment" : "ref_2.n_name" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/scalar-subquery.sql.out b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/scalar-subquery.sql.out new file mode 100644 index 0000000000000..08cc38763e9bb --- /dev/null +++ 
b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/scalar-subquery.sql.out @@ -0,0 +1,624 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query schema +struct +-- !query output +spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled true + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query schema +struct +-- !query output +spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled true + + +-- !query +DROP TABLE IF EXISTS myt1 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS myt2 +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP TABLE IF EXISTS myt3 +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE myt1(a INT, b INT, c INT) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE myt2(a INT, b INT, c INT) +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE myt3(a INT, b INT, c INT) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO myt1 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO myt2 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO myt3 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query schema +struct +-- !query output + + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) AND myt2.b = myt1.b +) +-- !query schema +struct +-- !query output +0 0 0 +1 1 1 +2 2 2 +3 3 3 + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query schema +struct +-- !query output +0 0 0 + + +-- !query +SELECT * +FROM myt1 +WHERE myt1.a = ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) AND myt2.b = myt1.b +) +-- !query schema +struct +-- !query output +0 0 0 +1 1 1 + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "SCALAR_SUBQUERY_TOO_MANY_ROWS", + "sqlState" : "21000" +} + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query schema +struct +-- !query output +0 0 +1 1 +2 2 +3 3 +NULL NULL + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "SCALAR_SUBQUERY_TOO_MANY_ROWS", + "sqlState" : 
"21000" +} + + +-- !query +SELECT myt1.a, ( + SELECT ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 +) +FROM myt1 +-- !query schema +struct +-- !query output +0 1 +1 1 +2 1 +3 1 +NULL NULL + + +-- !query +SELECT MIN( + SELECT MAX( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'MAX'", + "hint" : "" + } +} + + +-- !query +SELECT MIN( + SELECT MAX( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'MAX'", + "hint" : "" + } +} + + +-- !query +SELECT COUNT( + SELECT COUNT( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'COUNT'", + "hint" : "" + } +} + + +-- !query +SELECT COUNT( + SELECT COUNT( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.b = myt2.b AND myt3.c = myt1.c + ) + FROM myt2 + ) +FROM myt1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'COUNT'", + "hint" : "" + } +} + + +-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.a > MAX(myt1.a) + ) AND myt2.b > myt1.b +) +-- !query schema +struct +-- !query output +0 0 +1 1 +2 2 + + +-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT MAX(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.a = MAX(myt1.a) + ) AND myt2.b = myt1.b +) +-- !query schema +struct +-- !query output +1 1 +2 2 +3 3 + + +-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.a > MAX(myt1.a) + ) AND myt2.b > myt1.b +) +-- !query schema +struct +-- !query output +0 0 +1 1 + + +-- !query +SELECT b, MAX(myt1.a) +FROM myt1 +GROUP BY b +HAVING ( + SELECT COUNT(myt2.a) + FROM myt2 + WHERE myt2.a = ( + SELECT COUNT(myt3.a) + FROM myt3 + WHERE myt3.a = MAX(myt1.a) + ) AND myt2.b = myt1.b +) +-- !query schema +struct +-- !query output +1 1 + + +-- !query +SELECT myt1.a +FROM myt1 +WHERE EXISTS ( + SELECT 1 + FROM myt2 + WHERE myt2.a = ( + SELECT MAX(myt3.a) + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkException +{ + "errorClass" : "INTERNAL_ERROR", + "sqlState" : "XX000", + "messageParameters" : { + "message" : "The Spark SQL phase optimization failed with an internal error. You hit a bug in Spark or the Spark plugins you use. Please, report this bug to the corresponding communities or vendors, and provide the full stack trace." 
+ } +} + + +-- !query +SELECT myt1.a +FROM myt1 +WHERE myt1.b = ( + SELECT myt2.b + FROM myt2 + WHERE EXISTS ( + SELECT 1 + FROM myt3 + WHERE myt3.b > myt2.b AND myt3.c > myt1.c + ) AND myt2.b > myt1.b +) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", + "messageParameters" : { + "expression" : "spark_catalog.default.myt1.c" + } +} + + +-- !query +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT c0)) = 1 +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +DROP TABLE IF EXISTS table_integers +-- !query schema +struct<> +-- !query output + + + +-- !query +CREATE TABLE table_integers(i INTEGER) +-- !query schema +struct<> +-- !query output + + + +-- !query +INSERT INTO table_integers VALUES (1), (2), (3), (NULL) +-- !query schema +struct<> +-- !query output + + + +-- !query +SELECT i, (SELECT (SELECT 42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 86 +2 88 +3 90 + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT 42+i1.i)++i1.i)+42+i1.i)+42+i1.i) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 130 +2 134 +3 138 + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i)))) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 5 +2 10 +3 15 + + +-- !query +SELECT i, (SELECT (SELECT (SELECT (SELECT i1.i+i1.i+i1.i+i1.i+i1.i+i2.i) FROM table_integers i2 WHERE i2.i=i1.i))) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct<> +-- !query output +org.apache.spark.SparkRuntimeException +{ + "errorClass" : "SCALAR_SUBQUERY_TOO_MANY_ROWS", + "sqlState" : "21000" +} + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i=i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 1 +2 2 +3 3 + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM (SELECT i FROM table_integers WHERE i<>i1.i) s1 LEFT OUTER JOIN table_integers s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 5 +2 4 +3 3 + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i) ss2) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 1 +2 2 +3 3 + + +-- !query +SELECT i, (SELECT * FROM (SELECT (SELECT 42+i1.i)) s1) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 43 +2 44 +3 45 + + +-- !query +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1, (SELECT (SELECT 42+i1.i) AS k) s2) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 86 +2 88 +3 90 + + +-- !query +SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1 LEFT OUTER JOIN (SELECT (SELECT 42+i1.i) AS k) s2 ON s1.k=s2.k) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 86 +2 88 +3 90 + + +-- !query +SELECT i, (SELECT i1.i IN (1, 2, 3, 4, 5, 6, 7, 8)) AS j FROM table_integers i1 ORDER BY i +-- !query schema +struct +-- !query output +NULL NULL +1 true +2 true +3 true diff --git a/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/subquery-not-supported.sql.out 
b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/subquery-not-supported.sql.out new file mode 100644 index 0000000000000..c56057defa31c --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/subquery/nestedcorrelation/subquery-not-supported.sql.out @@ -0,0 +1,566 @@ +-- Automatically generated by SQLQueryTestSuite +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForINSubqueries.enabled,Some(true)) + + +-- !query +set spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled=true +-- !query analysis +SetCommand (spark.sql.optimizer.supportNestedCorrelatedSubqueriesForEXISTSSubqueries.enabled,Some(true)) + + +-- !query +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT 1 ORDER BY c0)) = 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"c0 ASC NULLS FIRST\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 57, + "stopIndex" : 67, + "fragment" : "ORDER BY c0" + } ] +} + + +-- !query +SELECT 1 FROM (SELECT 1) t0(c0) WHERE (SELECT (SELECT 1 LIMIT c0)) = 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "INVALID_LIMIT_LIKE_EXPRESSION.IS_UNFOLDABLE", + "sqlState" : "42K0E", + "messageParameters" : { + "expr" : "\"outer(t0.c0)\"", + "name" : "limit" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 63, + "stopIndex" : 64, + "fragment" : "c0" + } ] +} + + +-- !query +DROP TABLE IF EXISTS t +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t + + +-- !query +CREATE TABLE t(ps_supplycost INT, n_name INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`t`, false + + +-- !query +SELECT NULL +FROM + t AS ref_2, + (SELECT (SELECT NULL + FROM (FROM t AS ref_5, + (SELECT ref_2.n_name AS c1)))) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_2`.`n_name`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 113, + "stopIndex" : 124, + "fragment" : "ref_2.n_name" + } ] +} + + +-- !query +SELECT NULL +FROM + t AS ref_2, + (SELECT (SELECT NULL + FROM (FROM t AS ref_5, + (SELECT ref_5.ps_supplycost AS c0, + ref_2.n_name AS c1)))) +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "sqlState" : "42703", + "messageParameters" : { + "objectName" : "`ref_5`.`ps_supplycost`" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 113, + "stopIndex" : 131, + "fragment" : "ref_5.ps_supplycost" + } ] +} + + +-- 
!query +DROP TABLE IF EXISTS table_integers +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.table_integers + + +-- !query +CREATE TABLE table_integers(i INTEGER) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`table_integers`, false + + +-- !query +INSERT INTO table_integers VALUES (1), (2), (3), (NULL) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/table_integers, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/table_integers], Append, `spark_catalog`.`default`.`table_integers`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/table_integers), [i] ++- Project [cast(col1#x as int) AS i#x] + +- LocalRelation [col1#x] + + +-- !query +SELECT i, (SELECT (SELECT i1.i+SUM(i2.i)) FROM table_integers i2) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"(i + sum(i)) AS `(outer(i1.i) + sum(outer(i2.i)))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 20, + "stopIndex" : 40, + "fragment" : "SELECT i1.i+SUM(i2.i)" + } ] +} + + +-- !query +SELECT i, (SELECT ((SELECT ((SELECT ((SELECT SUM(i)+SUM(i4.i)+SUM(i3.i)+SUM(i2.i)+SUM(i1.i) FROM table_integers i5)) FROM table_integers i4)) FROM table_integers i3)) FROM table_integers i2) AS j FROM table_integers i1 GROUP BY i ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"((((sum(i) + sum(i)) + sum(i)) + sum(i)) + sum(i)) AS `((((sum(i) + sum(outer(i4.i))) + sum(outer(i3.i))) + sum(outer(i2.i))) + sum(outer(i1.i)))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 39, + "stopIndex" : 114, + "fragment" : "SELECT SUM(i)+SUM(i4.i)+SUM(i3.i)+SUM(i2.i)+SUM(i1.i) FROM table_integers i5" + } ] +} + + +-- !query +SELECT (SELECT (SELECT SUM(i1.i)+SUM(i2.i)+SUM(i3.i) FROM table_integers i3) FROM table_integers i2) FROM table_integers i1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"((sum(i) + sum(i)) + sum(i)) AS `((sum(outer(i1.i)) + sum(outer(i2.i))) + sum(i))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 17, + "stopIndex" : 75, + "fragment" : "SELECT SUM(i1.i)+SUM(i2.i)+SUM(i3.i) FROM table_integers i3" + } ] +} + + +-- !query +SELECT i, SUM(i), (SELECT (SELECT SUM(i)+SUM(i1.i)+SUM(i2.i) FROM table_integers) FROM table_integers i2) FROM table_integers i1 GROUP BY i ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"((sum(i) + sum(i)) + sum(i)) AS `((sum(i) + sum(outer(i1.i))) + sum(outer(i2.i)))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 28, + "stopIndex" : 80, + "fragment" : "SELECT 
SUM(i)+SUM(i1.i)+SUM(i2.i) FROM table_integers" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(i)+(SELECT 42+i1.i) FROM table_integers) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "SCALAR_SUBQUERY_IS_IN_GROUP_BY_OR_AGGREGATE_FUNCTION", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExpr" : "\"scalarsubquery(i)\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 26, + "stopIndex" : 41, + "fragment" : "(SELECT 42+i1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM table_integers s1 INNER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.UNSUPPORTED_CORRELATED_SCALAR_SUBQUERY", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Join Inner, (scalar-subquery#x [i#x && i#x] = scalar-subquery#x [i#x && i#x])\n: :- Project [(outer(i#x) + outer(i#x)) AS (outer(i1.i) + outer(s1.i))#x]\n: : +- OneRowRelation\n: +- Project [(outer(i#x) + outer(i#x)) AS (outer(i1.i) + outer(s2.i))#x]\n: +- OneRowRelation\n:- SubqueryAlias s1\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s2\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 52, + "stopIndex" : 120, + "fragment" : "INNER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s1.i) FROM table_integers s1 LEFT OUTER JOIN table_integers s2 ON (SELECT i1.i+s1.i)=(SELECT i1.i+s2.i)) AS j FROM table_integers i1 ORDER BY i + +SELECT (SELECT (SELECT COVAR_POP(i2.i, i3.i) FROM table_integers i3) FROM table_integers i2 ORDER BY i NULLS LAST LIMIT 1) FROM table_integers i1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.catalyst.parser.ParseException +{ + "errorClass" : "PARSE_SYNTAX_ERROR", + "sqlState" : "42601", + "messageParameters" : { + "error" : "'SELECT'", + "hint" : "" + } +} + + +-- !query +SELECT (SELECT (SELECT COVAR_POP(i1.i, i3.i) FROM table_integers i3) FROM table_integers i2 LIMIT 1) FROM table_integers i1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES", + "sqlState" : "0A000", + "messageParameters" : { + "function" : "covar_pop(CAST(outer(i1.i) AS DOUBLE), CAST(i3.i AS DOUBLE))" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 24, + "stopIndex" : 44, + "fragment" : "COVAR_POP(i1.i, i3.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss1.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter ((i#x = outer(i#x)) AND exists#x 
[i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 160, + "stopIndex" : 257, + "fragment" : "SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter ((i#x = outer(i#x)) AND exists#x [i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 160, + "stopIndex" : 257, + "fragment" : "SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i > i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter ((i#x = outer(i#x)) AND exists#x [i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 171, + "stopIndex" : 268, + "fragment" : "SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(ss1.i)+SUM(ss2.i) FROM (SELECT i FROM table_integers s1 WHERE i=i1.i AND EXISTS(SELECT i FROM table_integers WHERE i<>s1.i AND s1.i>i)) ss1 LEFT OUTER JOIN (SELECT i FROM table_integers s1 WHERE i<>i1.i OR EXISTS(SELECT i FROM table_integers WHERE i=s1.i)) ss2 ON ss1.i=ss2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : 
"Filter (NOT (i#x = outer(i#x)) OR exists#x [i#x])\n: +- Project [i#x]\n: +- Filter (i#x = outer(i#x))\n: +- SubqueryAlias spark_catalog.default.table_integers\n: +- Relation spark_catalog.default.table_integers[i#x] parquet\n+- SubqueryAlias s1\n +- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 180, + "stopIndex" : 277, + "fragment" : "SELECT i FROM table_integers s1 WHERE i<>i1.i OR EXISTS(SELECT i FROM table_integers WHERE i=s1.i)" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s2.i) FROM table_integers s1 LEFT OUTER JOIN (SELECT i FROM table_integers WHERE i=i1.i) s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter (i#x = outer(i#x))\n+- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 69, + "stopIndex" : 109, + "fragment" : "SELECT i FROM table_integers WHERE i=i1.i" + } ] +} + + +-- !query +SELECT i, (SELECT SUM(s2.i) FROM table_integers s1 LEFT OUTER JOIN (SELECT i FROM table_integers WHERE i<>i1.i) s2 ON s1.i=s2.i) AS j FROM table_integers i1 ORDER BY i +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.ACCESSING_OUTER_QUERY_COLUMN_IS_NOT_ALLOWED", + "sqlState" : "0A000", + "messageParameters" : { + "treeNode" : "Filter NOT (i#x = outer(i#x))\n+- SubqueryAlias spark_catalog.default.table_integers\n +- Relation spark_catalog.default.table_integers[i#x] parquet\n" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 69, + "stopIndex" : 110, + "fragment" : "SELECT i FROM table_integers WHERE i<>i1.i" + } ] +} + + +-- !query +DROP TABLE IF EXISTS tbl_ProductSales +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.tbl_ProductSales + + +-- !query +DROP TABLE IF EXISTS another_T +-- !query analysis +DropTable true, false ++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.another_T + + +-- !query +CREATE TABLE tbl_ProductSales (ColID int, Product_Category varchar(64), Product_Name varchar(64), TotalSales int) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`tbl_ProductSales`, false + + +-- !query +CREATE TABLE another_T (col1 INT, col2 INT, col3 INT, col4 INT, col5 INT, col6 INT, col7 INT, col8 INT) +-- !query analysis +CreateDataSourceTableCommand `spark_catalog`.`default`.`another_T`, false + + +-- !query +INSERT INTO tbl_ProductSales VALUES (1,'Game','Mobo Game',200),(2,'Game','PKO Game',400),(3,'Fashion','Shirt',500),(4,'Fashion','Shorts',100) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/tbl_productsales, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/tbl_productsales], Append, `spark_catalog`.`default`.`tbl_productsales`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/tbl_productsales), [ColID, Product_Category, Product_Name, TotalSales] ++- Project 
[cast(col1#x as int) AS ColID#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col2#x as string), 64)) AS Product_Category#x, static_invoke(CharVarcharCodegenUtils.varcharTypeWriteSideCheck(cast(col3#x as string), 64)) AS Product_Name#x, cast(col4#x as int) AS TotalSales#x] + +- LocalRelation [col1#x, col2#x, col3#x, col4#x] + + +-- !query +INSERT INTO another_T VALUES (1,2,3,4,5,6,7,8), (11,22,33,44,55,66,77,88), (111,222,333,444,555,666,777,888), (1111,2222,3333,4444,5555,6666,7777,8888) +-- !query analysis +InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/another_t, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/another_t], Append, `spark_catalog`.`default`.`another_t`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/another_t), [col1, col2, col3, col4, col5, col6, col7, col8] ++- LocalRelation [col1#x, col2#x, col3#x, col4#x, col5#x, col6#x, col7#x, col8#x] + + +-- !query +SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t1.col7 <> (SELECT MAX(t1.col1 + t3.col4) FROM another_T t3)) FROM another_T t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.AGGREGATE_FUNCTION_MIXED_OUTER_LOCAL_REFERENCES", + "sqlState" : "0A000", + "messageParameters" : { + "function" : "max((outer(t1.col1) + t3.col4))" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 95, + "stopIndex" : 116, + "fragment" : "MAX(t1.col1 + t3.col4)" + } ] +} + + +-- !query +SELECT CASE WHEN 1 IN (SELECT (SELECT MAX(col7))) THEN 2 ELSE NULL END FROM another_T t1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 32, + "stopIndex" : 47, + "fragment" : "SELECT MAX(col7)" + } ] +} + + +-- !query +SELECT CASE WHEN 1 IN (SELECT (SELECT MAX(col7)) UNION ALL (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t2.col5 = t2.col1)) THEN 2 ELSE NULL END FROM another_T t1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 32, + "stopIndex" : 47, + "fragment" : "SELECT MAX(col7)" + } ] +} + + +-- !query +SELECT CASE WHEN 1 IN (SELECT (SELECT MIN(ColID) FROM tbl_ProductSales INNER JOIN another_T t2 ON t2.col5 = t2.col1) UNION ALL (SELECT MAX(col7))) THEN 2 ELSE NULL END FROM another_T t1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 129, + "stopIndex" : 144, + "fragment" : "SELECT MAX(col7)" + } ] +} + + +-- !query +SELECT CASE WHEN NOT col1 NOT IN (SELECT (SELECT MAX(col7)) UNION (SELECT MIN(ColID) FROM tbl_ProductSales LEFT JOIN another_T t2 ON 
t2.col5 = t1.col1)) THEN 1 ELSE 2 END FROM another_T t1 GROUP BY col1 ORDER BY 1 +-- !query analysis +org.apache.spark.sql.catalyst.ExtendedAnalysisException +{ + "errorClass" : "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.CORRELATED_REFERENCE", + "sqlState" : "0A000", + "messageParameters" : { + "sqlExprs" : "\"max(col7) AS `max(outer(t1.col7))`\"" + }, + "queryContext" : [ { + "objectType" : "", + "objectName" : "", + "startIndex" : 43, + "stopIndex" : 58, + "fragment" : "SELECT MAX(col7)" + } ] +} diff --git a/sql/core/src/test/resources/sql-tests/results/to_from_avro.sql.out b/sql/core/src/test/resources/sql-tests/results/to_from_avro.sql.out index a94175b1df39a..0c23f17c2920c 100644 --- a/sql/core/src/test/resources/sql-tests/results/to_from_avro.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/to_from_avro.sql.out @@ -1,4 +1,12 @@ -- Automatically generated by SQLQueryTestSuite +-- !query +drop table if exists t +-- !query schema +struct<> +-- !query output + + + -- !query create table t as select named_struct('u', named_struct('member0', member0, 'member1', member1)) as s diff --git a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out index 6e955c1e4f1ee..a425846772d2f 100644 --- a/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/udf/postgreSQL/udf-aggregates_part1.sql.out @@ -500,17 +500,9 @@ struct<> -- !query output org.apache.spark.sql.catalyst.ExtendedAnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITH_SUGGESTION", - "sqlState" : "42703", + "errorClass" : "NESTED_REFERENCES_IN_SUBQUERY_NOT_SUPPORTED", + "sqlState" : "0A000", "messageParameters" : { - "objectName" : "`o`.`unique1`", - "proposal" : "`i`.`unique1`, `i`.`unique2`, `i`.`even`, `i`.`four`, `i`.`odd`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 75, - "stopIndex" : 83, - "fragment" : "o.unique1" - } ] + "expression" : "o.unique1" + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala index 575a4ae69d1a9..3f8ece0f34962 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQueryTestSuite.scala @@ -471,6 +471,10 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper TimestampNTZTestCase(testCaseName, absPath, resultFile) :: Nil } else if (file.getAbsolutePath.startsWith(s"$inputFilePath${File.separator}cte.sql")) { CTETestCase(testCaseName, absPath, resultFile) :: Nil + } else if (file.getAbsolutePath.startsWith( + s"$inputFilePath${File.separator}subquery${File.separator}nestedcorrelation-analyzer-only" + )) { + AnalyzerTestCase(testCaseName, absPath, resultFile) :: Nil } else { RegularTestCase(testCaseName, absPath, resultFile) :: Nil } @@ -480,6 +484,8 @@ class SQLQueryTestSuite extends QueryTest with SharedSparkSession with SQLHelper case _: UDAFTestCase => // Skip creating analyzer test cases for UDAF tests as they are hard to update locally. 
Seq(test) + case _: AnalyzerTest => + Seq(test) case _ => Seq( test, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala index 576f93e94ec1e..760e076e24973 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala @@ -2846,4 +2846,162 @@ class SubquerySuite extends QueryTest :: Row(true) :: Row(true) :: Row(true) :: Nil ) } + + test("query without count bug without domain joins") { + sql("CREATE TEMP VIEW t0 AS SELECT 1 AS a, 2 AS b, 3 AS c") + sql("CREATE TEMP VIEW t1 AS SELECT 1 AS a, 2 AS b, 3 AS c") + sql("CREATE TEMP VIEW t2 AS SELECT 1 AS a, 2 AS b, 3 AS c") + sql("CREATE TEMP VIEW t3 AS SELECT 1 AS a, 2 AS b, 3 AS c") + val query = + """ + |SELECT * + |FROM t1 + |WHERE t1.a = ( + | SELECT MAX(t2.a) + | FROM t2 + | WHERE t2.a = ( + | SELECT MAX(t3.a) + | FROM t3 + | WHERE t3.b = t2.b AND t3.c = t1.c + | ) AND t2.b = t1.b + |) + |""".stripMargin + withSQLConf( + "spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled" -> "true", + "spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled" -> "true", + "spark.sql.planChangeLog.level" -> "info" + ) { + val df = sql(query).collect() + } + val querySuperNested = + """ + |SELECT * + |FROM t0 + |WHERE t0.a = ( + |SELECT t1.a + |FROM t1 + |WHERE t1.a = ( + | SELECT MAX(t2.a) + | FROM t2 + | WHERE t2.a = ( + | SELECT MAX(t3.a) + | FROM t3 + | WHERE t3.b = t2.b AND t3.c = t0.c + | ) AND t2.b = t1.b + | ) + |) + |""".stripMargin + withSQLConf( + "spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled" -> "true", + "spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled" -> "true", + "spark.sql.planChangeLog.level" -> "info" + ) { + val df = sql(querySuperNested).collect() + } + } + + test("query without count bug with domain joins") { + sql("CREATE TEMP VIEW t0 AS SELECT 1 AS a, 2 AS b, 3 AS c") + sql("CREATE TEMP VIEW t1 AS SELECT 1 AS a, 2 AS b, 3 AS c") + sql("CREATE TEMP VIEW t2 AS SELECT 1 AS a, 2 AS b, 3 AS c") + sql("CREATE TEMP VIEW t3 AS SELECT 1 AS a, 2 AS b, 3 AS c") + val query = + """ + |SELECT * + |FROM t1 + |WHERE t1.a > ( + | SELECT MAX(t2.a) + | FROM t2 + | WHERE t2.a > ( + | SELECT MAX(t3.a) + | FROM t3 + | WHERE t3.b > t2.b AND t3.c > t1.c + | ) AND t2.b > t1.b + |) + |""".stripMargin + withSQLConf( + "spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled" -> "true", + "spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled" -> "true", + "spark.sql.planChangeLog.level" -> "info" + ) { + val df = sql(query).collect() + } + val querySuperNested = + """ + |SELECT * + |FROM t0 + |WHERE t0.a > ( + |SELECT t1.a + |FROM t1 + |WHERE t1.a > ( + | SELECT MAX(t2.a) + | FROM t2 + | WHERE t2.a > ( + | SELECT MAX(t3.a) + | FROM t3 + | WHERE t3.b > t2.b AND t3.c > t0.c + | ) AND t2.b > t1.b + | ) + |) + |""".stripMargin + withSQLConf( + "spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled" -> "true", + "spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled" -> "true", + "spark.sql.planChangeLog.level" -> "info" + ) { + val df = sql(querySuperNested).collect() + } + } + + test("avery test") { + val query = + """ + |SELECT b, MAX(t1.a) + |FROM t1 + |GROUP BY b + |HAVING ( + | SELECT MAX(t2.a) + | FROM t2 + | WHERE t2.a = ( + | SELECT MAX(t3.a) + | FROM t3 + | WHERE t3.a > MAX(t1.a) + | ) AND t2.b > t1.b + |); + |""".stripMargin + 
sql("CREATE TABLE IF NOT EXISTS t1(a INT, b INT, c INT);") + sql("CREATE TABLE IF NOT EXISTS t2(a INT, b INT, c INT);") + sql("CREATE TABLE IF NOT EXISTS t3(a INT, b INT, c INT);") + sql("INSERT INTO t1 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL);") + sql("INSERT INTO t2 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL);") + sql("INSERT INTO t3 VALUES (0, 0, 0), (1, 1, 1), (2, 2, 2), (3, 3, 3), (NULL, NULL, NULL);") + withTable("t1", "t2", "t3") { + withSQLConf( + "spark.sql.planChangeLog.level" -> "info", + "spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled" -> "true", + "spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled" -> "true" + ) { + val df = sql(query).collect() + } + } + } + + test("postgresql test") { + val query = + """ + |SELECT i, (SELECT s1.k+s2.k FROM (SELECT (SELECT 42+i1.i) AS k) s1, + | (SELECT (SELECT 42+i1.i) AS k) s2) AS j FROM table_integers i1 ORDER BY i; + |""".stripMargin + sql("CREATE TABLE table_integers(i INTEGER);") + sql("INSERT INTO table_integers VALUES (1), (2), (3), (NULL);") + withTable("table_integers") { + withSQLConf( + "spark.sql.planChangeLog.level" -> "info", + "spark.sql.optimizer.supportNestedCorrelatedSubqueries.enabled" -> "true", + "spark.sql.optimizer.supportNestedCorrelatedSubqueriesForScalarSubqueries.enabled" -> "true" + ) { + val df = sql(query).collect() + } + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala index b4b3bff86471e..2a4cd889c0ccf 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/PlanResolutionSuite.scala @@ -1012,7 +1012,7 @@ class PlanResolutionSuite extends SharedSparkSession with AnalysisTest { query match { case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", Seq()), UnresolvedSubqueryColumnAliases(outputColumnNames, Project(_, _: OneRowRelation)))), - _, _, _, _, _) => + _, _, _, _, _, _) => assert(projects.size == 1 && projects.head.name == "s.name") assert(outputColumnNames.size == 1 && outputColumnNames.head == "name") case o => fail("Unexpected subquery: \n" + o.treeString) @@ -1093,7 +1093,7 @@ class PlanResolutionSuite extends SharedSparkSession with AnalysisTest { query match { case ListQuery(Project(projects, SubqueryAlias(AliasIdentifier("s", Seq()), UnresolvedSubqueryColumnAliases(outputColumnNames, Project(_, _: OneRowRelation)))), - _, _, _, _, _) => + _, _, _, _, _, _) => assert(projects.size == 1 && projects.head.name == "s.name") assert(outputColumnNames.size == 1 && outputColumnNames.head == "name") case o => fail("Unexpected subquery: \n" + o.treeString) diff --git a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala index 32e11f06ae8b1..acfbd5eba511a 100644 --- a/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala +++ b/sql/hive-thriftserver/src/test/scala/org/apache/spark/sql/hive/thriftserver/ThriftServerQueryTestSuite.scala @@ -110,7 +110,16 @@ class ThriftServerQueryTestSuite extends SQLQueryTestSuite with SharedThriftServ // VARIANT type "variant/named-function-arguments.sql", // SPARK-51516: Support the 
TIME data type by Thrift Server
-    "time.sql"
+    "time.sql",
+    // SPARK-50983: Nested correlation tests currently use analyzer plans as
+    // their expected results, so their expected segment sizes differ from
+    // those of other test cases.
+    "subquery/nestedcorrelation-analyzer-only/combined-subquery.sql",
+    "subquery/nestedcorrelation-analyzer-only/exists-subquery.sql",
+    "subquery/nestedcorrelation-analyzer-only/lateral-subquery.sql",
+    "subquery/nestedcorrelation-analyzer-only/scalar-subquery.sql",
+    "subquery/nestedcorrelation-analyzer-only/subquery-not-supported.sql",
+    "subquery/nestedcorrelation/scalar-subquery.sql"
   )
 
   override def runQueries(