[SPARK-51265][SQL][SS] Throw proper error for eagerlyExecuteCommands containing streaming source marker #50015

Status: Open. Wants to merge 2 commits into base: master.
File: QueryExecution.scala

@@ -30,21 +30,23 @@ import org.apache.spark.internal.{Logging, MDC}
 import org.apache.spark.internal.LogKeys.EXTENDED_EXPLAIN_GENERATOR
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{AnalysisException, ExtendedExplainGenerator, Row}
-import org.apache.spark.sql.catalyst.{InternalRow, QueryPlanningTracker}
+import org.apache.spark.sql.catalyst.{ExtendedAnalysisException, InternalRow, QueryPlanningTracker}
 import org.apache.spark.sql.catalyst.analysis.{LazyExpression, UnsupportedOperationChecker}
 import org.apache.spark.sql.catalyst.expressions.codegen.ByteCodeStats
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.{AppendData, Command, CommandResult, CreateTableAsSelect, LogicalPlan, OverwriteByExpression, OverwritePartitionsDynamic, ReplaceTableAsSelect, ReturnAnswer, Union}
 import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule}
+import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2
 import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.classic.SparkSession
 import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan}
 import org.apache.spark.sql.execution.bucketing.{CoalesceBucketsInJoin, DisableUnnecessaryBucketedScan}
+import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2ScanRelation
 import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters
 import org.apache.spark.sql.execution.exchange.EnsureRequirements
 import org.apache.spark.sql.execution.reuse.ReuseExchangeAndSubquery
-import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata, WatermarkPropagator}
+import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata, StreamingExecutionRelation, StreamingRelation, WatermarkPropagator}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.util.{LazyTry, Utils}
@@ -128,11 +130,39 @@ class QueryExecution(
     case _ => "command"
   }

+  private def assertNoStreamSourceMarkerNode(p: LogicalPlan): Unit = {
+    // In UnsupportedOperationChecker.checkForBatch, we check if the plan has any streaming node.
+    // That is more aggressive than just checking for the marker node of a streaming source that
+    // is yet to be materialized. We'd like to be a bit conservative here since this is the exact
+    // problematic case we figured out.
+    p.foreach {
[Comment by Contributor]
For a leaf node command like CreateDataSourceTableAsSelectCommand, this check won't work?

[Reply by @HeartSaVioR (Contributor, Author), Feb 20, 2025]
Yeah... LeafRunnableCommand doesn't seem to be something we can deal with in general (there is no general interface to look at the underlying query). We can only deal with it when LeafRunnableCommand runs another query (which it should).
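To make the leaf-command gap concrete, here is a minimal sketch (the command name and run body are hypothetical, not from this PR): because a LeafRunnableCommand keeps its query as a plain constructor field rather than a child plan node, a traversal such as p.foreach over the command plan never visits the streaming relation inside it.

```scala
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.LeafRunnableCommand

// Hypothetical leaf command: `query` is a field, not a child plan node, so
// assertNoStreamSourceMarkerNode(command) sees only this leaf and finds no
// streaming marker, even if `query` scans a streaming source.
case class HiddenQueryCommand(query: LogicalPlan) extends LeafRunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    // The embedded query only surfaces here, when the command executes it
    // (often by triggering another command, where the check can fire again).
    sparkSession.sessionState.executePlan(query).toRdd.count()
    Seq.empty
  }
}
```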

+      case _: StreamingRelation | _: StreamingRelationV2 |
+           _: StreamingExecutionRelation | _: StreamingDataSourceV2ScanRelation =>
+        val msg = "Queries with streaming sources must be executed with writeStream.start()"
[Comment by Member]
Does it have to be writeStream? Can it be readStream?

[Reply by Member]
Oh, readStream cannot start a streaming query.

[Reply by @HeartSaVioR (Contributor, Author)]
Yeah, for now, every streaming query should be triggered via DataStreamWriter.start().
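For reference, the only supported way to consume a streaming source today is through DataStreamWriter.start(); a minimal sketch (the console sink is just an illustrative choice):

```scala
val stream = spark.readStream.format("rate").load()

// Supported: start a streaming query via DataStreamWriter.start().
val query = stream.writeStream
  .format("console")
  .start()
query.awaitTermination()

// Not supported: batch-style execution over `stream`, such as a CTAS or any
// eagerly executed command referencing it; that is the case this PR turns
// into a proper AnalysisException.
```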

+        // This is exactly the same as UnsupportedOperationChecker.checkForBatch.
+        // TODO: Classify and issue a new error class, along with UnsupportedOperationChecker.
+        throw new ExtendedAnalysisException(
+          new AnalysisException(
+            errorClass = "_LEGACY_ERROR_TEMP_3102",
+            messageParameters = Map("msg" -> msg)),
+          plan = p)
+
+      case _ =>
+    }
+  }
+
   private def eagerlyExecuteCommands(p: LogicalPlan) = {
     def eagerlyExecute(
         p: LogicalPlan,
         name: String,
         mode: CommandExecutionMode.Value): LogicalPlan = {
+      // Since we are about to execute the plan, the plan shouldn't have a marker node to be
+      // materialized during microbatch planning. If the plan has a marker node, it is highly
+      // likely that users put streaming sources in a batch query.
+      // This case causes a problem before reaching the check in UnsupportedOperationChecker
[Comment by Contributor]
I think the eager command execution is similar to a normal query execution; why can't it hit UnsupportedOperationChecker?

[Reply by Member]
Because assertSupported is not called for an eagerly executed command?

[Reply by @HeartSaVioR (Contributor, Author)]
This is actually coupled with "explain". The test in QueryExecutionSuite ends up with StackOverflowError. I'll leave a PR comment explaining what happens in the test suite.


+      // (assertSupported), so we need to verify it here manually.
+      assertNoStreamSourceMarkerNode(p)
+
       // Since Command execution will eagerly take place here,
       // and in most cases be the bulk of time and effort,
       // with the rest of processing of the root plan being just outputting command results,
File: QueryExecutionSuite.scala

@@ -422,6 +422,24 @@ class QueryExecutionSuite extends SharedSparkSession {
     mockCallback.assertAnalyzed()
   }

test("SPARK-51265 Running eagerlyExecuteCommand with streaming source should give an user " +
"facing error") {
withTempView("s") {
val streamDf = spark.readStream.format("rate").load()
streamDf.createOrReplaceTempView("s")
withTable("output") {
val ex = intercept[AnalysisException] {
// Creates a table from streaming source with batch query. This should fail.
spark.sql("CREATE TABLE output AS SELECT * FROM s")
[Comment by @HeartSaVioR (Contributor, Author), Feb 20, 2025]
So when this query is executed, the following happens:

CreateDataSourceTableAsSelectCommand is executed. This reaches assertSupported, but it's a leaf node and it hides the query, hence the assertion is a no-op.

It triggers InsertIntoHadoopFsRelationCommand. This exposes the query as a child, so we expect assertSupported to be triggered, but the problem happens while creating the "explainString" (planDesc).

When the query is determined to be streaming (any leaf node is streaming), Spark creates IncrementalExecution (since there are streaming-specific rules defined there) to create the executed plan, which "disables" assertSupported(). This is not a bug, because we shouldn't check a streaming query against a batch query's criteria. It should have been checked against a streaming query's criteria before this point.

I'd say the two are just in conflict - QueryExecution only works properly with a batch query, and IncrementalExecution only works properly with a streaming query. It's just that we found a case where QueryExecution somehow receives the execution of a "streaming query" (at least from the isStreaming flag perspective).

What happens? withCachedData is called infinitely (I haven't tracked down why it loops) and it ends up with StackOverflowError.

This is only the CTAS case, and there are lots of commands, so we can't check everything; I'd like to simply block the case where QueryExecution has to handle a "streaming query" (I have only received reports involving commands, but I could be wrong).
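The "determined to be streaming" step above refers to the plan's isStreaming flag, which is also exposed on Dataset; a quick illustration with public API:

```scala
val batchDf = spark.range(10).toDF()
val streamDf = spark.readStream.format("rate").load()

batchDf.isStreaming   // false: safe for QueryExecution to plan and execute
streamDf.isStreaming  // true: should only ever be planned by IncrementalExecution
```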

[Comment by Contributor]
So before this PR, this test fails with stack overflow?

If the stack overflow issue is fixed, InsertIntoHadoopFsRelationCommand will hit UnsupportedOperationChecker and we are fine?

[Reply by @HeartSaVioR (Contributor, Author), Feb 20, 2025]
Yes, it went with stack overflow, and for this case we might be OK.

Though I wouldn't assume this is the only case - this is a minimized reproducer of the report, and the original report ended up with an AnalysisException with INTERNAL_ERROR (it even came from a different command).

+        }
+        assert(
+          ex.getMessage.contains("Queries with streaming sources must be executed with " +
+            "writeStream.start()")
+        )
+      }
+    }
+  }

   case class MockCallbackEagerCommand(
       var trackerAnalyzed: QueryPlanningTracker = null,
       var trackerReadyForExecution: QueryPlanningTracker = null)
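For completeness, the streaming-native counterpart of the CTAS rejected above would go through DataStreamWriter.toTable; a sketch (the checkpoint path is hypothetical):

```scala
val streamDf = spark.readStream.format("rate").load()

// Write the stream into a table through a proper streaming query instead of
// running a batch CTAS over a streaming temp view.
val query = streamDf.writeStream
  .option("checkpointLocation", "/tmp/spark-51265-ckpt") // hypothetical path
  .toTable("output")
query.awaitTermination()
```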
File: ForeachBatchSinkSuite.scala

@@ -215,6 +215,46 @@ class ForeachBatchSinkSuite extends StreamTest {
     assert(ex.getCause == sparkEx)
   }

test("SPARK-51265 Running eagerlyExecuteCommand with streaming source in foreachBatch " +
"should give an user facing error") {
val mem = MemoryStream[Int]
val ds = mem.toDS().map(_ + 1)

def foreachBatchFn(df: Dataset[Int], batchId: Long): Unit = {
withTempView("param", "s") {
df.createOrReplaceTempView("param")
val streamDf = df.sparkSession.readStream.format("rate").load()
streamDf.createOrReplaceTempView("s")
withTable("output") {
val ex = intercept[AnalysisException] {
// Creates a table from streaming source with batch query. This should fail.
df.sparkSession.sql("CREATE TABLE output AS SELECT * FROM s")
[Comment by @HeartSaVioR (Contributor, Author), Feb 20, 2025]
Same reasoning as above, because queries in the foreachBatch user function are technically batch queries.

+          }
+          assert(
+            ex.getMessage.contains("Queries with streaming sources must be executed with " +
+              "writeStream.start()")
+          )
+
+          // Create a table from a batch source (the materialized RDD plan of the streaming
+          // query). This should work properly.
+          df.sparkSession.sql("CREATE TABLE output AS SELECT * from param")
+
+          checkAnswer(
+            df.sparkSession.sql("SELECT value FROM output"),
+            Seq(Row(2), Row(3), Row(4), Row(5), Row(6)))
+        }
+      }
+    }

+    mem.addData(1, 2, 3, 4, 5)
+
+    val query = ds.writeStream
+      .trigger(Trigger.AvailableNow())
+      .foreachBatch(foreachBatchFn _)
+      .start()
+    query.awaitTermination()
+  }

   // ============== Helper classes and methods =================

   private class ForeachBatchTester[T: Encoder](memoryStream: MemoryStream[Int]) {