Commit 7d46ff2

chore: Add scanImpl attribute to CometScanExec (#1746)
1 parent 093a244 commit 7d46ff2

21 files changed: +125 -130 lines
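
The change in one sentence: the scan implementation ("native_comet", "native_datafusion", or "native_iceberg_compat") is now carried as a scanImpl attribute on each scan node at planning time, instead of being re-read from CometConf.COMET_NATIVE_SCAN_IMPL wherever it is needed. A minimal, self-contained sketch of that idea (illustrative names only, not the real Comet classes):

    // Illustrative sketch only, not the real Comet classes: the scan implementation
    // becomes a per-node attribute that downstream rules can match on directly.
    object ScanImpl {
      val NativeComet = "native_comet"
      val NativeDataFusion = "native_datafusion"
      val NativeIcebergCompat = "native_iceberg_compat"
    }

    final case class ScanNode(scanImpl: String)

    // Dispatch on the node itself rather than on a global config value.
    def usesDataFusionRuntime(scan: ScanNode): Boolean =
      scan.scanImpl == ScanImpl.NativeDataFusion ||
        scan.scanImpl == ScanImpl.NativeIcebergCompat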

.github/workflows/miri.yml

Lines changed: 1 addition & 1 deletion
@@ -52,4 +52,4 @@ jobs:
       - name: Test with Miri
         run: |
           cd native
-          MIRIFLAGS="-Zmiri-disable-isolation" cargo miri test
+          MIRIFLAGS="-Zmiri-disable-isolation" cargo miri test --lib --bins --tests --examples

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 0 additions & 5 deletions
@@ -104,11 +104,6 @@ object CometConf extends ShimCometConf {
         .getOrElse("COMET_PARQUET_SCAN_IMPL", SCAN_NATIVE_COMET)
         .toLowerCase(Locale.ROOT))

-  def isExperimentalNativeScan: Boolean = COMET_NATIVE_SCAN_IMPL.get() match {
-    case SCAN_NATIVE_DATAFUSION | SCAN_NATIVE_ICEBERG_COMPAT => true
-    case SCAN_NATIVE_COMET => false
-  }
-
   val COMET_PARQUET_PARALLEL_IO_ENABLED: ConfigEntry[Boolean] =
     conf("spark.comet.parquet.read.parallel.io.enabled")
       .doc(
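
The deleted isExperimentalNativeScan helper answered "is an experimental scan selected?" from the global COMET_NATIVE_SCAN_IMPL setting. With the new per-node attribute, the equivalent check can be expressed against the scan itself; the helper below is a hypothetical sketch of that migration, not code from this commit:

    // Hypothetical replacement for the deleted helper, evaluated against a
    // CometScanExec's new scanImpl field instead of the global config.
    def isExperimentalScanImpl(scanImpl: String): Boolean = scanImpl match {
      case CometConf.SCAN_NATIVE_DATAFUSION | CometConf.SCAN_NATIVE_ICEBERG_COMPAT => true
      case _ => false
    }
    // e.g. isExperimentalScanImpl(scan.scanImpl) for a given CometScanExec `scan`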

dev/diffs/3.4.3.diff

Lines changed: 1 addition & 1 deletion
@@ -961,7 +961,7 @@ index 75eabcb96f2..36e3318ad7e 100644
          _.asInstanceOf[FileScanRDD].filePartitions.forall(
            _.files.forall(_.urlEncodedPath.contains("p=0"))))
 +    case WholeStageCodegenExec(ColumnarToRowExec(InputAdapter(
-+      fs @ CometScanExec(_, _, _, partitionFilters, _, _, _, _, _, _)))) =>
++      fs @ CometScanExec(_, _, _, _, partitionFilters, _, _, _, _, _, _)))) =>
 +      partitionFilters.exists(ExecSubqueryExpression.hasSubquery) &&
 +      fs.inputRDDs().forall(
 +        _.asInstanceOf[FileScanRDD].filePartitions.forall(

dev/diffs/3.5.4.diff

Lines changed: 1 addition & 1 deletion
@@ -1092,7 +1092,7 @@ index 260c992f1ae..b9d8e22337c 100644
          _.asInstanceOf[FileScanRDD].filePartitions.forall(
            _.files.forall(_.urlEncodedPath.contains("p=0"))))
 +    case WholeStageCodegenExec(ColumnarToRowExec(InputAdapter(
-+      fs @ CometScanExec(_, _, _, partitionFilters, _, _, _, _, _, _)))) =>
++      fs @ CometScanExec(_, _, _, _, partitionFilters, _, _, _, _, _, _)))) =>
 +      partitionFilters.exists(ExecSubqueryExpression.hasSubquery) &&
 +      fs.inputRDDs().forall(
 +        _.asInstanceOf[FileScanRDD].filePartitions.forall(

dev/diffs/3.5.5.diff

Lines changed: 1 addition & 1 deletion
@@ -963,7 +963,7 @@ index 04702201f82..6cc2b01b7f3 100644
          _.asInstanceOf[FileScanRDD].filePartitions.forall(
            _.files.forall(_.urlEncodedPath.contains("p=0"))))
 +    case WholeStageCodegenExec(ColumnarToRowExec(InputAdapter(
-+      fs @ CometScanExec(_, _, _, partitionFilters, _, _, _, _, _, _)))) =>
++      fs @ CometScanExec(_, _, _, _, partitionFilters, _, _, _, _, _, _)))) =>
 +      partitionFilters.exists(ExecSubqueryExpression.hasSubquery) &&
 +      fs.inputRDDs().forall(
 +        _.asInstanceOf[FileScanRDD].filePartitions.forall(

dev/diffs/4.0.0-preview1.diff

Lines changed: 1 addition & 1 deletion
@@ -1035,7 +1035,7 @@ index 68f14f13bbd..174636cefb5 100644
          _.asInstanceOf[FileScanRDD].filePartitions.forall(
            _.files.forall(_.urlEncodedPath.contains("p=0"))))
 +    case WholeStageCodegenExec(ColumnarToRowExec(InputAdapter(
-+      fs @ CometScanExec(_, _, _, partitionFilters, _, _, _, _, _, _)))) =>
++      fs @ CometScanExec(_, _, _, _, partitionFilters, _, _, _, _, _, _)))) =>
 +      partitionFilters.exists(ExecSubqueryExpression.hasSubquery) &&
 +      fs.inputRDDs().forall(
 +        _.asInstanceOf[FileScanRDD].filePartitions.forall(

spark/src/main/scala/org/apache/comet/parquet/CometParquetFileFormat.scala

Lines changed: 6 additions & 3 deletions
@@ -56,7 +56,10 @@ import org.apache.comet.vector.CometVector
  * in [[org.apache.comet.CometSparkSessionExtensions]]
  * - `buildReaderWithPartitionValues`, so Spark calls Comet's Parquet reader to read values.
  */
-class CometParquetFileFormat extends ParquetFileFormat with MetricsSupport with ShimSQLConf {
+class CometParquetFileFormat(scanImpl: String)
+    extends ParquetFileFormat
+    with MetricsSupport
+    with ShimSQLConf {
   override def shortName(): String = "parquet"
   override def toString: String = "CometParquet"
   override def hashCode(): Int = getClass.hashCode()

@@ -100,8 +103,8 @@ class CometParquetFileFormat extends ParquetFileFormat with MetricsSupport with

     // Comet specific configurations
     val capacity = CometConf.COMET_BATCH_SIZE.get(sqlConf)
-    val nativeIcebergCompat =
-      CometConf.COMET_NATIVE_SCAN_IMPL.get(sqlConf).equals(CometConf.SCAN_NATIVE_ICEBERG_COMPAT)
+
+    val nativeIcebergCompat = scanImpl == CometConf.SCAN_NATIVE_ICEBERG_COMPAT

     (file: PartitionedFile) => {
       val sharedConf = broadcastedHadoopConf.value.value
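
CometParquetFileFormat now receives the scan implementation through its constructor, so nativeIcebergCompat becomes a plain comparison against that argument rather than a lookup of COMET_NATIVE_SCAN_IMPL in the session config. A short, hypothetical usage sketch (the real call site appears further down in this commit, where CometNativeScanExec passes CometConf.SCAN_NATIVE_DATAFUSION):

    // Hypothetical instantiation: the caller fixes the scan implementation once
    // when the format is created.
    val format = new CometParquetFileFormat(CometConf.SCAN_NATIVE_ICEBERG_COMPAT)
    // Inside buildReaderWithPartitionValues the flag then reduces to:
    //   val nativeIcebergCompat = scanImpl == CometConf.SCAN_NATIVE_ICEBERG_COMPAT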

spark/src/main/scala/org/apache/comet/parquet/CometParquetPartitionReaderFactory.scala

Lines changed: 1 addition & 11 deletions
@@ -46,6 +46,7 @@ import org.apache.comet.{CometConf, CometRuntimeException}
 import org.apache.comet.shims.ShimSQLConf

 case class CometParquetPartitionReaderFactory(
+    usingDataFusionReader: Boolean,
     @transient sqlConf: SQLConf,
     broadcastedConf: Broadcast[SerializableConfiguration],
     readDataSchema: StructType,

@@ -71,17 +72,6 @@ case class CometParquetPartitionReaderFactory(
   // Comet specific configurations
   private val batchSize = CometConf.COMET_BATCH_SIZE.get(sqlConf)

-  @transient private lazy val usingDataFusionReader: Boolean = {
-    val conf = broadcastedConf.value.value
-    conf.getBoolean(
-      CometConf.COMET_NATIVE_SCAN_ENABLED.key,
-      CometConf.COMET_NATIVE_SCAN_ENABLED.defaultValue.get) &&
-    conf
-      .get(
-        CometConf.COMET_NATIVE_SCAN_IMPL.key,
-        CometConf.COMET_NATIVE_SCAN_IMPL.defaultValueString)
-      .equalsIgnoreCase(CometConf.SCAN_NATIVE_ICEBERG_COMPAT)
-  }
   // This is only called at executor on a Broadcast variable, so we don't want it to be
   // materialized at driver.
   @transient private lazy val preFetchEnabled = {
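
The factory previously re-derived usingDataFusionReader on the executor from the broadcast Hadoop configuration; it is now an explicit first constructor argument supplied by the caller. A hedged sketch of how a driver-side caller could compute it (sqlConf and the remaining constructor arguments are assumed to be in scope; this is not code from the commit):

    // Hypothetical driver-side computation of the flag that used to live in the
    // deleted lazy val; it is then passed as the first argument to the factory.
    val usingDataFusionReader: Boolean =
      CometConf.COMET_NATIVE_SCAN_ENABLED.get(sqlConf) &&
        CometConf.COMET_NATIVE_SCAN_IMPL
          .get(sqlConf)
          .equalsIgnoreCase(CometConf.SCAN_NATIVE_ICEBERG_COMPAT)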

spark/src/main/scala/org/apache/comet/parquet/CometParquetScan.scala

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ trait CometParquetScan extends FileScan with MetricsSupport {
     val broadcastedConf =
       sparkSession.sparkContext.broadcast(new SerializableConfiguration(hadoopConf))
     CometParquetPartitionReaderFactory(
+      usingDataFusionReader = false, // this value is not used since this is v2 scan
       sqlConf,
       broadcastedConf,
       readDataSchema,

spark/src/main/scala/org/apache/comet/rules/CometExecRule.scala

Lines changed: 2 additions & 3 deletions
@@ -35,7 +35,7 @@ import org.apache.spark.sql.execution.window.WindowExec
 import org.apache.spark.sql.types.{DoubleType, FloatType}

 import org.apache.comet.{CometConf, ExtendedExplainInfo}
-import org.apache.comet.CometConf.{COMET_ANSI_MODE_ENABLED, COMET_NATIVE_SCAN_IMPL, COMET_SHUFFLE_FALLBACK_TO_COLUMNAR}
+import org.apache.comet.CometConf.{COMET_ANSI_MODE_ENABLED, COMET_SHUFFLE_FALLBACK_TO_COLUMNAR}
 import org.apache.comet.CometSparkSessionExtensions.{createMessage, getCometBroadcastNotEnabledReason, getCometShuffleNotEnabledReason, isANSIEnabled, isCometBroadCastForceEnabled, isCometExecEnabled, isCometJVMShuffleMode, isCometLoaded, isCometNativeShuffleMode, isCometScan, isCometShuffleEnabled, isSpark40Plus, shouldApplySparkToColumnar, withInfo}
 import org.apache.comet.serde.OperatorOuterClass.Operator
 import org.apache.comet.serde.QueryPlanSerde

@@ -154,8 +154,7 @@ case class CometExecRule(session: SparkSession) extends Rule[SparkPlan] {

     plan.transformUp {
       // Fully native scan for V1
-      case scan: CometScanExec
-          if COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_DATAFUSION =>
+      case scan: CometScanExec if scan.scanImpl == CometConf.SCAN_NATIVE_DATAFUSION =>
        val nativeOp = QueryPlanSerde.operator2Proto(scan).get
        CometNativeScanExec(nativeOp, scan.wrapped, scan.session)

spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala

Lines changed: 31 additions & 13 deletions
@@ -25,14 +25,15 @@ import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, PlanExpression}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.MetadataColumnHelper
-import org.apache.spark.sql.comet.{CometBatchScanExec, CometNativeScanExec, CometScanExec}
+import org.apache.spark.sql.comet.{CometBatchScanExec, CometScanExec}
 import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan}
 import org.apache.spark.sql.execution.datasources.HadoopFsRelation
 import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{ArrayType, ByteType, DataType, MapType, ShortType, StructType}

-import org.apache.comet.CometConf
+import org.apache.comet.{CometConf, DataTypeSupport}
 import org.apache.comet.CometConf._
 import org.apache.comet.CometSparkSessionExtensions.{isCometLoaded, isCometScanEnabled, withInfo, withInfos}
 import org.apache.comet.parquet.{CometParquetScan, SupportsComet}

@@ -106,16 +107,11 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
       return withInfos(scanExec, fallbackReasons.toSet)
     }

-    val (schemaSupported, partitionSchemaSupported) = scanImpl match {
-      case CometConf.SCAN_NATIVE_DATAFUSION =>
-        (
-          CometNativeScanExec.isSchemaSupported(scanExec.requiredSchema, fallbackReasons),
-          CometNativeScanExec.isSchemaSupported(r.partitionSchema, fallbackReasons))
-      case CometConf.SCAN_NATIVE_COMET | SCAN_NATIVE_ICEBERG_COMPAT =>
-        (
-          CometScanExec.isSchemaSupported(scanExec.requiredSchema, fallbackReasons),
-          CometScanExec.isSchemaSupported(r.partitionSchema, fallbackReasons))
-    }
+    val typeChecker = new CometScanTypeChecker(scanImpl)
+    val schemaSupported =
+      typeChecker.isSchemaSupported(scanExec.requiredSchema, fallbackReasons)
+    val partitionSchemaSupported =
+      typeChecker.isSchemaSupported(r.partitionSchema, fallbackReasons)

     if (!schemaSupported) {
       fallbackReasons += s"Unsupported schema ${scanExec.requiredSchema} for $scanImpl"

@@ -125,7 +121,9 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
     }

     if (schemaSupported && partitionSchemaSupported) {
-      CometScanExec(scanExec, session)
+      // this is confusing, but we always insert a CometScanExec here, which may replaced
+      // with a CometNativeExec when CometExecRule runs, depending on the scanImpl value.
+      CometScanExec(scanExec, session, scanImpl)
     } else {
       withInfos(scanExec, fallbackReasons.toSet)
     }

@@ -201,3 +199,23 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
   }

 }
+
+case class CometScanTypeChecker(scanImpl: String) extends DataTypeSupport {
+  override def isTypeSupported(
+      dt: DataType,
+      name: String,
+      fallbackReasons: ListBuffer[String]): Boolean = {
+    dt match {
+      case ByteType | ShortType
+          if scanImpl != CometConf.SCAN_NATIVE_COMET &&
+            !CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.get() =>
+        fallbackReasons += s"$scanImpl scan cannot read $dt when " +
+          s"${CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key} is false. ${CometConf.COMPAT_GUIDE}."
+        false
+      case _: StructType | _: ArrayType | _: MapType if scanImpl == CometConf.SCAN_NATIVE_COMET =>
+        false
+      case _ =>
+        super.isTypeSupported(dt, name, fallbackReasons)
+    }
+  }
+}
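
The new CometScanTypeChecker consolidates the per-implementation schema checks that previously lived on CometNativeScanExec and CometScanExec: byte and short columns are rejected for non-native_comet scans unless COMET_SCAN_ALLOW_INCOMPATIBLE is enabled, and struct/array/map types are rejected for native_comet. A small usage sketch mirroring the call in CometScanRule (the schema and fallback buffer are illustrative):

    // Illustrative use of the new checker; the schema here is made up.
    import scala.collection.mutable.ListBuffer
    import org.apache.spark.sql.types.{ByteType, StructField, StructType}

    val fallbackReasons = ListBuffer.empty[String]
    val checker = CometScanTypeChecker(CometConf.SCAN_NATIVE_DATAFUSION)
    val supported =
      checker.isSchemaSupported(StructType(Seq(StructField("b", ByteType))), fallbackReasons)
    // With COMET_SCAN_ALLOW_INCOMPATIBLE left at false, this yields supported == false and a
    // fallback reason explaining that the native_datafusion scan cannot read ByteType.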

spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala

Lines changed: 18 additions & 5 deletions
@@ -2282,8 +2282,7 @@ object QueryPlanSerde extends Logging with CometExprShim {
     op match {

       // Fully native scan for V1
-      case scan: CometScanExec
-          if CometConf.COMET_NATIVE_SCAN_IMPL.get(conf) == CometConf.SCAN_NATIVE_DATAFUSION =>
+      case scan: CometScanExec if scan.scanImpl == CometConf.SCAN_NATIVE_DATAFUSION =>
         val nativeScanBuilder = OperatorOuterClass.NativeScan.newBuilder()
         nativeScanBuilder.setSource(op.simpleStringWithNodeId())

@@ -2376,12 +2375,26 @@ object QueryPlanSerde extends Logging with CometExprShim {
         val cond = exprToProto(condition, child.output)

         if (cond.isDefined && childOp.nonEmpty) {
+          // We need to determine whether to use DataFusion's FilterExec or Comet's
+          // FilterExec. The difference is that DataFusion's implementation will sometimes pass
+          // batches through whereas the Comet implementation guarantees that a copy is always
+          // made, which is critical when using `native_comet` scans due to buffer re-use
+
+          // TODO this could be optimized more to stop walking the tree on hitting
+          // certain operators such as join or aggregate which will copy batches
+          def containsNativeCometScan(plan: SparkPlan): Boolean = {
+            plan match {
+              case w: CometScanWrapper => containsNativeCometScan(w.originalPlan)
+              case scan: CometScanExec => scan.scanImpl == CometConf.SCAN_NATIVE_COMET
+              case _: CometNativeScanExec => false
+              case _ => plan.children.exists(containsNativeCometScan)
+            }
+          }
+
           val filterBuilder = OperatorOuterClass.Filter
             .newBuilder()
             .setPredicate(cond.get)
-            .setUseDatafusionFilter(
-              CometConf.COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_DATAFUSION ||
-                CometConf.COMET_NATIVE_SCAN_IMPL.get() == CometConf.SCAN_NATIVE_ICEBERG_COMPAT)
+            .setUseDatafusionFilter(!containsNativeCometScan(op))
           Some(result.setFilter(filterBuilder).build())
         } else {
           withInfo(op, condition, child)
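
The filter serialization now inspects the child plan instead of the global config: DataFusion's FilterExec may pass input batches through unchanged, which is only unsafe when a buffer-reusing native_comet scan feeds the filter, so useDatafusionFilter is set exactly when no such scan is found below the operator. A self-contained sketch of that recursive check on made-up node types (not Spark's classes):

    // Self-contained sketch of the "does any input come from the buffer-reusing
    // native_comet scan?" walk; Node/Scan/Filter are illustrative stand-ins.
    sealed trait Node { def children: Seq[Node] }
    final case class Scan(scanImpl: String, children: Seq[Node] = Nil) extends Node
    final case class Filter(child: Node) extends Node { def children: Seq[Node] = Seq(child) }

    def containsNativeCometScan(node: Node): Boolean = node match {
      case Scan(impl, _) => impl == "native_comet"
      case other => other.children.exists(containsNativeCometScan)
    }

    // containsNativeCometScan(Filter(Scan("native_comet")))      == true  -> Comet's copying filter
    // containsNativeCometScan(Filter(Scan("native_datafusion"))) == false -> DataFusion's filter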

spark/src/main/scala/org/apache/spark/sql/comet/CometNativeScanExec.scala

Lines changed: 5 additions & 19 deletions
@@ -19,7 +19,6 @@

 package org.apache.spark.sql.comet

-import scala.collection.mutable.ListBuffer
 import scala.reflect.ClassTag

 import org.apache.spark.rdd.RDD

@@ -36,12 +35,12 @@ import org.apache.spark.util.collection._

 import com.google.common.base.Objects

-import org.apache.comet.{CometConf, DataTypeSupport}
+import org.apache.comet.CometConf
 import org.apache.comet.parquet.CometParquetFileFormat
 import org.apache.comet.serde.OperatorOuterClass.Operator

 /**
- * Comet fully native scan node for DataSource V1.
+ * Comet fully native scan node for DataSource V1 that delegates to DataFusion's DataSourceExec.
  */
 case class CometNativeScanExec(
     override val nativeOp: Operator,

@@ -184,7 +183,7 @@ case class CometNativeScanExec(
   override def inputRDDs(): Seq[RDD[InternalRow]] = originalPlan.inputRDDs()
 }

-object CometNativeScanExec extends DataTypeSupport {
+object CometNativeScanExec {
   def apply(
       nativeOp: Operator,
       scanExec: FileSourceScanExec,

@@ -206,7 +205,8 @@ object CometNativeScanExec extends DataTypeSupport {
     // https://github.com/apache/arrow-datafusion-comet/issues/190
     def transform(arg: Any): AnyRef = arg match {
       case _: HadoopFsRelation =>
-        scanExec.relation.copy(fileFormat = new CometParquetFileFormat)(session)
+        scanExec.relation.copy(fileFormat =
+          new CometParquetFileFormat(CometConf.SCAN_NATIVE_DATAFUSION))(session)
       case other: AnyRef => other
       case null => null
     }

@@ -229,18 +229,4 @@ object CometNativeScanExec extends DataTypeSupport {
     scanExec.logicalLink.foreach(batchScanExec.setLogicalLink)
     batchScanExec
   }
-
-  override def isTypeSupported(
-      dt: DataType,
-      name: String,
-      fallbackReasons: ListBuffer[String]): Boolean = {
-    dt match {
-      case ByteType | ShortType if !CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.get() =>
-        fallbackReasons += s"${CometConf.SCAN_NATIVE_DATAFUSION} scan cannot read $dt when " +
-          s"${CometConf.COMET_SCAN_ALLOW_INCOMPATIBLE.key} is false. ${CometConf.COMPAT_GUIDE}."
-        false
-      case _ =>
-        super.isTypeSupported(dt, name, fallbackReasons)
-    }
-  }
 }
