
Commit 899fe4f

Support Schema Evolution in Iceberg
1 parent: b36db64

10 files changed: +46 −70 lines

common/src/main/java/org/apache/comet/parquet/AbstractColumnReader.java

Lines changed: 12 additions & 4 deletions

@@ -27,7 +27,6 @@
 import org.apache.spark.sql.types.DataType;
 import org.apache.spark.sql.types.TimestampNTZType$;
 
-import org.apache.comet.CometConf;
 import org.apache.comet.vector.CometVector;
 
 /** Base class for Comet Parquet column reader implementations. */
@@ -63,6 +62,13 @@ public abstract class AbstractColumnReader implements AutoCloseable {
   /** A pointer to the native implementation of ColumnReader. */
   protected long nativeHandle;
 
+  /**
+   * Whether to enable schema evolution in Comet. For instance, promoting a integer column to a long
+   * column, a float column to a double column, etc. This is automatically enabled when reading from
+   * Iceberg tables.
+   */
+  protected boolean supportsSchemaEvolution;
+
   public AbstractColumnReader(
       DataType type,
       Type fieldType,
@@ -80,9 +86,11 @@ public AbstractColumnReader(
       DataType type,
       ColumnDescriptor descriptor,
       boolean useDecimal128,
-      boolean useLegacyDateTimestamp) {
+      boolean useLegacyDateTimestamp,
+      boolean supportsSchemaEvolution) {
     this(type, null, descriptor, useDecimal128, useLegacyDateTimestamp);
-    TypeUtil.checkParquetType(descriptor, type);
+    this.supportsSchemaEvolution = supportsSchemaEvolution;
+    TypeUtil.checkParquetType(descriptor, type, supportsSchemaEvolution);
   }
 
   public ColumnDescriptor getDescriptor() {
@@ -120,7 +128,7 @@ public void close() {
 
   protected void initNative() {
     LOG.debug("initializing the native column reader");
-    DataType readType = (boolean) CometConf.COMET_SCHEMA_EVOLUTION_ENABLED().get() ? type : null;
+    DataType readType = supportsSchemaEvolution ? type : null;
     boolean useLegacyDateTimestampOrNTZ =
         useLegacyDateTimestamp || type == TimestampNTZType$.MODULE$;
     nativeHandle =
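
Note on the last hunk: the per-reader supportsSchemaEvolution flag replaces the old lookup of spark.comet.schemaEvolution.enabled inside initNative(), so the requested Spark type is only handed to the native reader when promotion is allowed. A minimal sketch of that decision, assuming (as the code above suggests) that a null read type means "decode the column as written in the file"; the ReadTypeSketch class is illustrative and not part of Comet:

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;

// Illustrative sketch of the readType decision in initNative(): with promotion enabled the
// native reader is told which Spark type to produce (e.g. LongType for an int32 column);
// with promotion disabled it receives no read type at all.
class ReadTypeSketch {
  static DataType nativeReadType(DataType requestedSparkType, boolean supportsSchemaEvolution) {
    return supportsSchemaEvolution ? requestedSparkType : null;
  }

  public static void main(String[] args) {
    System.out.println(nativeReadType(DataTypes.LongType, true)); // prints the promoted type
    System.out.println(nativeReadType(DataTypes.LongType, false)); // prints null
  }
}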

common/src/main/java/org/apache/comet/parquet/BatchReader.java

Lines changed: 2 additions & 1 deletion

@@ -583,7 +583,8 @@ private boolean loadNextRowGroupIfNecessary() throws Throwable {
               capacity,
               useDecimal128,
               useLazyMaterialization,
-              useLegacyDateTimestamp);
+              useLegacyDateTimestamp,
+              false);
       reader.setPageReader(rowGroupReader.getPageReader(columns.get(i)));
       columnReaders[i] = reader;
     }

common/src/main/java/org/apache/comet/parquet/ColumnReader.java

Lines changed: 3 additions & 2 deletions

@@ -99,8 +99,9 @@ public ColumnReader(
       CometSchemaImporter importer,
       int batchSize,
       boolean useDecimal128,
-      boolean useLegacyDateTimestamp) {
-    super(type, descriptor, useDecimal128, useLegacyDateTimestamp);
+      boolean useLegacyDateTimestamp,
+      boolean supportsSchemaEvolution) {
+    super(type, descriptor, useDecimal128, useLegacyDateTimestamp, supportsSchemaEvolution);
     assert batchSize > 0 : "Batch size must be positive, found " + batchSize;
     this.batchSize = batchSize;
     this.importer = importer;

common/src/main/java/org/apache/comet/parquet/LazyColumnReader.java

Lines changed: 10 additions & 2 deletions

@@ -49,8 +49,16 @@ public LazyColumnReader(
       CometSchemaImporter importer,
       int batchSize,
       boolean useDecimal128,
-      boolean useLegacyDateTimestamp) {
-    super(sparkReadType, descriptor, importer, batchSize, useDecimal128, useLegacyDateTimestamp);
+      boolean useLegacyDateTimestamp,
+      boolean supportsSchemaEvolution) {
+    super(
+        sparkReadType,
+        descriptor,
+        importer,
+        batchSize,
+        useDecimal128,
+        useLegacyDateTimestamp,
+        supportsSchemaEvolution);
     this.batchSize = 0; // the batch size is set later in `readBatch`
     this.vector = new CometLazyVector(sparkReadType, this, useDecimal128);
   }

common/src/main/java/org/apache/comet/parquet/MetadataColumnReader.java

Lines changed: 1 addition & 1 deletion

@@ -45,7 +45,7 @@ public class MetadataColumnReader extends AbstractColumnReader {
   public MetadataColumnReader(
       DataType type, ColumnDescriptor descriptor, boolean useDecimal128, boolean isConstant) {
     // TODO: should we handle legacy dates & timestamps for metadata columns?
-    super(type, descriptor, useDecimal128, false);
+    super(type, descriptor, useDecimal128, false, false);
 
     this.isConstant = isConstant;
   }

common/src/main/java/org/apache/comet/parquet/TypeUtil.java

Lines changed: 2 additions & 4 deletions

@@ -31,8 +31,6 @@
 import org.apache.spark.sql.internal.SQLConf;
 import org.apache.spark.sql.types.*;
 
-import org.apache.comet.CometConf;
-
 public class TypeUtil {
 
   /** Converts the input Spark 'field' into a Parquet column descriptor. */
@@ -116,11 +114,11 @@ public static ColumnDescriptor convertToParquet(StructField field) {
    * @param descriptor descriptor for a Parquet primitive column
    * @param sparkType Spark read type
    */
-  public static void checkParquetType(ColumnDescriptor descriptor, DataType sparkType) {
+  public static void checkParquetType(
+      ColumnDescriptor descriptor, DataType sparkType, boolean allowTypePromotion) {
     PrimitiveType.PrimitiveTypeName typeName = descriptor.getPrimitiveType().getPrimitiveTypeName();
     LogicalTypeAnnotation logicalTypeAnnotation =
         descriptor.getPrimitiveType().getLogicalTypeAnnotation();
-    boolean allowTypePromotion = (boolean) CometConf.COMET_SCHEMA_EVOLUTION_ENABLED().get();
 
     if (sparkType instanceof NullType) {
       return;
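
checkParquetType now takes the promotion policy from its caller instead of reading the global config. Conceptually, allowTypePromotion widens the set of Spark types accepted for a given Parquet physical type. The sketch below is a simplified stand-in for that idea, not the real TypeUtil logic (which covers many more physical and logical type combinations):

import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;

// Simplified illustration of the kind of check gated by allowTypePromotion.
class TypePromotionSketch {
  static boolean isAllowed(
      PrimitiveTypeName parquetType, DataType sparkType, boolean allowTypePromotion) {
    switch (parquetType) {
      case INT32:
        // int32 can always be read as IntegerType; reading it as LongType requires promotion.
        return sparkType == DataTypes.IntegerType
            || (allowTypePromotion && sparkType == DataTypes.LongType);
      case FLOAT:
        // float can always be read as FloatType; reading it as DoubleType requires promotion.
        return sparkType == DataTypes.FloatType
            || (allowTypePromotion && sparkType == DataTypes.DoubleType);
      default:
        return false;
    }
  }
}

With the flag threaded through the reader constructors, a reader built for an Iceberg scan passes true here, while BatchReader (plain Spark Parquet scans) passes false.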

common/src/main/java/org/apache/comet/parquet/Utils.java

Lines changed: 16 additions & 16 deletions

@@ -28,33 +28,33 @@
 
 public class Utils {
 
-  /** This method is called from Apache Iceberg. */
-  public static ColumnReader getColumnReader(
-      DataType type,
-      ColumnDescriptor descriptor,
-      CometSchemaImporter importer,
-      int batchSize,
-      boolean useDecimal128,
-      boolean useLazyMaterialization) {
-    // TODO: support `useLegacyDateTimestamp` for Iceberg
-    return getColumnReader(
-        type, descriptor, importer, batchSize, useDecimal128, useLazyMaterialization, true);
-  }
-
   public static ColumnReader getColumnReader(
       DataType type,
       ColumnDescriptor descriptor,
       CometSchemaImporter importer,
       int batchSize,
       boolean useDecimal128,
       boolean useLazyMaterialization,
-      boolean useLegacyDateTimestamp) {
+      boolean useLegacyDateTimestamp,
+      boolean supportsSchemaEvolution) {
     if (useLazyMaterialization && supportLazyMaterialization(type)) {
       return new LazyColumnReader(
-          type, descriptor, importer, batchSize, useDecimal128, useLegacyDateTimestamp);
+          type,
+          descriptor,
+          importer,
+          batchSize,
+          useDecimal128,
+          useLegacyDateTimestamp,
+          supportsSchemaEvolution);
     } else {
       return new ColumnReader(
-          type, descriptor, importer, batchSize, useDecimal128, useLegacyDateTimestamp);
+          type,
+          descriptor,
+          importer,
+          batchSize,
+          useDecimal128,
+          useLegacyDateTimestamp,
+          supportsSchemaEvolution);
     }
   }
 
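
The overload that Apache Iceberg called (and which passed true for useLegacyDateTimestamp, with a TODO) is removed, so an Iceberg-side integration now has to call the remaining full signature itself and opt in to schema evolution explicitly. A hypothetical caller-side sketch, assuming CometSchemaImporter lives in org.apache.arrow.c and that the caller already has the descriptor, importer, and flag values in hand; none of this class is part of the commit:

import org.apache.arrow.c.CometSchemaImporter; // assumed package for CometSchemaImporter
import org.apache.comet.parquet.ColumnReader;
import org.apache.comet.parquet.Utils;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.spark.sql.types.DataType;

// Hypothetical Iceberg-side factory method mirroring the removed overload's behaviour.
class IcebergReaderSketch {
  static ColumnReader readerFor(
      DataType sparkReadType,
      ColumnDescriptor descriptor,
      CometSchemaImporter importer,
      int batchSize,
      boolean useDecimal128,
      boolean useLazyMaterialization) {
    return Utils.getColumnReader(
        sparkReadType,
        descriptor,
        importer,
        batchSize,
        useDecimal128,
        useLazyMaterialization,
        true, // useLegacyDateTimestamp: the removed overload hard-coded true (still a TODO)
        true); // supportsSchemaEvolution: Iceberg reads enable type promotion
  }
}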

common/src/main/scala/org/apache/comet/CometConf.scala

Lines changed: 0 additions & 10 deletions

@@ -539,16 +539,6 @@ object CometConf extends ShimCometConf {
     .booleanConf
     .createWithDefault(true)
 
-  val COMET_SCHEMA_EVOLUTION_ENABLED: ConfigEntry[Boolean] = conf(
-    "spark.comet.schemaEvolution.enabled")
-    .internal()
-    .doc(
-      "Whether to enable schema evolution in Comet. For instance, promoting a integer " +
-        "column to a long column, a float column to a double column, etc. This is automatically" +
-        "enabled when reading from Iceberg tables.")
-    .booleanConf
-    .createWithDefault(COMET_SCHEMA_EVOLUTION_ENABLED_DEFAULT)
-
   val COMET_SPARK_TO_ARROW_ENABLED: ConfigEntry[Boolean] =
     conf("spark.comet.sparkToColumnar.enabled")
       .internal()

spark/src/main/scala/org/apache/comet/rules/CometScanRule.scala

Lines changed: 0 additions & 2 deletions

@@ -180,8 +180,6 @@ case class CometScanRule(session: SparkSession) extends Rule[SparkPlan] {
       }
 
       if (s.isCometEnabled && schemaSupported) {
-        // When reading from Iceberg, we automatically enable type promotion
-        SQLConf.get.setConfString(COMET_SCHEMA_EVOLUTION_ENABLED.key, "true")
         CometBatchScanExec(
           scanExec.clone().asInstanceOf[BatchScanExec],
           runtimeFilters = scanExec.runtimeFilters)

spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala

Lines changed: 0 additions & 28 deletions

@@ -1217,34 +1217,6 @@ abstract class ParquetReadSuite extends CometTestBase {
     }
   }
 
-  test("schema evolution") {
-    Seq(true, false).foreach { enableSchemaEvolution =>
-      Seq(true, false).foreach { useDictionary =>
-        {
-          withSQLConf(
-            CometConf.COMET_SCHEMA_EVOLUTION_ENABLED.key -> enableSchemaEvolution.toString) {
-            val data = (0 until 100).map(i => {
-              val v = if (useDictionary) i % 5 else i
-              (v, v.toFloat)
-            })
-            val readSchema =
-              StructType(
-                Seq(StructField("_1", LongType, false), StructField("_2", DoubleType, false)))
-
-            withParquetDataFrame(data, schema = Some(readSchema)) { df =>
-              // TODO: validate with Spark 3.x and 'usingDataFusionParquetExec=true'
-              if (enableSchemaEvolution || usingDataFusionParquetExec(conf)) {
-                checkAnswer(df, data.map(Row.fromTuple))
-              } else {
-                assertThrows[SparkException](df.collect())
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
   test("scan metrics") {
     // https://github.com/apache/datafusion-comet/issues/1441
     assume(CometConf.COMET_NATIVE_SCAN_IMPL.get() != CometConf.SCAN_NATIVE_ICEBERG_COMPAT)
