diff --git a/.github/workflows/connectors_test.yaml b/.github/workflows/connectors_test.yaml index 1481e982f71..cec3c722b7e 100644 --- a/.github/workflows/connectors_test.yaml +++ b/.github/workflows/connectors_test.yaml @@ -1,8 +1,8 @@ -name: "Delta Connectors Tests" +name: "Delta Connectors" on: [push, pull_request] jobs: build: - name: "Run tests" + name: "DC: Scala ${{ matrix.scala }}" runs-on: ubuntu-20.04 strategy: matrix: diff --git a/.github/workflows/kernel_test.yaml b/.github/workflows/kernel_test.yaml index e30089110c6..a03ba19481b 100644 --- a/.github/workflows/kernel_test.yaml +++ b/.github/workflows/kernel_test.yaml @@ -1,7 +1,8 @@ -name: "Delta Kernel Tests" +name: "Delta Kernel" on: [push, pull_request] jobs: test: + name: "DK" runs-on: ubuntu-20.04 env: SCALA_VERSION: 2.12.18 diff --git a/.github/workflows/spark_examples_test.yaml b/.github/workflows/spark_examples_test.yaml index e3b8341d997..7461e087576 100644 --- a/.github/workflows/spark_examples_test.yaml +++ b/.github/workflows/spark_examples_test.yaml @@ -1,7 +1,8 @@ -name: "Delta Spark Local Publishing and Examples Compilation" +name: "Delta Spark Publishing and Examples" on: [push, pull_request] jobs: test: + name: "DSP&E: Scala ${{ matrix.scala }}" runs-on: ubuntu-20.04 strategy: matrix: diff --git a/.github/workflows/spark_master_test.yaml b/.github/workflows/spark_master_test.yaml index 1836e42bc96..3906c31f221 100644 --- a/.github/workflows/spark_master_test.yaml +++ b/.github/workflows/spark_master_test.yaml @@ -1,7 +1,8 @@ -name: "Delta Spark Master Tests" +name: "Delta Spark Master" on: [push, pull_request] jobs: test: + name: "DSM: Scala ${{ matrix.scala }}, Shard ${{ matrix.shard }}" runs-on: ubuntu-20.04 strategy: matrix: diff --git a/.github/workflows/spark_python_test.yaml b/.github/workflows/spark_python_test.yaml index d103612949e..b183336316f 100644 --- a/.github/workflows/spark_python_test.yaml +++ b/.github/workflows/spark_python_test.yaml @@ -1,7 +1,8 @@ -name: "Delta Spark Python Tests" +name: "Delta Spark Python" on: [push, pull_request] jobs: test: + name: "DSP" runs-on: ubuntu-20.04 strategy: matrix: @@ -60,7 +61,7 @@ jobs: # `-SNAPSHOT` in version (e.g. `3.3.0-SNAPSHOT`) as the version is picked up from # the`version.sbt` file. pipenv run pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0 - pipenv run pip install pyspark==3.5.0 + pipenv run pip install pyspark==3.5.3 pipenv run pip install flake8==3.5.0 pypandoc==1.3.3 pipenv run pip install black==23.9.1 pipenv run pip install importlib_metadata==3.10.0 diff --git a/.github/workflows/spark_test.yaml b/.github/workflows/spark_test.yaml index 8ab56954f74..71893279bc0 100644 --- a/.github/workflows/spark_test.yaml +++ b/.github/workflows/spark_test.yaml @@ -1,7 +1,8 @@ -name: "Delta Spark Tests" +name: "Delta Spark Latest" on: [push, pull_request] jobs: test: + name: "DSL: Scala ${{ matrix.scala }}, Shard ${{ matrix.shard }}" runs-on: ubuntu-20.04 strategy: matrix: @@ -64,7 +65,7 @@ jobs: # `-SNAPSHOT` in version (e.g. `3.3.0-SNAPSHOT`) as the version is picked up from # the`version.sbt` file. 
pipenv run pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0 - pipenv run pip install pyspark==3.5.2 + pipenv run pip install pyspark==3.5.3 pipenv run pip install flake8==3.5.0 pypandoc==1.3.3 pipenv run pip install black==23.9.1 pipenv run pip install importlib_metadata==3.10.0 diff --git a/.github/workflows/unidoc.yaml b/.github/workflows/unidoc.yaml index 8ef11790e5b..78671ce08d5 100644 --- a/.github/workflows/unidoc.yaml +++ b/.github/workflows/unidoc.yaml @@ -1,12 +1,12 @@ - name: "Unidoc generation" + name: "Unidoc" on: [push, pull_request] jobs: build: - name: "Generate unidoc" + name: "U: Scala ${{ matrix.scala }}" runs-on: ubuntu-20.04 strategy: matrix: - # These Scala versions must match those in the build.sbt + # These Scala versions must match those in the build.sbt scala: [2.13.13, 2.12.18] steps: - name: install java diff --git a/Dockerfile b/Dockerfile index 261b6d84a68..6da2b487bf7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -38,7 +38,7 @@ RUN pip3 install --upgrade pip # the`version.sbt` file. RUN pip install pip==24.0 setuptools==69.5.1 wheel==0.43.0 -RUN pip3 install pyspark==3.5.2 +RUN pip3 install pyspark==3.5.3 RUN pip3 install mypy==0.982 diff --git a/benchmarks/build.sbt b/benchmarks/build.sbt index 277a132069b..ef07dd97427 100644 --- a/benchmarks/build.sbt +++ b/benchmarks/build.sbt @@ -20,7 +20,7 @@ scalaVersion := "2.12.18" lazy val root = (project in file(".")) .settings( name := "benchmarks", - libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.2" % "provided", + libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.3" % "provided", libraryDependencies += "com.github.scopt" %% "scopt" % "4.0.1", libraryDependencies += "com.fasterxml.jackson.module" %% "jackson-module-scala" % "2.13.1", diff --git a/build.sbt b/build.sbt index 151af7a67c2..cf58305da7c 100644 --- a/build.sbt +++ b/build.sbt @@ -52,7 +52,7 @@ val all_scala_versions = Seq(scala212, scala213) val default_scala_version = settingKey[String]("Default Scala version") Global / default_scala_version := scala212 -val LATEST_RELEASED_SPARK_VERSION = "3.5.2" +val LATEST_RELEASED_SPARK_VERSION = "3.5.3" val SPARK_MASTER_VERSION = "4.0.0-SNAPSHOT" val sparkVersion = settingKey[String]("Spark version") spark / sparkVersion := getSparkVersion() @@ -176,6 +176,7 @@ def crossSparkSettings(): Seq[Setting[_]] = getSparkVersion() match { Compile / unmanagedSourceDirectories += (Compile / baseDirectory).value / "src" / "main" / "scala-spark-3.5", Test / unmanagedSourceDirectories += (Test / baseDirectory).value / "src" / "test" / "scala-spark-3.5", Antlr4 / antlr4Version := "4.9.3", + Test / javaOptions ++= Seq("-Dlog4j.configurationFile=log4j2.properties"), // Java-/Scala-/Uni-Doc Settings scalacOptions ++= Seq( @@ -204,8 +205,9 @@ def crossSparkSettings(): Seq[Setting[_]] = getSparkVersion() match { "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED", "--add-opens=java.base/sun.nio.cs=ALL-UNNAMED", "--add-opens=java.base/sun.security.action=ALL-UNNAMED", - "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED" - ) + "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED", + "-Dlog4j.configurationFile=log4j2_spark_master.properties" + ), // Java-/Scala-/Uni-Doc Settings // This isn't working yet against Spark Master. 
diff --git a/connectors/.github/workflows/new_pull_request.yaml b/connectors/.github/workflows/new_pull_request.yaml deleted file mode 100644 index 30b9389902a..00000000000 --- a/connectors/.github/workflows/new_pull_request.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: Add new pull requests to Backlog (External) - -on: - pull_request_target: - types: [opened, reopened] - -jobs: - automate-new-pull-requests: - if: ${{ !contains('allisonport-db dennyglee scottsand-db tdas zsxwing', github.event.sender.login) }} - runs-on: ubuntu-latest - steps: - - uses: alex-page/github-project-automation-plus@v0.8.1 - with: - project: oss-delta-prs - column: Needs Review - repo-token: ${{ secrets.PROJECT_BOARD_AUTOMATION_TOKEN }} diff --git a/connectors/.github/workflows/new_updated_issue.yaml b/connectors/.github/workflows/new_updated_issue.yaml deleted file mode 100644 index 9169e9a4486..00000000000 --- a/connectors/.github/workflows/new_updated_issue.yaml +++ /dev/null @@ -1,19 +0,0 @@ -name: Add new and updated issues to Needs Review - -on: - issues: - types: [opened, reopened] - issue_comment: - types: [created] - - -jobs: - automate-new-updated-issues: - if: ${{ !github.event.issue.pull_request && !contains('allisonport-db dennyglee scottsand-db tdas zsxwing', github.event.sender.login) }} - runs-on: ubuntu-latest - steps: - - uses: alex-page/github-project-automation-plus@v0.8.1 - with: - project: oss-delta-issues - column: Needs Review - repo-token: ${{ secrets.PROJECT_BOARD_AUTOMATION_TOKEN }} diff --git a/connectors/.github/workflows/test.yaml b/connectors/.github/workflows/test.yaml deleted file mode 100644 index 1a27feafb04..00000000000 --- a/connectors/.github/workflows/test.yaml +++ /dev/null @@ -1,43 +0,0 @@ -name: "Delta Lake Connectors Tests" -on: [push, pull_request] -jobs: - build: - name: "Run tests" - runs-on: ubuntu-20.04 - strategy: - matrix: - scala: [2.13.13, 2.12.18, 2.11.12] - steps: - - uses: actions/checkout@v2 - - name: install java - uses: actions/setup-java@v2 - with: - distribution: 'zulu' - java-version: '8' - - name: Cache Scala, SBT - uses: actions/cache@v2 - with: - path: | - ~/.sbt - ~/.ivy2 - ~/.cache/coursier - ~/.m2 - key: build-cache-3-with-scala_${{ matrix.scala }} - - name: Run Scala Style tests on test sources (Scala 2.12 only) - run: build/sbt "++ ${{ matrix.scala }}" testScalastyle - if: startsWith(matrix.scala, '2.12.') - - name: Run sqlDeltaImport tests (Scala 2.12 and 2.13 only) - run: build/sbt "++ ${{ matrix.scala }}" sqlDeltaImport/test - if: ${{ !startsWith(matrix.scala, '2.11.') }} - - name: Run Delta Standalone Compatibility tests (Scala 2.12 only) - run: build/sbt "++ ${{ matrix.scala }}" compatibility/test - if: startsWith(matrix.scala, '2.12.') - - name: Run Delta Standalone tests - run: build/sbt "++ ${{ matrix.scala }}" standalone/test testStandaloneCosmetic/test standaloneParquet/test testParquetUtilsWithStandaloneCosmetic/test - - name: Run Hive 3 tests - run: build/sbt "++ ${{ matrix.scala }}" hiveMR/test hiveTez/test - - name: Run Hive 2 tests - run: build/sbt "++ ${{ matrix.scala }}" hive2MR/test hive2Tez/test - - name: Run Flink tests (Scala 2.12 only) - run: build/sbt -mem 3000 "++ ${{ matrix.scala }}" flink/test - if: ${{ startsWith(matrix.scala, '2.12.') }} diff --git a/connectors/.github/workflows/updated_pull_request.yaml b/connectors/.github/workflows/updated_pull_request.yaml deleted file mode 100644 index d15a0075850..00000000000 --- a/connectors/.github/workflows/updated_pull_request.yaml +++ /dev/null @@ -1,20 +0,0 @@ -name: Move 
updated pull requests to Needs Review - -on: - issue_comment: - types: [created] - pull_request_target: - types: [synchronize] - -jobs: - automate-updated-pull-requests: - if: ${{ (github.event.issue.pull_request || github.event.pull_request) && - !contains('allisonport-db dennyglee scottsand-db tdas zsxwing', github.event.sender.login) && - (github.event.pull_request.state == 'open' || github.event.issue.state == 'open') }} - runs-on: ubuntu-latest - steps: - - uses: alex-page/github-project-automation-plus@2af3cf061aeca8ac6ab40a960eee1968a7f9ce0e # TODO: update to use a version after fixes are merged & released - with: - project: oss-delta-prs - column: Needs Review - repo-token: ${{ secrets.PROJECT_BOARD_AUTOMATION_TOKEN }} diff --git a/connectors/dev/README.md b/connectors/dev/README.md deleted file mode 100644 index 8a85deb2a6d..00000000000 --- a/connectors/dev/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Dev README -Below are some helpful IntelliJ configurations you can set to match our coding style and standards. - -## Checkstyle -This project uses checkstyle to format Java code. If developing locally, please setup checkstyle using the following steps. - -1. Add the CheckStyle-IDEA plugin to IntelliJ. -- `Settings > Plugins > Marketplace > CheckStyle-IDEA > INSTALL`. -- Restart your IDE if prompted. - -2. Configure IntelliJ to use the `checkstyle.xml` file provided in this directory. -- Go to `Settings > Tools > Checkstyle` (this tool location may differ based on your version of IntelliJ). -- Set the version to 8.29. -- Under the `Configuration File` heading, click the `+` symbol to add our specific configuration file. -- Give our file a useful description, such as `Delta Connectors Java Checks`, and provide the `connectors/dev/checkstyle.xml` path. -- Click `Next` to add the checkstyle file -- Check `Active` next to it once it has been added -- In the top right, set the Scan Scope to `Only Java sources (including tests)` - -3. Now, on the bottom tab bar, there should be a `CheckStyle` tab that lets you run Java style checks against using the `Check Project` button. - -4. You can also run checkstyle using SBT. For example, `build/sbt checkstyle` to run against all modules or `build/sbt standalone/checkstyle` to test only the `standalone` module. - -## Java Import Order -We use the following import order in our Java files. 
Please update this in `Settings > Editor > Code Style > Java > Imports > Import Layout`: - -``` -import java.* -import javax.* - -import scala.* - -import all other imports - -import io.delta.standalone.* -import io.delta.standalone.internal.* -``` - \ No newline at end of file diff --git a/connectors/flink/src/main/java/io/delta/flink/sink/internal/DeltaPartitionComputer.java b/connectors/flink/src/main/java/io/delta/flink/sink/internal/DeltaPartitionComputer.java index 4ee3ad388b3..fa37d64a1dd 100644 --- a/connectors/flink/src/main/java/io/delta/flink/sink/internal/DeltaPartitionComputer.java +++ b/connectors/flink/src/main/java/io/delta/flink/sink/internal/DeltaPartitionComputer.java @@ -5,6 +5,7 @@ import org.apache.flink.streaming.api.functions.sink.filesystem.BucketAssigner; import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.conversion.DateDateConverter; import org.apache.flink.table.types.logical.LogicalType; import org.apache.flink.table.types.logical.LogicalTypeRoot; import org.apache.flink.table.types.logical.RowType; @@ -99,6 +100,10 @@ public LinkedHashMap generatePartitionValues( partitionValues.put(partitionKey, String.valueOf(element.getShort(keyIndex))); } else if (keyType.getTypeRoot() == LogicalTypeRoot.TINYINT) { partitionValues.put(partitionKey, String.valueOf(element.getByte(keyIndex))); + } else if (keyType.getTypeRoot() == LogicalTypeRoot.DATE) { + DateDateConverter converter = new DateDateConverter(); + String value = String.valueOf(converter.toExternal(element.getInt(keyIndex))); + partitionValues.put(partitionKey, value); } else { throw new RuntimeException("Type not supported " + keyType.getTypeRoot()); } diff --git a/connectors/flink/src/test/java/io/delta/flink/sink/DeltaSinkITCase.java b/connectors/flink/src/test/java/io/delta/flink/sink/DeltaSinkITCase.java new file mode 100644 index 00000000000..fce6f373a8c --- /dev/null +++ b/connectors/flink/src/test/java/io/delta/flink/sink/DeltaSinkITCase.java @@ -0,0 +1,103 @@ +package io.delta.flink.sink; + +import java.nio.file.Path; +import java.time.LocalDate; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import io.delta.flink.utils.DeltaTestUtils; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; +import org.apache.flink.table.data.RowData; +import org.apache.flink.table.data.util.DataFormatConverters; +import org.apache.flink.table.types.logical.DateType; +import org.apache.flink.table.types.logical.IntType; +import org.apache.flink.table.types.logical.RowType; +import org.apache.flink.table.types.utils.TypeConversions; +import org.apache.flink.types.Row; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import io.delta.standalone.DeltaLog; +import io.delta.standalone.data.CloseableIterator; +import io.delta.standalone.data.RowRecord; + +public class DeltaSinkITCase { + + @Test + public void testWritePartitionedByDate(@TempDir Path tempDir) throws Exception { + final String deltaTablePath = tempDir.toString(); + + final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); + + final RowType rowType = new RowType(Arrays.asList( + new RowType.RowField("part1", new DateType()), + new RowType.RowField("data1", new IntType()) + )); + + final 
DataFormatConverters.DataFormatConverter typeConverter = + DataFormatConverters.getConverterForDataType( + TypeConversions.fromLogicalToDataType(rowType) + ); + + final DeltaSink deltaSink = DeltaSink + .forRowData( + new org.apache.flink.core.fs.Path(deltaTablePath), + DeltaTestUtils.getHadoopConf(), + rowType + ) + .withPartitionColumns("part1") + .build(); + + LocalDate startDate = LocalDate.of(2024, 8, 1); + int numOfDays = 30; + List inputDates = IntStream.range(1, numOfDays) + .mapToObj(startDate::plusDays) + .collect(Collectors.toList()); + + RowData[] elements = inputDates.stream() + .map(date -> Row.of(date, 0)) + .map(typeConverter::toInternal) + .toArray(RowData[]::new); + + final DataStream inputStream = env.fromElements(elements); + + inputStream.sinkTo(deltaSink); + + env.execute("Delta Sink Example"); + + DeltaLog deltaLog = + DeltaLog.forTable(DeltaTestUtils.getHadoopConf(), deltaTablePath); + + try (CloseableIterator rowRecordIterator = deltaLog.snapshot().open()) { + + Iterable iterable = () -> rowRecordIterator; + Stream rowRecordStream = StreamSupport.stream(iterable.spliterator(), false); + + List resultDates = rowRecordStream + .map(rowRecord -> rowRecord.getDate("part1").toLocalDate()) + .collect(Collectors.toList()); + + org.assertj.core.api.Assertions.assertThat(resultDates) + .withFailMessage("Delta table failed to store date partitions.") + .containsAll(inputDates); + + } catch (Exception e) { + throw new RuntimeException(e); + } + + List deltaPartitionColumns = deltaLog + .snapshot() + .getMetadata() + .getPartitionColumns(); + + org.assertj.core.api.Assertions.assertThat(deltaPartitionColumns) + .withFailMessage("Delta table failed to store date partitions to the delta log.") + .containsAll(Collections.singletonList("part1")); + } +} diff --git a/connectors/flink/src/test/java/io/delta/flink/sink/internal/TestDeltaBucketAssigner.java b/connectors/flink/src/test/java/io/delta/flink/sink/internal/TestDeltaBucketAssigner.java index ca25ac37db6..ec9fcf758b2 100644 --- a/connectors/flink/src/test/java/io/delta/flink/sink/internal/TestDeltaBucketAssigner.java +++ b/connectors/flink/src/test/java/io/delta/flink/sink/internal/TestDeltaBucketAssigner.java @@ -18,6 +18,7 @@ package io.delta.flink.sink.internal; +import java.time.LocalDate; import java.util.Arrays; import java.util.LinkedHashMap; import javax.annotation.Nullable; @@ -27,6 +28,7 @@ import org.apache.flink.table.data.RowData; import org.apache.flink.table.data.util.DataFormatConverters; import org.apache.flink.table.types.logical.BigIntType; +import org.apache.flink.table.types.logical.DateType; import org.apache.flink.table.types.logical.DoubleType; import org.apache.flink.table.types.logical.IntType; import org.apache.flink.table.types.logical.RowType; @@ -97,22 +99,23 @@ public void testRowDataPartitionComputer() { new RowType.RowField("partition_col3", new BigIntType()), new RowType.RowField("partition_col4", new SmallIntType()), new RowType.RowField("partition_col5", new TinyIntType()), - new RowType.RowField("col5", new VarCharType()), - new RowType.RowField("col6", new IntType()) + new RowType.RowField("partition_col6", new DateType()), + new RowType.RowField("col7", new VarCharType()), + new RowType.RowField("col8", new IntType()) )); DataFormatConverters.DataFormatConverter converter = DataFormatConverters.getConverterForDataType( TypeConversions.fromLogicalToDataType(testRowType) ); String[] partitionCols = {"partition_col1", "partition_col2", "partition_col3", - "partition_col4", 
"partition_col5"}; + "partition_col4", "partition_col5", "partition_col6"}; DeltaPartitionComputer partitionComputer = new DeltaPartitionComputer.DeltaRowDataPartitionComputer(testRowType, partitionCols); RowData record = converter.toInternal( Row.of("1", Integer.MAX_VALUE, Long.MAX_VALUE, Short.MAX_VALUE, Byte.MAX_VALUE, - "some_val", 2)); + LocalDate.of(9999,12,31), "some_val", 2)); // WHEN LinkedHashMap partitionValues = @@ -125,6 +128,7 @@ public void testRowDataPartitionComputer() { put("partition_col3", String.valueOf(Long.MAX_VALUE)); put("partition_col4", String.valueOf(Short.MAX_VALUE)); put("partition_col5", String.valueOf(Byte.MAX_VALUE)); + put("partition_col6", "9999-12-31"); }}; assertEquals(expected, partitionValues); diff --git a/connectors/standalone/src/main/scala/io/delta/standalone/internal/Checkpoints.scala b/connectors/standalone/src/main/scala/io/delta/standalone/internal/Checkpoints.scala index a2f8444f10f..62714dd9553 100644 --- a/connectors/standalone/src/main/scala/io/delta/standalone/internal/Checkpoints.scala +++ b/connectors/standalone/src/main/scala/io/delta/standalone/internal/Checkpoints.scala @@ -238,6 +238,9 @@ private[internal] object Checkpoints extends Logging { snapshot.tombstonesScala ).map(_.wrap) + logInfo(s"Starting to write checkpoint at path=$path using rename=$useRename and " + + s"snapshot=$snapshot") + val writtenPath = if (useRename) { val p = new Path(path) @@ -245,6 +248,7 @@ private[internal] object Checkpoints extends Logging { // speculation, stage retry), so generate the temp path here to avoid two tasks // using the same path. val tempPath = new Path(p.getParent, s".${p.getName}.${UUID.randomUUID}.tmp") + logInfo(s"Writing the checkpoint first to temp file: $tempPath") tempPath.toString } else { path @@ -265,6 +269,25 @@ private[internal] object Checkpoints extends Logging { numOfFiles += 1 } } + + // Before calling close (or rename if applicable) make sure we have written + // all `addFiles` actions from the snapshot. By writing the `addFiles` actions, + // we ensure the table state is captured in the checkpoint. + if (numOfFiles != snapshot.numOfFiles) { + val msg = s"Number of `add` files written to checkpoint file ($numOfFiles) doesn't match " + + s"the number of `add` files contained in the snapshot (${snapshot.numOfFiles}). " + + s"Skipping creating the checkpoint.\nCheckpoint file: `$path`." + + (if (writtenPath != path) s"\nTemporary file: `$writtenPath`" else "") + + // The error message will be logged in the catch block below. + throw new IllegalStateException(msg) + } + + // Close the writer only after writing all the actions. Calling close before writing + // all the records could result in leaving a Parquet file with partial content as the + // `close` flushes the already buffered data to storage. + // This would leak resources but we don't have a way to abort the storage request here. + writer.close() } catch { case e: org.apache.hadoop.fs.FileAlreadyExistsException if !useRename => val p = new Path(writtenPath) @@ -274,8 +297,11 @@ private[internal] object Checkpoints extends Logging { } else { throw e } - } finally { - writer.close() + case other: Throwable => + // Make sure the log the exception before throwing it, so that we know why the checkpoint + // write failed. 
+ logError(s"Error writing checkpoint at $writtenPath", other) + throw other } if (useRename) { @@ -287,27 +313,31 @@ private[internal] object Checkpoints extends Logging { if (fs.rename(src, dest)) { renameDone = true } else { + val msg = s"Cannot rename $src to $dest" // There should be only one writer writing the checkpoint file, so there must be // something wrong here. - throw new IllegalStateException(s"Cannot rename $src to $dest") + logError(msg) + throw new IllegalStateException(msg) } } finally { if (!renameDone) { - fs.delete(src, false) + try { + fs.delete(src, false) + } catch { + case NonFatal(e) => + logWarning(s"Error while deleting the temporary checkpoint part file $src", e) + } } } } - if (numOfFiles != snapshot.numOfFiles) { - throw new IllegalStateException( - "State of the checkpoint doesn't match that of the snapshot.") - } - // Attempting to write empty checkpoint if (checkpointSize == 0) { logWarning(DeltaErrors.EmptyCheckpointErrorMessage) } - CheckpointMetaData(snapshot.version, checkpointSize, None) + val checkpointMetaData = CheckpointMetaData(snapshot.version, checkpointSize, None) + logInfo(s"Checkpoint written to $path with $checkpointMetaData") + checkpointMetaData } } diff --git a/connectors/standalone/src/main/scala/io/delta/standalone/internal/SnapshotImpl.scala b/connectors/standalone/src/main/scala/io/delta/standalone/internal/SnapshotImpl.scala index c8dd67bbfa0..a0697d6c6dd 100644 --- a/connectors/standalone/src/main/scala/io/delta/standalone/internal/SnapshotImpl.scala +++ b/connectors/standalone/src/main/scala/io/delta/standalone/internal/SnapshotImpl.scala @@ -56,6 +56,7 @@ case class ProtocolMetadataLoadMetrics(fileVersions: Seq[Long]) /** * Scala implementation of Java interface [[Snapshot]]. * + * @param path _delta_log path of this snapshot * @param timestamp The timestamp of the latest commit in milliseconds. Can also be set to -1 if the * timestamp of the commit is unknown or the table has not been initialized, i.e. * `version = -1`. 
@@ -82,6 +83,8 @@ private[internal] class SnapshotImpl( import SnapshotImpl._ + @volatile private var loadedState = false + private val memoryOptimizedLogReplay = new MemoryOptimizedLogReplay(files, deltaLog.store, hadoopConf, deltaLog.timezone) @@ -89,6 +92,16 @@ private[internal] class SnapshotImpl( // Public API Methods /////////////////////////////////////////////////////////////////////////// + override def toString: String = { + if (loadedState) { + s"SnapshotImpl(path=$path, version=$version, timestamp=$timestamp, " + + s"sizeInBytes=${state.sizeInBytes}, numAddFiles=${state.numOfFiles}, " + + s"numRemoveFiles=${state.numOfRemoves}, numSetTransactions=${state.numOfSetTransactions})" + } else { + s"SnapshotImpl(path=$path, version=$version, timestamp=$timestamp)" + } + } + override def scan(): DeltaScan = new DeltaScanImpl(memoryOptimizedLogReplay) override def scan(predicate: Expression): DeltaScan = @@ -325,6 +338,8 @@ private[internal] class SnapshotImpl( throw DeltaErrors.actionNotFoundException("metadata", version) } + loadedState = true + State( replay.getSetTransactions, replay.getActiveFiles, diff --git a/docs/environment.yml b/docs/environment.yml index 89dda7b1b3a..eda28c4f724 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -33,7 +33,7 @@ dependencies: - packaging==23.2 - py4j==0.10.9.7 - pygments==2.16.1 - - pyspark==3.5.2 + - pyspark==3.5.3 - pytz==2023.3.post1 - requests==2.31.0 - six==1.16.0 diff --git a/examples/scala/build.sbt b/examples/scala/build.sbt index e253cba3b42..3bc6e285353 100644 --- a/examples/scala/build.sbt +++ b/examples/scala/build.sbt @@ -46,7 +46,7 @@ val lookupSparkVersion: PartialFunction[(Int, Int), String] = { // version 4.0.0-preview1 case (major, minor) if major >= 4 => "4.0.0-preview1" // versions 3.3.x+ - case (major, minor) if major >= 3 && minor >=3 => "3.5.2" + case (major, minor) if major >= 3 && minor >=3 => "3.5.3" // versions 3.0.0 to 3.2.x case (major, minor) if major >= 3 && minor <=2 => "3.5.0" // versions 2.4.x diff --git a/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergConversionTransaction.scala b/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergConversionTransaction.scala index a196c5f12bc..4c2949c8565 100644 --- a/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergConversionTransaction.scala +++ b/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergConversionTransaction.scala @@ -97,9 +97,8 @@ class IcebergConversionTransaction( tablePath, partitionSpec, logicalToPhysicalPartitionNames, - postCommitSnapshot.statsSchema, statsParser, - postCommitSnapshot.deltaLog + postCommitSnapshot ) ) } @@ -138,9 +137,8 @@ class IcebergConversionTransaction( tablePath, partitionSpec, logicalToPhysicalPartitionNames, - postCommitSnapshot.statsSchema, statsParser, - postCommitSnapshot.deltaLog + postCommitSnapshot ) ) } @@ -148,7 +146,11 @@ class IcebergConversionTransaction( def remove(remove: RemoveFile): Unit = { overwriter.deleteFile( convertDeltaRemoveFileToIcebergDataFile( - remove, tablePath, partitionSpec, logicalToPhysicalPartitionNames) + remove, + tablePath, + partitionSpec, + logicalToPhysicalPartitionNames, + postCommitSnapshot) ) } } @@ -167,7 +169,11 @@ class IcebergConversionTransaction( val dataFilesToDelete = removes.map { f => assert(!f.dataChange, "Rewrite operation should not add data") convertDeltaRemoveFileToIcebergDataFile( - f, tablePath, partitionSpec, logicalToPhysicalPartitionNames) + f, + tablePath, + partitionSpec, 
+ logicalToPhysicalPartitionNames, + postCommitSnapshot) }.toSet.asJava val dataFilesToAdd = adds.map { f => @@ -177,9 +183,8 @@ class IcebergConversionTransaction( tablePath, partitionSpec, logicalToPhysicalPartitionNames, - postCommitSnapshot.statsSchema, statsParser, - postCommitSnapshot.deltaLog + postCommitSnapshot ) }.toSet.asJava @@ -250,7 +255,7 @@ class IcebergConversionTransaction( } def getExpireSnapshotHelper(): ExpireSnapshotHelper = { - val ret = new ExpireSnapshotHelper(txn.expireSnapshots().cleanExpiredFiles(false)) + val ret = new ExpireSnapshotHelper(txn.expireSnapshots()) fileUpdates += ret ret } diff --git a/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergTransactionUtils.scala b/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergTransactionUtils.scala index 3f16a4c1b45..71d3f3396d4 100644 --- a/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergTransactionUtils.scala +++ b/iceberg/src/main/scala/org/apache/spark/sql/delta/icebergShaded/IcebergTransactionUtils.scala @@ -16,10 +16,13 @@ package org.apache.spark.sql.delta.icebergShaded +import java.nio.ByteBuffer +import java.time.Instant + import scala.collection.JavaConverters._ import scala.util.control.NonFatal -import org.apache.spark.sql.delta.{DeltaColumnMapping, DeltaConfig, DeltaConfigs, DeltaErrors, DeltaLog, DeltaRuntimeException} +import org.apache.spark.sql.delta.{DeltaColumnMapping, DeltaConfig, DeltaConfigs, DeltaErrors, DeltaLog, Snapshot} import org.apache.spark.sql.delta.DeltaConfigs.parseCalendarInterval import org.apache.spark.sql.delta.actions.{AddFile, FileAction, RemoveFile} import org.apache.spark.sql.delta.metering.DeltaLogging @@ -27,6 +30,7 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import shadedForDelta.org.apache.iceberg.{DataFile, DataFiles, FileFormat, PartitionSpec, Schema => IcebergSchema} import shadedForDelta.org.apache.iceberg.Metrics +import shadedForDelta.org.apache.iceberg.StructLike import shadedForDelta.org.apache.iceberg.TableProperties // scalastyle:off import.ordering.noEmptyLine @@ -35,7 +39,7 @@ import shadedForDelta.org.apache.iceberg.catalog.{Namespace, TableIdentifier => import shadedForDelta.org.apache.iceberg.hive.HiveCatalog import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier => SparkTableIdentifier} -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{BinaryType, BooleanType, ByteType, DataType, DateType, DecimalType, DoubleType, FloatType, IntegerType, LongType, ShortType, StringType, StructType, TimestampNTZType, TimestampType} import org.apache.spark.unsafe.types.CalendarInterval object IcebergTransactionUtils @@ -60,66 +64,6 @@ object IcebergTransactionUtils } } - def convertDeltaAddFileToIcebergDataFile( - add: AddFile, - tablePath: Path, - partitionSpec: PartitionSpec, - logicalToPhysicalPartitionNames: Map[String, String], - statsSchema: StructType, - statsParser: String => InternalRow, - deltaLog: DeltaLog): DataFile = { - if (add.deletionVector != null) { - throw new UnsupportedOperationException("No support yet for DVs") - } - - var dataFileBuilder = - convertFileAction(add, tablePath, partitionSpec, logicalToPhysicalPartitionNames) - // Attempt to attach the number of records metric regardless of whether the Delta stats - // string is null/empty or not because this metric is required by Iceberg. 
If the number - // of records is both unavailable here and unavailable in the Delta stats, Iceberg will - // throw an exception when building the data file. - .withRecordCount(add.numLogicalRecords.getOrElse(-1L)) - - if (add.stats != null && add.stats.nonEmpty) { - try { - val statsRow = statsParser(add.stats) - - val metricsConverter = IcebergStatsConverter(statsRow, statsSchema) - val metrics = new Metrics( - metricsConverter.numRecordsStat, // rowCount - null, // columnSizes - null, // valueCounts - metricsConverter.nullValueCountsStat.getOrElse(null).asJava, // nullValueCounts - null, // nanValueCounts - metricsConverter.lowerBoundsStat.getOrElse(null).asJava, // lowerBounds - metricsConverter.upperBoundsStat.getOrElse(null).asJava // upperBounds - ) - - dataFileBuilder = dataFileBuilder.withMetrics(metrics) - } catch { - case NonFatal(e) => - logWarning("Failed to convert Delta stats to Iceberg stats. Iceberg conversion will " + - "attempt to proceed without stats.", e) - } - } - - dataFileBuilder.build() - } - - /** - * Note that APIs like [[shadedForDelta.org.apache.iceberg.OverwriteFiles#deleteFile]] take - * a DataFile, and not a DeleteFile as you might have expected. - */ - def convertDeltaRemoveFileToIcebergDataFile( - remove: RemoveFile, - tablePath: Path, - partitionSpec: PartitionSpec, - logicalToPhysicalPartitionNames: Map[String, String]): DataFile = { - convertFileAction(remove, tablePath, partitionSpec, logicalToPhysicalPartitionNames) - .withRecordCount(remove.numLogicalRecords.getOrElse(0L)) - .build() - } - /** * We expose this as a public API since APIs like * [[shadedForDelta.org.apache.iceberg.DeleteFiles#deleteFile]] actually only need to take in @@ -168,48 +112,153 @@ object IcebergTransactionUtils partitionSchema.fields.map(f => f.name -> DeltaColumnMapping.getPhysicalName(f)).toMap } + class Row (val values: Array[Any]) extends StructLike { + override def size: Int = values.length + override def get[T <: Any](pos: Int, javaClass: Class[T]): T = javaClass.cast(values(pos)) + override def set[T <: Any](pos: Int, value: T): Unit = { + values(pos) = value + } + } + //////////////////// // Helper Methods // //////////////////// /** Visible for testing. */ + private[delta] def convertDeltaAddFileToIcebergDataFile( + add: AddFile, + tablePath: Path, + partitionSpec: PartitionSpec, + logicalToPhysicalPartitionNames: Map[String, String], + statsParser: String => InternalRow, + snapshot: Snapshot): DataFile = { + if (add.deletionVector != null) { + throw new UnsupportedOperationException("No support yet for DVs") + } + + var dataFileBuilder = + convertFileAction( + add, tablePath, partitionSpec, logicalToPhysicalPartitionNames, snapshot) + // Attempt to attach the number of records metric regardless of whether the Delta stats + // string is null/empty or not because this metric is required by Iceberg. If the number + // of records is both unavailable here and unavailable in the Delta stats, Iceberg will + // throw an exception when building the data file. + .withRecordCount(add.numLogicalRecords.getOrElse(-1L)) + + try { + if (add.stats != null && add.stats.nonEmpty) { + dataFileBuilder = dataFileBuilder.withMetrics( + getMetricsForIcebergDataFile(statsParser, add.stats, snapshot.statsSchema)) + } + } catch { + case NonFatal(e) => + logWarning("Failed to convert Delta stats to Iceberg stats. 
Iceberg conversion will " + + "attempt to proceed without stats.", e) + } + + dataFileBuilder.build() + } + + private[delta] def convertDeltaRemoveFileToIcebergDataFile( + remove: RemoveFile, + tablePath: Path, + partitionSpec: PartitionSpec, + logicalToPhysicalPartitionNames: Map[String, String], + snapshot: Snapshot): DataFile = { + convertFileAction( + remove, tablePath, partitionSpec, logicalToPhysicalPartitionNames, snapshot) + .withRecordCount(remove.numLogicalRecords.getOrElse(0L)) + .build() + } + private[delta] def convertFileAction( f: FileAction, tablePath: Path, partitionSpec: PartitionSpec, - logicalToPhysicalPartitionNames: Map[String, String]): DataFiles.Builder = { + logicalToPhysicalPartitionNames: Map[String, String], + snapshot: Snapshot): DataFiles.Builder = { val absPath = canonicalizeFilePath(f, tablePath) - + val schema = snapshot.schema var builder = DataFiles .builder(partitionSpec) .withPath(absPath) .withFileSizeInBytes(f.getFileSize) .withFormat(FileFormat.PARQUET) + val nameToDataTypes = schema.fields.map(f => f.name -> f.dataType).toMap if (partitionSpec.isPartitioned) { val ICEBERG_NULL_PARTITION_VALUE = "__HIVE_DEFAULT_PARTITION__" - val partitionPath = partitionSpec - .fields() - .asScala - .map(_.name) - .map { logicalPartCol => - // The Iceberg Schema and PartitionSpec all use the column logical names. - // Delta FileAction::partitionValues, however, uses physical names. - val physicalPartKey = logicalToPhysicalPartitionNames(logicalPartCol) - - // ICEBERG_NULL_PARTITION_VALUE is referred in Iceberg lib to mark NULL partition value - val partValue = Option(f.partitionValues(physicalPartKey)) - .getOrElse(ICEBERG_NULL_PARTITION_VALUE) - s"$logicalPartCol=$partValue" - } - .mkString("/") - - builder = builder.withPartitionPath(partitionPath) - } + val partitionPath = partitionSpec.fields() + val partitionVals = new Array[Any](partitionSpec.fields().size()) + for (i <- partitionVals.indices) { + val logicalPartCol = partitionPath.get(i).name() + val physicalPartKey = logicalToPhysicalPartitionNames(logicalPartCol) + // ICEBERG_NULL_PARTITION_VALUE is referred in Iceberg lib to mark NULL partition value + val partValue = Option(f.partitionValues(physicalPartKey)) + .getOrElse(ICEBERG_NULL_PARTITION_VALUE) + val partitionColumnDataType = nameToDataTypes(logicalPartCol) + val icebergPartitionValue = + stringToIcebergPartitionValue(partitionColumnDataType, partValue, snapshot.version) + partitionVals(i) = icebergPartitionValue + } + builder = builder.withPartition(new Row(partitionVals)) + } builder } + /** + * Follows deserialization as specified here + * https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Partition-Value-Serialization + */ + private def stringToIcebergPartitionValue( + elemType: DataType, + partitionVal: String, + version: Long): Any = { + if (partitionVal == null || partitionVal == "__HIVE_DEFAULT_PARTITION__") { + return null + } + + elemType match { + case _: StringType => partitionVal + case _: DateType => + java.sql.Date.valueOf(partitionVal).toLocalDate.toEpochDay.asInstanceOf[Int] + case _: IntegerType => partitionVal.toInt.asInstanceOf[Integer] + case _: ShortType => partitionVal.toInt.asInstanceOf[Integer] + case _: ByteType => partitionVal.toInt.asInstanceOf[Integer] + case _: LongType => partitionVal.toLong + case _: BooleanType => partitionVal.toBoolean + case _: FloatType => partitionVal.toFloat + case _: DoubleType => partitionVal.toDouble + case _: DecimalType => new java.math.BigDecimal(partitionVal) + case _: BinaryType 
=> ByteBuffer.wrap(partitionVal.getBytes("UTF-8")) + case _: TimestampNTZType => + java.sql.Timestamp.valueOf(partitionVal).getNanos/1000.asInstanceOf[Long] + case _: TimestampType => + Instant.parse(partitionVal).getNano/1000.asInstanceOf[Long] + case _ => + throw DeltaErrors.universalFormatConversionFailedException( + version, "iceberg", "Unexpected partition data type " + elemType) + } + } + + private def getMetricsForIcebergDataFile( + statsParser: String => InternalRow, + stats: String, + statsSchema: StructType): Metrics = { + val statsRow = statsParser(stats) + val metricsConverter = IcebergStatsConverter(statsRow, statsSchema) + new Metrics( + metricsConverter.numRecordsStat, // rowCount + null, // columnSizes + null, // valueCounts + metricsConverter.nullValueCountsStat.getOrElse(null).asJava, // nullValueCounts + null, // nanValueCounts + metricsConverter.lowerBoundsStat.getOrElse(null).asJava, // lowerBounds + metricsConverter.upperBoundsStat.getOrElse(null).asJava // upperBounds + ) + } + /** * Create an Iceberg HiveCatalog * @param conf: Hadoop Configuration diff --git a/iceberg/src/test/scala/org/apache/spark/sql/delta/ConvertToIcebergSuite.scala b/iceberg/src/test/scala/org/apache/spark/sql/delta/ConvertToIcebergSuite.scala index ad60dcc6061..1f645a30db4 100644 --- a/iceberg/src/test/scala/org/apache/spark/sql/delta/ConvertToIcebergSuite.scala +++ b/iceberg/src/test/scala/org/apache/spark/sql/delta/ConvertToIcebergSuite.scala @@ -25,11 +25,9 @@ import org.scalatest.time.SpanSugar._ import org.apache.spark.SparkContext import org.apache.spark.sql.{QueryTest, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier -import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType} +import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType, CatalogStorageFormat} import org.apache.spark.sql.delta.actions.Metadata -import org.apache.spark.sql.delta.icebergShaded.IcebergTransactionUtils -import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{IntegerType, StringType, StructType, StructField} import org.apache.spark.util.Utils /** @@ -111,7 +109,6 @@ class ConvertToIcebergSuite extends QueryTest with Eventually { runDeltaSql( s"""CREATE TABLE `${testTableName}` (col1 INT) USING DELTA |TBLPROPERTIES ( - | 'delta.enableIcebergCompatV2' = 'true', | 'delta.columnMapping.mode' = 'name', | 'delta.universalFormat.enabledFormats' = 'iceberg' |)""".stripMargin) @@ -126,7 +123,6 @@ class ConvertToIcebergSuite extends QueryTest with Eventually { withDefaultTablePropsInSQLConf { deltaSpark.range(10).write.format("delta") .option("path", testTablePath) - .option("delta.enableIcebergCompatV2", "true") .saveAsTable(testTableName) } } @@ -134,77 +130,14 @@ class ConvertToIcebergSuite extends QueryTest with Eventually { deltaSpark.range(10, 20, 1) .write.format("delta").mode("append") .option("path", testTablePath) - .option("delta.enableIcebergCompatV2", "true") .saveAsTable(testTableName) } verifyReadWithIceberg(testTableName, 0 to 19 map (Row(_))) } } - test("Expire Snapshots") { - if (hmsReady(PORT)) { - runDeltaSql( - s"""CREATE TABLE `${testTableName}` (col1 INT) USING DELTA - |TBLPROPERTIES ( - | 'delta.enableIcebergCompatV2' = 'true', - | 'delta.columnMapping.mode' = 'name', - | 'delta.universalFormat.enabledFormats' = 'iceberg' - |)""".stripMargin) - - val icebergTable = loadIcebergTable() - 
icebergTable.updateProperties().set("history.expire.max-snapshot-age-ms", "1").commit() - - for (i <- 0 to 7) { - runDeltaSql(s"INSERT INTO ${testTableName} VALUES (${i})", - DeltaSQLConf.DELTA_UNIFORM_ICEBERG_SYNC_CONVERT_ENABLED.key -> "true") - } - - // Sleep past snapshot retention duration - Thread.sleep(5) - withIcebergSparkSession { icebergSpark => { - icebergSpark.sql(s"REFRESH TABLE $testTableName") - val manifestListsBeforeExpiration = icebergSpark - .sql(s"SELECT * FROM default.${testTableName}.snapshots") - .select("manifest_list") - .collect() - - assert(manifestListsBeforeExpiration.length == 8) - - // Trigger snapshot expiration - runDeltaSql(s"OPTIMIZE ${testTableName}") - icebergSpark.sql(s"REFRESH TABLE $testTableName") - - val manifestListsAfterExpiration = icebergSpark - .sql(s"SELECT * FROM default.${testTableName}.snapshots") - .select("manifest_list") - .collect() - - assert(manifestListsAfterExpiration.length == 1) - // Manifests from earlier snapshots should not be removed - manifestListsBeforeExpiration.toStream.foreach( - manifestList => assert( - icebergTable.io().newInputFile(manifestList.get(0).asInstanceOf[String]).exists())) - }} - } - } - - private def loadIcebergTable(): shadedForDelta.org.apache.iceberg.Table = { - withDeltaSparkSession { deltaSpark => { - val log = DeltaLog.forTable(deltaSpark, testTablePath) - val hiveCatalog = IcebergTransactionUtils.createHiveCatalog( - log.newDeltaHadoopConf() - ) - val table = hiveCatalog.loadTable( - shadedForDelta.org.apache.iceberg.catalog.TableIdentifier - .of("default", testTableName) - ) - table - }} - } - - def runDeltaSql(sqlStr: String, conf: (String, String)*): Unit = { + def runDeltaSql(sqlStr: String): Unit = { withDeltaSparkSession { deltaSpark => - conf.foreach(c => deltaSpark.conf.set(c._1, c._2)) deltaSpark.sql(sqlStr) } } diff --git a/icebergShaded/iceberg_src_patches/0002-iceberg-core-must-not-delete-any-delta-data-files.patch b/icebergShaded/iceberg_src_patches/0002-iceberg-core-must-not-delete-any-delta-data-files.patch new file mode 100644 index 00000000000..a181f065040 --- /dev/null +++ b/icebergShaded/iceberg_src_patches/0002-iceberg-core-must-not-delete-any-delta-data-files.patch @@ -0,0 +1,177 @@ +iceberg core must NOT delete any delta data files + +--- + .../iceberg/IncrementalFileCleanup.java | 8 +-- + .../apache/iceberg/ReachableFileCleanup.java | 5 +- + .../apache/iceberg/TestRemoveSnapshots.java | 57 +++++++++++-------- + 3 files changed, 40 insertions(+), 30 deletions(-) + +diff --git a/core/src/main/java/org/apache/iceberg/IncrementalFileCleanup.java b/connector/iceberg-core/core/src/main/java/org/apache/iceberg/IncrementalFileCleanup.java +index d894dcbf36d..ead7ea6b076 100644 +--- a/core/src/main/java/org/apache/iceberg/IncrementalFileCleanup.java ++++ b/core/src/main/java/org/apache/iceberg/IncrementalFileCleanup.java +@@ -256,10 +256,10 @@ class IncrementalFileCleanup extends FileCleanupStrategy { + } + }); + +- Set filesToDelete = +- findFilesToDelete(manifestsToScan, manifestsToRevert, validIds, afterExpiration); +- +- deleteFiles(filesToDelete, "data"); ++ // iceberg core MUST NOT delete any data files which are managed by delta ++ // Set filesToDelete = ++ // findFilesToDelete(manifestsToScan, manifestsToRevert, validIds, afterExpiration); ++ // deleteFiles(filesToDelete, "data"); + LOG.warn("Manifests to delete: {}", Joiner.on(", ").join(manifestsToDelete)); + LOG.warn("Manifests Lists to delete: {}", Joiner.on(", ").join(manifestListsToDelete)); + 
deleteFiles(manifestsToDelete, "manifest"); +diff --git a/core/src/main/java/org/apache/iceberg/ReachableFileCleanup.java b/connector/iceberg-core/core/src/main/java/org/apache/iceberg/ReachableFileCleanup.java +index ccbee78e27b..da888a63b3d 100644 +--- a/core/src/main/java/org/apache/iceberg/ReachableFileCleanup.java ++++ b/core/src/main/java/org/apache/iceberg/ReachableFileCleanup.java +@@ -72,8 +72,9 @@ class ReachableFileCleanup extends FileCleanupStrategy { + snapshotsAfterExpiration, deletionCandidates, currentManifests::add); + + if (!manifestsToDelete.isEmpty()) { +- Set dataFilesToDelete = findFilesToDelete(manifestsToDelete, currentManifests); +- deleteFiles(dataFilesToDelete, "data"); ++ // iceberg core MUST NOT delete any data files which are managed by delta ++ // Set dataFilesToDelete = findFilesToDelete(manifestsToDelete, currentManifests); ++ // deleteFiles(dataFilesToDelete, "data"); + Set manifestPathsToDelete = + manifestsToDelete.stream().map(ManifestFile::path).collect(Collectors.toSet()); + deleteFiles(manifestPathsToDelete, "manifest"); +diff --git a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java b/connector/iceberg-core/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java +index 53e5af520d9..95fa8e41de1 100644 +--- a/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java ++++ b/core/src/test/java/org/apache/iceberg/TestRemoveSnapshots.java +@@ -147,8 +147,9 @@ public class TestRemoveSnapshots extends TableTestBase { + secondSnapshot + .allManifests(table.io()) + .get(0) +- .path(), // manifest contained only deletes, was dropped +- FILE_A.path()), // deleted ++ .path() // manifest contained only deletes, was dropped ++ // FILE_A.path() should NOT delete data files ++ ), // deleted + deletedFiles); + } + +@@ -209,8 +210,9 @@ public class TestRemoveSnapshots extends TableTestBase { + .allManifests(table.io()) + .get(0) + .path(), // manifest was rewritten for delete +- secondSnapshot.manifestListLocation(), // snapshot expired +- FILE_A.path()), // deleted ++ secondSnapshot.manifestListLocation() // snapshot expired ++ // FILE_A.path() should not delete any data files ++ ), + deletedFiles); + } + +@@ -309,8 +311,9 @@ public class TestRemoveSnapshots extends TableTestBase { + Sets.newHashSet( + secondSnapshot.manifestListLocation(), // snapshot expired + Iterables.getOnlyElement(secondSnapshotManifests) +- .path(), // manifest is no longer referenced +- FILE_B.path()), // added, but rolled back ++ .path() // manifest is no longer referenced ++ // FILE_B.path() should not delete any data files ++ ), + deletedFiles); + } + +@@ -686,7 +689,8 @@ public class TestRemoveSnapshots extends TableTestBase { + + removeSnapshots(table).expireOlderThan(t3).deleteWith(deletedFiles::add).commit(); + +- Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); ++ Assert.assertTrue("FILE_A should NOT be deleted", ++ !deletedFiles.contains(FILE_A.path().toString())); + } + + @Test +@@ -712,7 +716,8 @@ public class TestRemoveSnapshots extends TableTestBase { + + removeSnapshots(table).expireOlderThan(t3).deleteWith(deletedFiles::add).commit(); + +- Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); ++ Assert.assertTrue("FILE_A should NOT be deleted", ++ !deletedFiles.contains(FILE_A.path().toString())); + } + + @Test +@@ -749,8 +754,10 @@ public class TestRemoveSnapshots extends TableTestBase { + + 
removeSnapshots(table).expireOlderThan(t4).deleteWith(deletedFiles::add).commit(); + +- Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); +- Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); ++ Assert.assertTrue("FILE_A should NOT be deleted", ++ !deletedFiles.contains(FILE_A.path().toString())); ++ Assert.assertTrue("FILE_B should NOT be deleted", ++ !deletedFiles.contains(FILE_B.path().toString())); + } + + @Test +@@ -824,9 +831,11 @@ public class TestRemoveSnapshots extends TableTestBase { + Sets.newHashSet( + "remove-snapshot-0", "remove-snapshot-1", "remove-snapshot-2", "remove-snapshot-3")); + +- Assert.assertTrue("FILE_A should be deleted", deletedFiles.contains(FILE_A.path().toString())); +- Assert.assertTrue("FILE_B should be deleted", deletedFiles.contains(FILE_B.path().toString())); +- Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); ++ Assert.assertTrue("FILE_A should NOT be deleted", ++ !deletedFiles.contains(FILE_A.path().toString())); ++ Assert.assertTrue("FILE_B should NOT be deleted", ++ !deletedFiles.contains(FILE_B.path().toString())); ++ // Assert.assertTrue("Thread should be created in provided pool", planThreadsIndex.get() > 0); + } + + @Test +@@ -885,13 +894,13 @@ public class TestRemoveSnapshots extends TableTestBase { + Set expectedDeletes = Sets.newHashSet(); + expectedDeletes.add(snapshotA.manifestListLocation()); + +- // Files should be deleted of dangling staged snapshot +- snapshotB +- .addedDataFiles(table.io()) +- .forEach( +- i -> { +- expectedDeletes.add(i.path().toString()); +- }); ++ // Files should NOT be deleted of dangling staged snapshot ++ // snapshotB ++ // .addedDataFiles(table.io()) ++ // .forEach( ++ // i -> { ++ // expectedDeletes.add(i.path().toString()); ++ // }); + + // ManifestList should be deleted too + expectedDeletes.add(snapshotB.manifestListLocation()); +@@ -1144,10 +1153,10 @@ public class TestRemoveSnapshots extends TableTestBase { + removeSnapshots(table).expireOlderThan(fourthSnapshotTs).deleteWith(deletedFiles::add).commit(); + + Assert.assertEquals( +- "Should remove old delete files and delete file manifests", ++ "Should only delete file manifests", + ImmutableSet.builder() +- .add(FILE_A.path()) +- .add(FILE_A_DELETES.path()) ++ // .add(FILE_A.path()) ++ // .add(FILE_A_DELETES.path()) + .add(firstSnapshot.manifestListLocation()) + .add(secondSnapshot.manifestListLocation()) + .add(thirdSnapshot.manifestListLocation()) +@@ -1501,7 +1510,7 @@ public class TestRemoveSnapshots extends TableTestBase { + expectedDeletes.addAll(manifestPaths(appendA, table.io())); + expectedDeletes.add(branchDelete.manifestListLocation()); + expectedDeletes.addAll(manifestPaths(branchDelete, table.io())); +- expectedDeletes.add(FILE_A.path().toString()); ++ // expectedDeletes.add(FILE_A.path().toString()); + + Assert.assertEquals(2, Iterables.size(table.snapshots())); + Assert.assertEquals(expectedDeletes, deletedFiles); +-- +2.39.2 (Apple Git-143) diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/data/ColumnarBatch.java b/kernel/kernel-api/src/main/java/io/delta/kernel/data/ColumnarBatch.java index f954b2e652b..fb9214f8588 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/data/ColumnarBatch.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/data/ColumnarBatch.java @@ -86,17 +86,6 @@ default ColumnarBatch withNewSchema(StructType newSchema) { throw new UnsupportedOperationException("Not 
yet implemented"); } - /** - * Return a slice of the current batch. - * - * @param start Starting record index to include in the returned columnar batch - * @param end Ending record index (exclusive) to include in the returned columnar batch - * @return a columnar batch containing the records between [start, end) - */ - default ColumnarBatch slice(int start, int end) { - throw new UnsupportedOperationException("Not yet implemented!"); - } - /** @return iterator of {@link Row}s in this batch */ default CloseableIterator getRows() { final ColumnarBatch batch = this; diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/engine/FileSystemClient.java b/kernel/kernel-api/src/main/java/io/delta/kernel/engine/FileSystemClient.java index 8814a86602c..769a29ee48f 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/engine/FileSystemClient.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/engine/FileSystemClient.java @@ -76,4 +76,13 @@ CloseableIterator readFiles(CloseableIterator earliestAvailableVersion) { + String message = + String.format( + "%s: Requested table changes beginning with startVersion=%s but no log file found for " + + "version %s.", + tablePath, startVersionRequested, startVersionRequested); + if (earliestAvailableVersion.isPresent()) { + message = + message + + String.format(" Earliest available version is %s", earliestAvailableVersion.get()); + } + return new KernelException(message); + } + + public static KernelException endVersionNotFound( + String tablePath, long endVersionRequested, Optional latestAvailableVersion) { + String message = + String.format( + "%s: Requested table changes ending with endVersion=%d but no log file found for " + + "version %d%s", + tablePath, + endVersionRequested, + endVersionRequested, + latestAvailableVersion + .map(version -> String.format(". Latest available version is %d", version)) + .orElse("")); + return new KernelException(message); + } + + public static KernelException invalidVersionRange(long startVersion, long endVersion) { + String message = + String.format( + "Invalid version range: requested table changes for version range [%s, %s]. 
" + + "Requires startVersion >= 0 and endVersion >= startVersion.", + startVersion, endVersion); + return new KernelException(message); + } + /* ------------------------ PROTOCOL EXCEPTIONS ----------------------------- */ public static KernelException unsupportedReaderProtocol( @@ -99,12 +150,13 @@ public static KernelException unsupportedReaderProtocol( return new KernelException(message); } - public static KernelException unsupportedReaderFeature(String tablePath, String readerFeature) { + public static KernelException unsupportedReaderFeature( + String tablePath, Set unsupportedFeatures) { String message = String.format( - "Unsupported Delta reader feature: table `%s` requires reader table feature \"%s\" " + "Unsupported Delta reader features: table `%s` requires reader table features [%s] " + "which is unsupported by this version of Delta Kernel.", - tablePath, readerFeature); + tablePath, String.join(", ", unsupportedFeatures)); return new KernelException(message); } diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaHistoryManager.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaHistoryManager.java index 099e56d2b6a..5449d3c0220 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaHistoryManager.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaHistoryManager.java @@ -19,6 +19,7 @@ import static io.delta.kernel.internal.fs.Path.getName; import io.delta.kernel.engine.Engine; +import io.delta.kernel.exceptions.KernelException; import io.delta.kernel.exceptions.TableNotFoundException; import io.delta.kernel.internal.checkpoints.CheckpointInstance; import io.delta.kernel.internal.fs.Path; @@ -40,36 +41,58 @@ private DeltaHistoryManager() {} private static final Logger logger = LoggerFactory.getLogger(DeltaHistoryManager.class); /** - * Returns the latest recreatable commit that happened at or before {@code timestamp}. If the - * provided timestamp is after the timestamp of the latest version of the table throws an - * exception. If the provided timestamp is before the timestamp of the earliest version of the - * table throws an exception. + * Returns the latest commit that happened at or before {@code timestamp}. + * + *

If the timestamp is outside the range of [earliestCommit, latestCommit] then use parameters + * {@code canReturnLastCommit} and {@code canReturnEarliestCommit} to control whether an exception + * is thrown or the corresponding earliest/latest commit is returned. * * @param engine instance of {@link Engine} to use * @param logPath the _delta_log path of the table * @param timestamp the timestamp find the version for in milliseconds since the unix epoch - * @return the active recreatable commit version at the provided timestamp + * @param mustBeRecreatable whether the state at the returned commit should be recreatable + * @param canReturnLastCommit whether we can return the latest version of the table if the + * provided timestamp is after the latest commit + * @param canReturnEarliestCommit whether we can return the earliest version of the table if the + * provided timestamp is before the earliest commit + * @throws KernelException if the provided timestamp is before the earliest commit and + * canReturnEarliestCommit is false + * @throws KernelException if the provided timestamp is after the latest commit and + * canReturnLastCommit is false * @throws TableNotFoundException when there is no Delta table at the given path */ - public static long getActiveCommitAtTimestamp(Engine engine, Path logPath, long timestamp) + public static Commit getActiveCommitAtTimestamp( + Engine engine, + Path logPath, + long timestamp, + boolean mustBeRecreatable, + boolean canReturnLastCommit, + boolean canReturnEarliestCommit) throws TableNotFoundException { - long earliestRecreatableCommit = getEarliestRecreatableCommit(engine, logPath); + long earliestVersion = + (mustBeRecreatable) + ? getEarliestRecreatableCommit(engine, logPath) + : getEarliestDeltaFile(engine, logPath); // Search for the commit - List commits = getCommits(engine, logPath, earliestRecreatableCommit); + List commits = getCommits(engine, logPath, earliestVersion); Commit commit = lastCommitBeforeOrAtTimestamp(commits, timestamp) - .orElseThrow( - () -> - DeltaErrors.timestampBeforeFirstAvailableCommit( - logPath.getParent().toString(), /* use dataPath */ - timestamp, - commits.get(0).timestamp, - commits.get(0).version)); + .orElse(commits.get(0)); // This is only returned if canReturnEarliestCommit (see below) + // If timestamp is before the earliest commit + if (commit.timestamp > timestamp && !canReturnEarliestCommit) { + throw DeltaErrors.timestampBeforeFirstAvailableCommit( + logPath.getParent().toString(), /* use dataPath */ + timestamp, + commits.get(0).timestamp, + commits.get(0).version); + } // If timestamp is after the last commit of the table - if (commit == commits.get(commits.size() - 1) && commit.timestamp < timestamp) { + if (commit == commits.get(commits.size() - 1) + && commit.timestamp < timestamp + && !canReturnLastCommit) { throw DeltaErrors.timestampAfterLatestCommit( logPath.getParent().toString(), /* use dataPath */ timestamp, @@ -77,7 +100,7 @@ public static long getActiveCommitAtTimestamp(Engine engine, Path logPath, long commit.version); } - return commit.version; + return commit; } /** @@ -161,6 +184,30 @@ public static long getEarliestRecreatableCommit(Engine engine, Path logPath) } } + /** + * Get the earliest commit available for this table. Note that this version isn't guaranteed to + * exist when performing an action as a concurrent operation can delete the file during cleanup. + * This value must be used as a lower bound. 
+ */ + public static long getEarliestDeltaFile(Engine engine, Path logPath) + throws TableNotFoundException { + + try (CloseableIterator files = + listFrom(engine, logPath, 0).filter(fs -> FileNames.isCommitFile(getName(fs.getPath())))) { + + if (files.hasNext()) { + return FileNames.deltaVersion(files.next().getPath()); + } else { + // listFrom already throws an error if the directory is truly empty, thus this must + // be because no files are delta files + throw new RuntimeException( + String.format("No delta files found in the directory: %s", logPath)); + } + } catch (IOException e) { + throw new RuntimeException("Could not close iterator", e); + } + } + /** * Returns an iterator containing a list of files found in the _delta_log directory starting with * {@code startVersion}. Throws a {@link TableNotFoundException} if the directory doesn't exist or @@ -242,16 +289,24 @@ private static Optional lastCommitBeforeOrAtTimestamp( return Optional.ofNullable((i < 0) ? null : commits.get(i)); } - private static class Commit { + public static class Commit { - final long version; - final long timestamp; + private final long version; + private final long timestamp; Commit(long version, long timestamp) { this.version = version; this.timestamp = timestamp; } + public long getVersion() { + return version; + } + + public long getTimestamp() { + return timestamp; + } + @Override public boolean equals(Object o) { if (this == o) { diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaLogActionUtils.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaLogActionUtils.java new file mode 100644 index 00000000000..ce47da8327e --- /dev/null +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/DeltaLogActionUtils.java @@ -0,0 +1,303 @@ +/* + * Copyright (2024) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.delta.kernel.internal; + +import static io.delta.kernel.internal.DeltaErrors.*; +import static io.delta.kernel.internal.fs.Path.getName; + +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.ColumnarBatch; +import io.delta.kernel.engine.Engine; +import io.delta.kernel.exceptions.InvalidTableException; +import io.delta.kernel.exceptions.KernelException; +import io.delta.kernel.exceptions.TableNotFoundException; +import io.delta.kernel.expressions.ExpressionEvaluator; +import io.delta.kernel.expressions.Literal; +import io.delta.kernel.internal.actions.*; +import io.delta.kernel.internal.fs.Path; +import io.delta.kernel.internal.replay.ActionsIterator; +import io.delta.kernel.internal.util.FileNames; +import io.delta.kernel.types.*; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.FileStatus; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.*; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Exposes APIs to read the raw actions within the *commit files* of the _delta_log. This is used + * for CDF, streaming, and more. + */ +public class DeltaLogActionUtils { + + private DeltaLogActionUtils() {} + + private static final Logger logger = LoggerFactory.getLogger(DeltaLogActionUtils.class); + + ///////////////// + // Public APIs // + ///////////////// + + /** + * Represents a Delta action. This is used to request which actions to read from the commit files + * in {@link TableImpl#getChanges(Engine, long, long, Set)}. + * + *

See the Delta protocol for more details + * https://github.com/delta-io/delta/blob/master/PROTOCOL.md#actions + */ + public enum DeltaAction { + REMOVE("remove", RemoveFile.FULL_SCHEMA), + ADD("add", AddFile.FULL_SCHEMA), + METADATA("metaData", Metadata.FULL_SCHEMA), + PROTOCOL("protocol", Protocol.FULL_SCHEMA), + COMMITINFO("commitInfo", CommitInfo.FULL_SCHEMA), + CDC("cdc", AddCDCFile.FULL_SCHEMA); + + public final String colName; + public final StructType schema; + + DeltaAction(String colName, StructType schema) { + this.colName = colName; + this.schema = schema; + } + } + + /** + * For a table get the list of commit log files for the provided version range. + * + * @param tablePath path for the given table + * @param startVersion start version of the range (inclusive) + * @param endVersion end version of the range (inclusive) + * @return the list of commit files in increasing order between startVersion and endVersion + * @throws TableNotFoundException if the table does not exist or if it is not a delta table + * @throws KernelException if a commit file does not exist for any of the versions in the provided + * range + * @throws KernelException if provided an invalid version range + */ + public static List getCommitFilesForVersionRange( + Engine engine, Path tablePath, long startVersion, long endVersion) { + + // Validate arguments + if (startVersion < 0 || endVersion < startVersion) { + throw invalidVersionRange(startVersion, endVersion); + } + + // Get any available commit files within the version range + List commitFiles = listCommitFiles(engine, tablePath, startVersion, endVersion); + + // There are no available commit files within the version range. + // This can be due to (1) an empty directory, (2) no valid delta files in the directory, + // (3) only delta files less than startVersion prefix (4) only delta files after endVersion + if (commitFiles.isEmpty()) { + throw noCommitFilesFoundForVersionRange(tablePath.toString(), startVersion, endVersion); + } + + // Verify commit files found + // (check that they are continuous and start with startVersion and end with endVersion) + verifyDeltaVersions(commitFiles, startVersion, endVersion, tablePath); + + return commitFiles; + } + + /** + * Read the given commitFiles and return the contents as an iterator of batches. Also adds two + * columns "version" and "timestamp" that store the commit version and timestamp for the commit + * file that the batch was read from. 
The "version" and "timestamp" columns are the first and + * second columns in the returned schema respectively and both of {@link LongType} + * + * @param commitFiles list of delta commit files to read + * @param readSchema JSON schema to read + * @return an iterator over the contents of the files in the same order as the provided files + */ + public static CloseableIterator readCommitFiles( + Engine engine, List commitFiles, StructType readSchema) { + + return new ActionsIterator(engine, commitFiles, readSchema, Optional.empty()) + .map( + actionWrapper -> { + long timestamp = + actionWrapper + .getTimestamp() + .orElseThrow( + () -> + new RuntimeException("Commit files should always have a timestamp")); + ExpressionEvaluator commitVersionGenerator = + wrapEngineException( + () -> + engine + .getExpressionHandler() + .getEvaluator( + readSchema, + Literal.ofLong(actionWrapper.getVersion()), + LongType.LONG), + "Get the expression evaluator for the commit version"); + ExpressionEvaluator commitTimestampGenerator = + wrapEngineException( + () -> + engine + .getExpressionHandler() + .getEvaluator(readSchema, Literal.ofLong(timestamp), LongType.LONG), + "Get the expression evaluator for the commit timestamp"); + ColumnVector commitVersionVector = + wrapEngineException( + () -> commitVersionGenerator.eval(actionWrapper.getColumnarBatch()), + "Evaluating the commit version expression"); + ColumnVector commitTimestampVector = + wrapEngineException( + () -> commitTimestampGenerator.eval(actionWrapper.getColumnarBatch()), + "Evaluating the commit timestamp expression"); + + return actionWrapper + .getColumnarBatch() + .withNewColumn(0, COMMIT_VERSION_STRUCT_FIELD, commitVersionVector) + .withNewColumn(1, COMMIT_TIMESTAMP_STRUCT_FIELD, commitTimestampVector); + }); + } + + ////////////////////// + // Private helpers // + ///////////////////// + + /** Column name storing the commit version for a given file action */ + private static final String COMMIT_VERSION_COL_NAME = "version"; + + private static final DataType COMMIT_VERSION_DATA_TYPE = LongType.LONG; + private static final StructField COMMIT_VERSION_STRUCT_FIELD = + new StructField(COMMIT_VERSION_COL_NAME, COMMIT_VERSION_DATA_TYPE, false /* nullable */); + + /** Column name storing the commit timestamp for a given file action */ + private static final String COMMIT_TIMESTAMP_COL_NAME = "timestamp"; + + private static final DataType COMMIT_TIMESTAMP_DATA_TYPE = LongType.LONG; + private static final StructField COMMIT_TIMESTAMP_STRUCT_FIELD = + new StructField(COMMIT_TIMESTAMP_COL_NAME, COMMIT_TIMESTAMP_DATA_TYPE, false /* nullable */); + + /** + * Given a list of delta versions, verifies that they are (1) contiguous (2) versions starts with + * expectedStartVersion and (3) end with expectedEndVersion. Throws an exception if any of these + * are not true. + * + *

Public to expose for testing only. + * + * @param commitFiles in sorted increasing order according to the commit version + */ + static void verifyDeltaVersions( + List commitFiles, + long expectedStartVersion, + long expectedEndVersion, + Path tablePath) { + + List commitVersions = + commitFiles.stream() + .map(fs -> FileNames.deltaVersion(new Path(fs.getPath()))) + .collect(Collectors.toList()); + + for (int i = 1; i < commitVersions.size(); i++) { + if (commitVersions.get(i) != commitVersions.get(i - 1) + 1) { + throw new InvalidTableException( + tablePath.toString(), + String.format( + "Missing delta files: versions are not contiguous: (%s)", commitVersions)); + } + } + + if (commitVersions.isEmpty() || !Objects.equals(commitVersions.get(0), expectedStartVersion)) { + throw startVersionNotFound( + tablePath.toString(), + expectedStartVersion, + commitVersions.isEmpty() ? Optional.empty() : Optional.of(commitVersions.get(0))); + } + + if (commitVersions.isEmpty() + || !Objects.equals(commitVersions.get(commitVersions.size() - 1), expectedEndVersion)) { + throw endVersionNotFound( + tablePath.toString(), + expectedEndVersion, + commitVersions.isEmpty() + ? Optional.empty() + : Optional.of(commitVersions.get(commitVersions.size() - 1))); + } + } + + /** + * Gets an iterator of files in the _delta_log directory starting with the startVersion. + * + * @throws TableNotFoundException if the directory does not exist + */ + private static CloseableIterator listLogDir( + Engine engine, Path tablePath, long startVersion) { + final Path logPath = new Path(tablePath, "_delta_log"); + try { + return wrapEngineExceptionThrowsIO( + () -> + engine.getFileSystemClient().listFrom(FileNames.listingPrefix(logPath, startVersion)), + "Listing from %s", + FileNames.listingPrefix(logPath, startVersion)); + } catch (FileNotFoundException e) { + throw new TableNotFoundException(tablePath.toString()); + } catch (IOException io) { + throw new UncheckedIOException("Failed to list the files in delta log", io); + } + } + + /** + * Returns a list of delta commit files found in the _delta_log directory between startVersion and + * endVersion (both inclusive). 
+ * + * @throws TableNotFoundException if the _delta_log directory does not exist + */ + private static List listCommitFiles( + Engine engine, Path tablePath, long startVersion, long endVersion) { + + // TODO update to support coordinated commits; suggested to load the Snapshot at endVersion + // and get the backfilled/unbackfilled commits from the LogSegment to combine with commit files + // listed from [startVersion, LogSegment.checkpointVersion] + logger.info( + "{}: Listing the commit files for versions [{}, {}]", tablePath, startVersion, endVersion); + long startTimeMillis = System.currentTimeMillis(); + final List output = new ArrayList<>(); + try (CloseableIterator fsIter = listLogDir(engine, tablePath, startVersion)) { + while (fsIter.hasNext()) { + FileStatus fs = fsIter.next(); + if (!FileNames.isCommitFile(getName(fs.getPath()))) { + logger.debug("Ignoring non-commit file {}", fs.getPath()); + continue; + } + if (FileNames.getFileVersion(new Path(fs.getPath())) > endVersion) { + logger.debug( + "Stopping listing found file {} with version > {}=endVersion", + fs.getPath(), + endVersion); + break; + } + output.add(fs); + } + } catch (IOException e) { + throw new UncheckedIOException("Unable to close resource", e); + } + logger.info( + "{}: Took {} ms to list the commit files for versions [{}, {}]", + tablePath, + System.currentTimeMillis() - startTimeMillis, + startVersion, + endVersion); + return output; + } +} diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableConfig.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableConfig.java index e68c84dff91..acebf4d521c 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableConfig.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableConfig.java @@ -78,6 +78,30 @@ public class TableConfig { "needs to be a positive integer.", true); + /** + * The shortest duration we have to keep delta/checkpoint files around before deleting them. We + * can only delete delta files that are before a checkpoint. + */ + public static final TableConfig LOG_RETENTION = + new TableConfig<>( + "delta.logRetentionDuration", + "interval 30 days", + (engineOpt, v) -> IntervalParserUtils.safeParseIntervalAsMillis(v), + value -> true, + "needs to be provided as a calendar interval such as '2 weeks'. Months " + + "and years are not accepted. You may specify '365 days' for a year instead.", + true /* editable */); + + /** Whether to clean up expired checkpoints and delta logs. */ + public static final TableConfig EXPIRED_LOG_CLEANUP_ENABLED = + new TableConfig<>( + "delta.enableExpiredLogCleanup", + "true", + (engineOpt, v) -> Boolean.valueOf(v), + value -> true, + "needs to be a boolean.", + true /* editable */); + /** * This table property is used to track the enablement of the {@code inCommitTimestamps}. 
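As a minimal sketch of how the two retention settings added above are meant to be consumed (it mirrors the SnapshotManager change later in this patch; `engine` and `metadata` are assumed to be in scope, not part of the patch itself):

    // Sketch only: read the cleanup settings from the table metadata.
    if (TableConfig.EXPIRED_LOG_CLEANUP_ENABLED.fromMetadata(engine, metadata)) {
      long retentionMillis = TableConfig.LOG_RETENTION.fromMetadata(engine, metadata);
      // log files older than (currentTimeMillis - retentionMillis) become cleanup candidates
    }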
* diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableFeatures.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableFeatures.java index 7b595d9fbc9..4aa66ec2fff 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableFeatures.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableFeatures.java @@ -24,10 +24,7 @@ import io.delta.kernel.internal.util.ColumnMapping; import io.delta.kernel.internal.util.Tuple2; import io.delta.kernel.types.StructType; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; /** Contains utility methods related to the Delta table feature support in protocol. */ @@ -43,34 +40,40 @@ public class TableFeatures { } }); + private static final Set SUPPORTED_READER_FEATURES = + Collections.unmodifiableSet( + new HashSet() { + { + add("columnMapping"); + add("deletionVectors"); + add("timestampNtz"); + add("vacuumProtocolCheck"); + add("variantType-preview"); + add("v2Checkpoint"); + } + }); + //////////////////// // Helper Methods // //////////////////// public static void validateReadSupportedTable( - Protocol protocol, Metadata metadata, String tablePath) { + Protocol protocol, String tablePath, Optional metadata) { switch (protocol.getMinReaderVersion()) { case 1: break; case 2: - ColumnMapping.throwOnUnsupportedColumnMappingMode(metadata); + metadata.ifPresent(ColumnMapping::throwOnUnsupportedColumnMappingMode); break; case 3: List readerFeatures = protocol.getReaderFeatures(); - for (String readerFeature : readerFeatures) { - switch (readerFeature) { - case "columnMapping": - ColumnMapping.throwOnUnsupportedColumnMappingMode(metadata); - break; - case "deletionVectors": // fall through - case "timestampNtz": // fall through - case "vacuumProtocolCheck": // fall through - case "variantType-preview": // fall through - case "v2Checkpoint": - break; - default: - throw DeltaErrors.unsupportedReaderFeature(tablePath, readerFeature); - } + if (!SUPPORTED_READER_FEATURES.containsAll(readerFeatures)) { + Set unsupportedFeatures = new HashSet<>(readerFeatures); + unsupportedFeatures.removeAll(SUPPORTED_READER_FEATURES); + throw DeltaErrors.unsupportedReaderFeature(tablePath, unsupportedFeatures); + } + if (readerFeatures.contains("columnMapping")) { + metadata.ifPresent(ColumnMapping::throwOnUnsupportedColumnMappingMode); } break; default: diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableImpl.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableImpl.java index f2ccbb4f15b..d71ef4c30be 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableImpl.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TableImpl.java @@ -18,16 +18,34 @@ import static io.delta.kernel.internal.DeltaErrors.wrapEngineExceptionThrowsIO; import io.delta.kernel.*; +import io.delta.kernel.data.ColumnVector; +import io.delta.kernel.data.ColumnarBatch; import io.delta.kernel.engine.Engine; import io.delta.kernel.exceptions.CheckpointAlreadyExistsException; +import io.delta.kernel.exceptions.KernelException; import io.delta.kernel.exceptions.TableNotFoundException; +import io.delta.kernel.internal.actions.Protocol; import io.delta.kernel.internal.fs.Path; import io.delta.kernel.internal.snapshot.SnapshotManager; import io.delta.kernel.internal.util.Clock; +import io.delta.kernel.types.StructField; +import io.delta.kernel.types.StructType; +import 
io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.FileStatus; import java.io.IOException; import java.io.UncheckedIOException; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class TableImpl implements Table { + + private static final Logger logger = LoggerFactory.getLogger(TableImpl.class); + public static Table forPath(Engine engine, String path) { return forPath(engine, path, System::currentTimeMillis); } @@ -90,7 +108,7 @@ public Snapshot getSnapshotAsOfTimestamp(Engine engine, long millisSinceEpochUTC @Override public void checkpoint(Engine engine, long version) throws TableNotFoundException, CheckpointAlreadyExistsException, IOException { - snapshotManager.checkpoint(engine, version); + snapshotManager.checkpoint(engine, clock, version); } @Override @@ -103,6 +121,66 @@ public Clock getClock() { return clock; } + /** + * Returns delta actions for each version between startVersion and endVersion. Only returns the + * actions requested in actionSet. + * + *

For the returned columnar batches: + * + *

    + *
  • Each row within the same batch is guaranteed to have the same commit version + *
  • The batch commit versions are monotonically increasing + *
  • The top-level columns include "version", "timestamp", and the actions requested in + * actionSet. "version" and "timestamp" are the first and second columns in the schema, + * respectively. The remaining columns are based on the actions requested and each have the + * schema found in {@code DeltaAction.schema}. + *
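A usage sketch of this API may help (illustrative only, not part of the patch; the table path and version range are made up, and `engine` is assumed to be a configured Engine):

    // Hypothetical usage: read add/remove actions for versions [5, 10] and consume the
    // injected "version"/"timestamp" columns. Imports and error handling omitted for brevity.
    TableImpl table = (TableImpl) Table.forPath(engine, "/tmp/example-table");
    Set<DeltaLogActionUtils.DeltaAction> actions =
        new HashSet<>(
            Arrays.asList(
                DeltaLogActionUtils.DeltaAction.ADD, DeltaLogActionUtils.DeltaAction.REMOVE));
    try (CloseableIterator<ColumnarBatch> batches =
        table.getChanges(engine, 5 /* startVersion */, 10 /* endVersion */, actions)) {
      while (batches.hasNext()) {
        ColumnarBatch batch = batches.next();
        ColumnVector versions = batch.getColumnVector(0);   // "version", LongType
        ColumnVector timestamps = batch.getColumnVector(1); // "timestamp", LongType
        // remaining top-level columns are the requested "add"/"remove" action structs
      }
    }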
+ * + * @param engine {@link Engine} instance to use in Delta Kernel. + * @param startVersion start version (inclusive) + * @param endVersion end version (inclusive) + * @param actionSet the actions to read and return from the JSON log files + * @return an iterator of batches where each row in the batch has exactly one non-null action and + * its commit version and timestamp + * @throws TableNotFoundException if the table does not exist or if it is not a delta table + * @throws KernelException if a commit file does not exist for any of the versions in the provided + * range + * @throws KernelException if provided an invalid version range + * @throws KernelException if the version range contains a version with reader protocol that is + * unsupported by Kernel + */ + public CloseableIterator getChanges( + Engine engine, + long startVersion, + long endVersion, + Set actionSet) { + // Create a new action set that always contains protocol + Set copySet = new HashSet<>(actionSet); + copySet.add(DeltaLogActionUtils.DeltaAction.PROTOCOL); + // If protocol is not in the original requested actions we drop the column before returning + boolean shouldDropProtocolColumn = + !actionSet.contains(DeltaLogActionUtils.DeltaAction.PROTOCOL); + + return getRawChanges(engine, startVersion, endVersion, copySet) + .map( + batch -> { + int protocolIdx = batch.getSchema().indexOf("protocol"); // must exist + ColumnVector protocolVector = batch.getColumnVector(protocolIdx); + for (int rowId = 0; rowId < protocolVector.getSize(); rowId++) { + if (!protocolVector.isNullAt(rowId)) { + Protocol protocol = Protocol.fromColumnVector(protocolVector, rowId); + TableFeatures.validateReadSupportedTable( + protocol, getDataPath().toString(), Optional.empty()); + } + } + if (shouldDropProtocolColumn) { + return batch.withDeletedColumnAt(protocolIdx); + } else { + return batch; + } + }); + } + protected Path getDataPath() { return new Path(tablePath); } @@ -110,4 +188,132 @@ protected Path getDataPath() { protected Path getLogPath() { return new Path(tablePath, "_delta_log"); } + + /** + * Returns the latest version that was committed before or at {@code millisSinceEpochUTC}. If no + * version exists, throws a {@link KernelException} + * + *

Specifically: + * + *

    + *
  • if a commit version exactly matches the provided timestamp, we return it + *
  • else, we return the latest commit version with a timestamp less than the provided one + *
  • If the provided timestamp is less than the timestamp of any committed version, we throw + * an error. + *
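A short worked example may make these rules concrete (illustrative only; the versions and timestamps are made up):

    // Suppose the table has commits: version 0 at t=10, version 1 at t=20, version 2 at t=30
    // (milliseconds since epoch). Then:
    //   getVersionBeforeOrAtTimestamp(engine, 25) == 1  // latest commit at or before t=25
    //   getVersionBeforeOrAtTimestamp(engine, 30) == 2  // exact match is returned
    //   getVersionBeforeOrAtTimestamp(engine, 35) == 2  // after the last commit is allowed here
    //   getVersionBeforeOrAtTimestamp(engine, 5)        // throws KernelException (before earliest)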
+ * + * . + * + * @param millisSinceEpochUTC the number of milliseconds since midnight, January 1, 1970 UTC + * @return latest commit that happened before or at {@code timestamp}. + * @throws KernelException if the timestamp is less than the timestamp of any committed version + * @throws TableNotFoundException if no delta table is found + */ + public long getVersionBeforeOrAtTimestamp(Engine engine, long millisSinceEpochUTC) { + return DeltaHistoryManager.getActiveCommitAtTimestamp( + engine, + getLogPath(), + millisSinceEpochUTC, + false, /* mustBeRecreatable */ + // e.g. if we give time T+2 and last commit has time T, then we DO want that last commit + true, /* canReturnLastCommit */ + // e.g. we give time T-1 and first commit has time T, then do NOT want that earliest + // commit + false /* canReturnEarliestCommit */) + .getVersion(); + } + + /** + * Returns the latest version that was committed at or after {@code millisSinceEpochUTC}. If no + * version exists, throws a {@link KernelException} + * + *

Specifically: + * + *

    + *
  • if a commit version exactly matches the provided timestamp, we return it + *
  • else, we return the earliest commit version with a timestamp greater than the provided + * one + *
  • If the provided timestamp is larger than the timestamp of any committed version, we throw + * an error. + *
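For symmetry, the same made-up commits under this method's semantics (illustrative only):

    // Same hypothetical commits: version 0 at t=10, version 1 at t=20, version 2 at t=30.
    //   getVersionAtOrAfterTimestamp(engine, 25) == 2  // earliest commit at or after t=25
    //   getVersionAtOrAfterTimestamp(engine, 20) == 1  // exact match is returned
    //   getVersionAtOrAfterTimestamp(engine, 5)  == 0  // before the earliest commit is allowed here
    //   getVersionAtOrAfterTimestamp(engine, 35)       // throws KernelException (after latest)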
+ * + * . + * + * @param millisSinceEpochUTC the number of milliseconds since midnight, January 1, 1970 UTC + * @return latest commit that happened at or before {@code timestamp}. + * @throws KernelException if the timestamp is more than the timestamp of any committed version + * @throws TableNotFoundException if no delta table is found + */ + public long getVersionAtOrAfterTimestamp(Engine engine, long millisSinceEpochUTC) { + DeltaHistoryManager.Commit commit = + DeltaHistoryManager.getActiveCommitAtTimestamp( + engine, + getLogPath(), + millisSinceEpochUTC, + false, /* mustBeRecreatable */ + // e.g. if we give time T+2 and last commit has time T, then we do NOT want that last + // commit + false, /* canReturnLastCommit */ + // e.g. we give time T-1 and first commit has time T, then we DO want that earliest + // commit + true /* canReturnEarliestCommit */); + + if (commit.getTimestamp() >= millisSinceEpochUTC) { + return commit.getVersion(); + } else { + // this commit.timestamp is before the input timestamp. if this is the last commit, then + // the input timestamp is after the last commit and `getActiveCommitAtTimestamp` would have + // thrown an KernelException. So, clearly, this can't be the last commit, so we can safely + // return commit.version + 1 as the version that is at or after the input timestamp. + return commit.getVersion() + 1; + } + } + + /** + * Returns the raw delta actions for each version between startVersion and endVersion. Only reads + * the actions requested in actionSet from the JSON log files. + * + *

For the returned columnar batches: + * + *

    + *
  • Each row within the same batch is guaranteed to have the same commit version + *
  • The batch commit versions are monotonically increasing + *
  • The top-level columns include "version", "timestamp", and the actions requested in + * actionSet. "version" and "timestamp" are the first and second columns in the schema, + * respectively. The remaining columns are based on the actions requested and each have the + * schema found in {@code DeltaAction.schema}. + *
+ * + * @param engine {@link Engine} instance to use in Delta Kernel. + * @param startVersion start version (inclusive) + * @param endVersion end version (inclusive) + * @param actionSet the actions to read and return from the JSON log files + * @return an iterator of batches where each row in the batch has exactly one non-null action and + * its commit version and timestamp + * @throws TableNotFoundException if the table does not exist or if it is not a delta table + * @throws KernelException if a commit file does not exist for any of the versions in the provided + * range + * @throws KernelException if provided an invalid version range + */ + private CloseableIterator getRawChanges( + Engine engine, + long startVersion, + long endVersion, + Set actionSet) { + + logger.info( + "{}: Getting the commit files for versions [{}, {}]", tablePath, startVersion, endVersion); + List commitFiles = + DeltaLogActionUtils.getCommitFilesForVersionRange( + engine, new Path(tablePath), startVersion, endVersion); + + StructType readSchema = + new StructType( + actionSet.stream() + .map(action -> new StructField(action.colName, action.schema, true)) + .collect(Collectors.toList())); + + logger.info("{}: Reading the commit files with readSchema {}", tablePath, readSchema); + return DeltaLogActionUtils.readCommitFiles(engine, commitFiles, readSchema); + } } diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionImpl.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionImpl.java index f509277183c..2d51c315ab3 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionImpl.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/TransactionImpl.java @@ -293,7 +293,8 @@ private CommitInfo generateCommitAction(Engine engine) { operation.getDescription(), /* description */ getOperationParameters(), /* operationParameters */ isBlindAppend(), /* isBlindAppend */ - txnId.toString() /* txnId */); + txnId.toString(), /* txnId */ + Collections.emptyMap() /* operationMetrics */); } private boolean isReadyForCheckpoint(Engine engine, long newVersion) { diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddCDCFile.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddCDCFile.java new file mode 100644 index 00000000000..192e862b27b --- /dev/null +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/AddCDCFile.java @@ -0,0 +1,33 @@ +/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.internal.actions; + +import io.delta.kernel.types.*; + +/** Metadata about {@code cdc} action in the Delta Log. */ +public class AddCDCFile { + /** Full schema of the {@code cdc} action in the Delta Log. 
*/ + public static final StructType FULL_SCHEMA = + new StructType() + .add("path", StringType.STRING, false /* nullable */) + .add( + "partitionValues", + new MapType(StringType.STRING, StringType.STRING, true), + false /* nullable*/) + .add("size", LongType.LONG, false /* nullable*/) + .add( + "tags", new MapType(StringType.STRING, StringType.STRING, true), true /* nullable */); +} diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/CommitInfo.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/CommitInfo.java index b9908deb913..6b16e79e1bc 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/CommitInfo.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/actions/CommitInfo.java @@ -65,8 +65,8 @@ public static CommitInfo fromColumnVector(ColumnVector vector, int rowId) { return null; } - ColumnVector[] children = new ColumnVector[7]; - for (int i = 0; i < 7; i++) { + ColumnVector[] children = new ColumnVector[8]; + for (int i = 0; i < children.length; i++) { children[i] = vector.getChild(i); } @@ -79,7 +79,10 @@ public static CommitInfo fromColumnVector(ColumnVector vector, int rowId) { ? Collections.emptyMap() : VectorUtils.toJavaMap(children[4].getMap(rowId)), children[5].isNullAt(rowId) ? null : children[5].getBoolean(rowId), - children[6].isNullAt(rowId) ? null : children[6].getString(rowId)); + children[6].isNullAt(rowId) ? null : children[6].getString(rowId), + children[7].isNullAt(rowId) + ? Collections.emptyMap() + : VectorUtils.toJavaMap(children[7].getMap(rowId))); } public static StructType FULL_SCHEMA = @@ -92,7 +95,10 @@ public static CommitInfo fromColumnVector(ColumnVector vector, int rowId) { "operationParameters", new MapType(StringType.STRING, StringType.STRING, true /* nullable */)) .add("isBlindAppend", BooleanType.BOOLEAN, true /* nullable */) - .add("txnId", StringType.STRING); + .add("txnId", StringType.STRING) + .add( + "operationMetrics", + new MapType(StringType.STRING, StringType.STRING, true /* nullable */)); private static final Map COL_NAME_TO_ORDINAL = IntStream.range(0, FULL_SCHEMA.length()) @@ -108,6 +114,7 @@ public static CommitInfo fromColumnVector(ColumnVector vector, int rowId) { private final boolean isBlindAppend; private final String txnId; private Optional inCommitTimestamp; + private final Map operationMetrics; public CommitInfo( Optional inCommitTimestamp, @@ -116,7 +123,8 @@ public CommitInfo( String operation, Map operationParameters, boolean isBlindAppend, - String txnId) { + String txnId, + Map operationMetrics) { this.inCommitTimestamp = inCommitTimestamp; this.timestamp = timestamp; this.engineInfo = engineInfo; @@ -124,6 +132,7 @@ public CommitInfo( this.operationParameters = Collections.unmodifiableMap(operationParameters); this.isBlindAppend = isBlindAppend; this.txnId = txnId; + this.operationMetrics = operationMetrics; } public Optional getInCommitTimestamp() { @@ -161,6 +170,8 @@ public Row toRow() { COL_NAME_TO_ORDINAL.get("operationParameters"), stringStringMapValue(operationParameters)); commitInfo.put(COL_NAME_TO_ORDINAL.get("isBlindAppend"), isBlindAppend); commitInfo.put(COL_NAME_TO_ORDINAL.get("txnId"), txnId); + commitInfo.put( + COL_NAME_TO_ORDINAL.get("operationMetrics"), stringStringMapValue(operationMetrics)); return new GenericRow(CommitInfo.FULL_SCHEMA, commitInfo); } diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/lang/ListUtils.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/lang/ListUtils.java 
index 78c6970db29..06ba6cab887 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/lang/ListUtils.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/lang/ListUtils.java @@ -18,6 +18,7 @@ import io.delta.kernel.internal.util.Tuple2; import java.util.List; import java.util.Map; +import java.util.NoSuchElementException; import java.util.function.Predicate; import java.util.stream.Collectors; @@ -34,4 +35,22 @@ public static Tuple2, List> partition( public static T last(List list) { return list.get(list.size() - 1); } + + /** Remove once supported JDK (build) version is 21 or above */ + public static T getFirst(List list) { + if (list.isEmpty()) { + throw new NoSuchElementException(); + } else { + return list.get(0); + } + } + + /** Remove once supported JDK (build) version is 21 or above */ + public static T getLast(List list) { + if (list.isEmpty()) { + throw new NoSuchElementException(); + } else { + return list.get(list.size() - 1); + } + } } diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionWrapper.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionWrapper.java index 7965da99a75..274e30fc723 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionWrapper.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionWrapper.java @@ -16,17 +16,22 @@ package io.delta.kernel.internal.replay; import io.delta.kernel.data.ColumnarBatch; +import java.util.Optional; /** Internal wrapper class holding information needed to perform log replay. */ -class ActionWrapper { +public class ActionWrapper { private final ColumnarBatch columnarBatch; private final boolean isFromCheckpoint; private final long version; + /* Timestamp of the commit file if isFromCheckpoint=false */ + private final Optional timestamp; - ActionWrapper(ColumnarBatch data, boolean isFromCheckpoint, long version) { + ActionWrapper( + ColumnarBatch data, boolean isFromCheckpoint, long version, Optional timestamp) { this.columnarBatch = data; this.isFromCheckpoint = isFromCheckpoint; this.version = version; + this.timestamp = timestamp; } public ColumnarBatch getColumnarBatch() { @@ -40,4 +45,8 @@ public boolean isFromCheckpoint() { public long getVersion() { return version; } + + public Optional getTimestamp() { + return timestamp; + } } diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionsIterator.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionsIterator.java index bfb51ff5402..a7ed4a9cc89 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionsIterator.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ActionsIterator.java @@ -46,7 +46,7 @@ * *

Users must pass in a `readSchema` to select which actions and sub-fields they want to consume. */ -class ActionsIterator implements CloseableIterator { +public class ActionsIterator implements CloseableIterator { private final Engine engine; private final Optional checkpointPredicate; @@ -74,7 +74,7 @@ class ActionsIterator implements CloseableIterator { private boolean closed; - ActionsIterator( + public ActionsIterator( Engine engine, List files, StructType readSchema, @@ -316,7 +316,11 @@ private CloseableIterator getNextActionsIter() { nextFile, readSchema); - return combine(dataIter, false /* isFromCheckpoint */, fileVersion); + return combine( + dataIter, + false /* isFromCheckpoint */, + fileVersion, + Optional.of(nextFile.getModificationTime()) /* timestamp */); } case CHECKPOINT_CLASSIC: case V2_CHECKPOINT_MANIFEST: @@ -327,7 +331,7 @@ private CloseableIterator getNextActionsIter() { CloseableIterator dataIter = getActionsIterFromSinglePartOrV2Checkpoint(nextFile, fileName); long version = checkpointVersion(nextFilePath); - return combine(dataIter, true /* isFromCheckpoint */, version); + return combine(dataIter, true /* isFromCheckpoint */, version, Optional.empty()); } case MULTIPART_CHECKPOINT: case SIDECAR: @@ -349,7 +353,7 @@ private CloseableIterator getNextActionsIter() { checkpointPredicate); long version = checkpointVersion(nextFilePath); - return combine(dataIter, true /* isFromCheckpoint */, version); + return combine(dataIter, true /* isFromCheckpoint */, version, Optional.empty()); } default: throw new IOException("Unrecognized log type: " + nextLogFile.getLogType()); @@ -361,7 +365,10 @@ private CloseableIterator getNextActionsIter() { /** Take input (iterator, boolean) and produce an iterator. */ private CloseableIterator combine( - CloseableIterator fileReadDataIter, boolean isFromCheckpoint, long version) { + CloseableIterator fileReadDataIter, + boolean isFromCheckpoint, + long version, + Optional timestamp) { return new CloseableIterator() { @Override public boolean hasNext() { @@ -370,7 +377,7 @@ public boolean hasNext() { @Override public ActionWrapper next() { - return new ActionWrapper(fileReadDataIter.next(), isFromCheckpoint, version); + return new ActionWrapper(fileReadDataIter.next(), isFromCheckpoint, version, timestamp); } @Override diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/LogReplay.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/LogReplay.java index 1d1a229ca31..170dad9ac1f 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/LogReplay.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/LogReplay.java @@ -233,7 +233,8 @@ protected Tuple2 loadTableProtocolAndMetadata( if (protocol != null) { // Stop since we have found the latest Protocol and Metadata. - TableFeatures.validateReadSupportedTable(protocol, metadata, dataPath.toString()); + TableFeatures.validateReadSupportedTable( + protocol, dataPath.toString(), Optional.of(metadata)); return new Tuple2<>(protocol, metadata); } diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/snapshot/MetadataCleanup.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/snapshot/MetadataCleanup.java new file mode 100644 index 00000000000..0b405426751 --- /dev/null +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/snapshot/MetadataCleanup.java @@ -0,0 +1,200 @@ +/* + * Copyright (2024) The Delta Lake Project Authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.internal.snapshot; + +import static io.delta.kernel.internal.DeltaErrors.wrapEngineExceptionThrowsIO; +import static io.delta.kernel.internal.checkpoints.Checkpointer.getLatestCompleteCheckpointFromList; +import static io.delta.kernel.internal.lang.ListUtils.getFirst; +import static io.delta.kernel.internal.lang.ListUtils.getLast; +import static io.delta.kernel.internal.util.Preconditions.checkArgument; +import static java.util.stream.Collectors.toList; + +import io.delta.kernel.engine.Engine; +import io.delta.kernel.internal.checkpoints.CheckpointInstance; +import io.delta.kernel.internal.fs.Path; +import io.delta.kernel.internal.util.Clock; +import io.delta.kernel.internal.util.FileNames; +import io.delta.kernel.utils.CloseableIterator; +import io.delta.kernel.utils.FileStatus; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class MetadataCleanup { + private static final Logger logger = LoggerFactory.getLogger(MetadataCleanup.class); + + private MetadataCleanup() {} + + /** + * Delete the Delta log files (delta and checkpoint files) that are expired according to the table + * metadata retention settings. While deleting the log files, it makes sure the time travel + * continues to work for all unexpired table versions. + * + *

Here is the algorithm: + * + *

    + *
  • Initialize the potential delete file list `potentialFilesToDelete` as an empty list + *
  • Initialize the last seen checkpoint file list: `lastSeenCheckpointFiles`. There could be + * one or more checkpoint files for a given version. + *
  • List the delta log files starting with prefix "00000000000000000000." (%020d). For each + * file: + *
      + *
    • Step 1: Check whether `lastSeenCheckpointFiles` contains a complete checkpoint; if it does: + *
        + *
      • Step 1.1: delete all files in `potentialFilesToDelete`. Now we know there is + * a checkpoint that contains the compacted Delta log up to the checkpoint + * version and all commit/checkpoint files before this checkpoint version are + * not needed. + *
      • Step 1.2: add `lastSeenCheckpointFiles` to the `potentialFilesToDelete` list. This + * checkpoint is a potential candidate to delete later if we find another + * checkpoint + *
      + *
    • Step 2: If the timestamp falls within the retention period, stop + *
    • Step 3: If the file is a delta log file, add it to the `potentialFilesToDelete` + * list + *
    • Step 4: If the file is a checkpoint file, add it to the `lastSeenCheckpointFiles` + *
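To make the interplay of these steps concrete, an illustrative walkthrough (hypothetical listing; file names abbreviated, real ones are zero-padded to 20 digits):

    // Listing order: 0.json, 1.json, 2.checkpoint.parquet, 2.json, 3.json, 4.json,
    // where only 4.json is newer than the retention cutoff.
    // - 0.json and 1.json are queued in `potentialLogFilesToDelete` (Step 3)
    // - 2.checkpoint.parquet is recorded in `lastSeenCheckpointFiles` (Step 4)
    // - on the next iteration the checkpoint is seen to be complete (Step 1), so 0.json and
    //   1.json are deleted and the checkpoint files become the new deletion candidates
    // - 2.json and 3.json are queued, but 4.json falls within the retention period (Step 2), so
    //   listing stops and 2.checkpoint.parquet, 2.json, 3.json are kept; versions >= 2 remain
    //   time-travelable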
    + *
+ * + * @param engine {@link Engine} instance to delete the expired log files + * @param clock {@link Clock} instance to get the current time. Useful in testing to mock the + * current time. + * @param tablePath Table location + * @param retentionMillis Log file retention period in milliseconds + * @return number of log files deleted + * @throws IOException if an error occurs while deleting the log files + */ + public static long cleanupExpiredLogs( + Engine engine, Clock clock, Path tablePath, long retentionMillis) throws IOException { + checkArgument(retentionMillis >= 0, "Retention period must be non-negative"); + + List potentialLogFilesToDelete = new ArrayList<>(); + long lastSeenCheckpointVersion = -1; // -1 indicates no checkpoint seen yet + List lastSeenCheckpointFiles = new ArrayList<>(); + + long fileCutOffTime = clock.getTimeMillis() - retentionMillis; + logger.info("{}: Starting the deletion of log files older than {}", tablePath, fileCutOffTime); + long numDeleted = 0; + try (CloseableIterator files = listDeltaLogs(engine, tablePath)) { + while (files.hasNext()) { + // Step 1: Check if the `lastSeenCheckpointFiles` contains a complete checkpoint + Optional lastCompleteCheckpoint = + getLatestCompleteCheckpointFromList( + lastSeenCheckpointFiles.stream().map(CheckpointInstance::new).collect(toList()), + CheckpointInstance.MAX_VALUE); + + if (lastCompleteCheckpoint.isPresent()) { + // Step 1.1: delete all files in `potentialFilesToDelete`. Now we know there is a + // checkpoint that contains the compacted Delta log up to the checkpoint version and all + // commit/checkpoint files before this checkpoint version are not needed. add + // `lastCheckpointFiles` to `potentialFileStoDelete` list. This checkpoint is potential + // candidate to delete later if we find another checkpoint + if (!potentialLogFilesToDelete.isEmpty()) { + logger.info( + "{}: Deleting log files (start = {}, end = {}) because a checkpoint at " + + "version {} indicates that these log files are no longer needed.", + tablePath, + getFirst(potentialLogFilesToDelete), + getLast(potentialLogFilesToDelete), + lastSeenCheckpointVersion); + + numDeleted += deleteLogFiles(engine, potentialLogFilesToDelete); + potentialLogFilesToDelete.clear(); + } + + // Step 1.2: add `lastCheckpointFiles` to `potentialFileStoDelete` list. This checkpoint + // is potential candidate to delete later if we find another checkpoint + potentialLogFilesToDelete.addAll(lastSeenCheckpointFiles); + lastSeenCheckpointFiles.clear(); + lastSeenCheckpointVersion = -1; + } + + FileStatus nextFile = files.next(); + + // Step 2: If the timestamp is earlier than the retention period, stop + if (nextFile.getModificationTime() > fileCutOffTime) { + if (!potentialLogFilesToDelete.isEmpty()) { + logger.info( + "{}: Skipping deletion of expired log files {}, because there is no checkpoint " + + "file that indicates that the log files are no longer needed. ", + tablePath, + potentialLogFilesToDelete.size()); + } + break; + } + + if (FileNames.isCommitFile(nextFile.getPath())) { + // Step 3: If the file is a delta log file, add it to the `potentialFilesToDelete` list + // We can't delete these files until we encounter a checkpoint later that indicates + // that the log files are no longer needed. 
+ potentialLogFilesToDelete.add(nextFile.getPath()); + } else if (FileNames.isCheckpointFile(nextFile.getPath())) { + // Step 4: If the file is a checkpoint file, add it to the `lastSeenCheckpointFiles` + long newLastSeenCheckpointVersion = FileNames.checkpointVersion(nextFile.getPath()); + checkArgument( + lastSeenCheckpointVersion == -1 + || newLastSeenCheckpointVersion >= lastSeenCheckpointVersion); + + if (lastSeenCheckpointVersion != -1 + && newLastSeenCheckpointVersion > lastSeenCheckpointVersion) { + // We have found checkpoint file for a new version. This means the files gathered for + // the last checkpoint version are not complete (most likely an incomplete multipart + // checkpoint). We should delete the files gathered so far and start fresh + // last seen checkpoint state + logger.info( + "{}: Incomplete checkpoint files found at version {}, ignoring the checkpoint" + + " files and adding them to potential log file delete list", + tablePath, + lastSeenCheckpointVersion); + potentialLogFilesToDelete.addAll(lastSeenCheckpointFiles); + lastSeenCheckpointFiles.clear(); + } + + lastSeenCheckpointFiles.add(nextFile.getPath()); + lastSeenCheckpointVersion = newLastSeenCheckpointVersion; + } + // Ignore non-delta and non-checkpoint files. + } + } + logger.info("{}: Deleted {} log files older than {}", tablePath, numDeleted, fileCutOffTime); + return numDeleted; + } + + private static CloseableIterator listDeltaLogs(Engine engine, Path tablePath) + throws IOException { + Path logPath = new Path(tablePath, "_delta_log"); + // TODO: Currently we don't update the timestamps of files to be monotonically increasing. + // In future we can do something similar to Delta Spark to make the timestamps monotonically + // increasing. See `BufferingLogDeletionIterator` in Delta Spark. 
+ return engine.getFileSystemClient().listFrom(FileNames.listingPrefix(logPath, 0)); + } + + private static int deleteLogFiles(Engine engine, List logFiles) throws IOException { + int numDeleted = 0; + for (String logFile : logFiles) { + if (wrapEngineExceptionThrowsIO( + () -> engine.getFileSystemClient().delete(logFile), + "Failed to delete the log file as part of the metadata cleanup %s", + logFile)) { + numDeleted++; + } + } + return numDeleted; + } +} diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/snapshot/SnapshotManager.java b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/snapshot/SnapshotManager.java index a8c22bc8b5d..8c760dbaf0c 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/internal/snapshot/SnapshotManager.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/internal/snapshot/SnapshotManager.java @@ -17,10 +17,13 @@ package io.delta.kernel.internal.snapshot; import static io.delta.kernel.internal.DeltaErrors.wrapEngineExceptionThrowsIO; +import static io.delta.kernel.internal.TableConfig.EXPIRED_LOG_CLEANUP_ENABLED; +import static io.delta.kernel.internal.TableConfig.LOG_RETENTION; import static io.delta.kernel.internal.TableFeatures.validateWriteSupportedTable; import static io.delta.kernel.internal.checkpoints.Checkpointer.findLastCompleteCheckpointBefore; import static io.delta.kernel.internal.fs.Path.getName; import static io.delta.kernel.internal.replay.LogReplayUtils.assertLogFilesBelongToTable; +import static io.delta.kernel.internal.snapshot.MetadataCleanup.cleanupExpiredLogs; import static io.delta.kernel.internal.util.Preconditions.checkArgument; import static java.lang.String.format; @@ -31,11 +34,13 @@ import io.delta.kernel.exceptions.InvalidTableException; import io.delta.kernel.exceptions.TableNotFoundException; import io.delta.kernel.internal.*; +import io.delta.kernel.internal.actions.Metadata; import io.delta.kernel.internal.checkpoints.*; import io.delta.kernel.internal.fs.Path; import io.delta.kernel.internal.lang.ListUtils; import io.delta.kernel.internal.replay.CreateCheckpointIterator; import io.delta.kernel.internal.replay.LogReplay; +import io.delta.kernel.internal.util.Clock; import io.delta.kernel.internal.util.FileNames; import io.delta.kernel.internal.util.Tuple2; import io.delta.kernel.utils.CloseableIterator; @@ -169,7 +174,14 @@ public Snapshot getSnapshotForTimestamp(Engine engine, long millisSinceEpochUTC) throws TableNotFoundException { long startTimeMillis = System.currentTimeMillis(); long versionToRead = - DeltaHistoryManager.getActiveCommitAtTimestamp(engine, logPath, millisSinceEpochUTC); + DeltaHistoryManager.getActiveCommitAtTimestamp( + engine, + logPath, + millisSinceEpochUTC, + true /* mustBeRecreatable */, + false /* canReturnLastCommit */, + false /* canReturnEarliestCommit */) + .getVersion(); logger.info( "{}: Took {}ms to fetch version at timestamp {}", tablePath, @@ -179,7 +191,8 @@ public Snapshot getSnapshotForTimestamp(Engine engine, long millisSinceEpochUTC) return getSnapshotAt(engine, versionToRead); } - public void checkpoint(Engine engine, long version) throws TableNotFoundException, IOException { + public void checkpoint(Engine engine, Clock clock, long version) + throws TableNotFoundException, IOException { logger.info("{}: Starting checkpoint for version: {}", tablePath, version); // Get the snapshot corresponding the version SnapshotImpl snapshot = (SnapshotImpl) getSnapshotAt(engine, version); @@ -224,6 +237,15 @@ public void checkpoint(Engine engine, long 
version) throws TableNotFoundExceptio logger.info("{}: Last checkpoint metadata file is written for version: {}", tablePath, version); logger.info("{}: Finished checkpoint for version: {}", tablePath, version); + + // Clean up delta log files if enabled. + Metadata metadata = snapshot.getMetadata(); + if (EXPIRED_LOG_CLEANUP_ENABLED.fromMetadata(engine, metadata)) { + cleanupExpiredLogs(engine, clock, tablePath, LOG_RETENTION.fromMetadata(engine, metadata)); + } else { + logger.info( + "{}: Log cleanup is disabled. Skipping the deletion of expired log files", tablePath); + } } //////////////////// diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java b/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java index 104878d697b..713a141f0a4 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/types/CollationIdentifier.java @@ -35,19 +35,14 @@ public class CollationIdentifier { private final String name; private final Optional version; - public CollationIdentifier(String provider, String collationName) { - Objects.requireNonNull(provider, "Collation provider cannot be null."); - Objects.requireNonNull(collationName, "Collation name cannot be null."); - - this.provider = provider.toUpperCase(); - this.name = collationName.toUpperCase(); - this.version = Optional.empty(); + private CollationIdentifier(String provider, String collationName) { + this(provider, collationName, Optional.empty()); } - public CollationIdentifier(String provider, String collationName, Optional version) { + private CollationIdentifier(String provider, String collationName, Optional version) { Objects.requireNonNull(provider, "Collation provider cannot be null."); Objects.requireNonNull(collationName, "Collation name cannot be null."); - Objects.requireNonNull(version, "Provider version cannot be null."); + Objects.requireNonNull(version, "Collation version cannot be null."); this.provider = provider.toUpperCase(); this.name = collationName.toUpperCase(); @@ -64,14 +59,14 @@ public String getName() { return name; } - /** @return provider version. */ + /** @return collation version. */ public Optional getVersion() { return version; } /** * @param identifier collation identifier in string form of
- * {@code PROVIDER.COLLATION_NAME[.PROVIDER_VERSION]}. + * {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]}. * @return appropriate collation identifier object */ public static CollationIdentifier fromString(String identifier) { @@ -104,7 +99,7 @@ public String toStringWithoutVersion() { return String.format("%s.%s", provider, name); } - /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME[.PROVIDER_VERSION]} */ + /** @return collation identifier in form of {@code PROVIDER.COLLATION_NAME[.COLLATION_VERSION]} */ @Override public String toString() { if (version.isPresent()) { diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java b/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java index a9ce78e66a5..a18d93cf804 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/types/StringType.java @@ -30,7 +30,9 @@ public class StringType extends BasePrimitiveType { private final CollationIdentifier collationIdentifier; /** - * @param collationIdentifier identifier of collation in which this StringType will be observed + * @param collationIdentifier An identifier representing the collation to be used for string + * comparison and sorting. This determines how strings will be ordered and compared in query + * operations. */ public StringType(CollationIdentifier collationIdentifier) { super("string"); diff --git a/kernel/kernel-api/src/main/java/io/delta/kernel/utils/FileStatus.java b/kernel/kernel-api/src/main/java/io/delta/kernel/utils/FileStatus.java index 6e0a7257169..4b659126230 100644 --- a/kernel/kernel-api/src/main/java/io/delta/kernel/utils/FileStatus.java +++ b/kernel/kernel-api/src/main/java/io/delta/kernel/utils/FileStatus.java @@ -74,4 +74,23 @@ public long getModificationTime() { public static FileStatus of(String path, long size, long modificationTime) { return new FileStatus(path, size, modificationTime); } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + FileStatus that = (FileStatus) o; + return Objects.equals(this.path, that.path) + && Objects.equals(this.size, that.size) + && Objects.equals(this.modificationTime, that.modificationTime); + } + + @Override + public int hashCode() { + return Objects.hash(path, size, modificationTime); + } } diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/DeltaHistoryManagerSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/DeltaHistoryManagerSuite.scala index b66f758b94b..d3342577fce 100644 --- a/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/DeltaHistoryManagerSuite.scala +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/DeltaHistoryManagerSuite.scala @@ -29,25 +29,52 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil def checkGetActiveCommitAtTimestamp( fileList: Seq[FileStatus], timestamp: Long, - expectedVersion: Long): Unit = { + expectedVersion: Long, + mustBeRecreatable: Boolean = true, + canReturnLastCommit: Boolean = false, + canReturnEarliestCommit: Boolean = false): Unit = { val activeCommit = DeltaHistoryManager.getActiveCommitAtTimestamp( createMockFSListFromEngine(fileList), logPath, - timestamp + timestamp, + mustBeRecreatable, + canReturnLastCommit, + canReturnEarliestCommit ) - assert(activeCommit == expectedVersion, + assert(activeCommit.getVersion == expectedVersion, 
s"Expected version $expectedVersion but got $activeCommit for timestamp=$timestamp") + + if (mustBeRecreatable) { + // When mustBeRecreatable=true, we should have the same answer as mustBeRecreatable=false + // for valid queries that do not throw an error + val activeCommit = DeltaHistoryManager.getActiveCommitAtTimestamp( + createMockFSListFromEngine(fileList), + logPath, + timestamp, + false, // mustBeRecreatable + canReturnLastCommit, + canReturnEarliestCommit + ) + assert(activeCommit.getVersion == expectedVersion, + s"Expected version $expectedVersion but got $activeCommit for timestamp=$timestamp") + } } def checkGetActiveCommitAtTimestampError[T <: Throwable]( fileList: Seq[FileStatus], timestamp: Long, - expectedErrorMessageContains: String)(implicit classTag: ClassTag[T]): Unit = { + expectedErrorMessageContains: String, + mustBeRecreatable: Boolean = true, + canReturnLastCommit: Boolean = false, + canReturnEarliestCommit: Boolean = false)(implicit classTag: ClassTag[T]): Unit = { val e = intercept[T] { DeltaHistoryManager.getActiveCommitAtTimestamp( createMockFSListFromEngine(fileList), logPath, - timestamp + timestamp, + mustBeRecreatable, + canReturnLastCommit, + canReturnEarliestCommit ) } assert(e.getMessage.contains(expectedErrorMessageContains)) @@ -72,6 +99,9 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil 21, DeltaErrors.timestampAfterLatestCommit(dataPath.toString, 21, 20, 2).getMessage ) + // Valid queries with canReturnLastCommit=true and canReturnEarliestCommit=true + checkGetActiveCommitAtTimestamp(deltaFiles, -1, 0, canReturnEarliestCommit = true) + checkGetActiveCommitAtTimestamp(deltaFiles, 21, 2, canReturnLastCommit = true) } test("getActiveCommitAtTimestamp: basic listing from 0 with a checkpoint") { @@ -93,6 +123,9 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil 21, DeltaErrors.timestampAfterLatestCommit(dataPath.toString, 21, 20, 2).getMessage ) + // Valid queries with canReturnLastCommit=true and canReturnEarliestCommit=true + checkGetActiveCommitAtTimestamp(deltaFiles, -1, 0, canReturnEarliestCommit = true) + checkGetActiveCommitAtTimestamp(deltaFiles, 21, 2, canReturnLastCommit = true) } test("getActiveCommitAtTimestamp: truncated delta log") { @@ -112,6 +145,9 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil 31, DeltaErrors.timestampAfterLatestCommit(dataPath.toString, 31, 30, 3).getMessage ) + // Valid queries with canReturnLastCommit=true and canReturnEarliestCommit=true + checkGetActiveCommitAtTimestamp(deltaFiles, 8, 2, canReturnEarliestCommit = true) + checkGetActiveCommitAtTimestamp(deltaFiles, 31, 3, canReturnLastCommit = true) } test("getActiveCommitAtTimestamp: truncated delta log only checkpoint version") { @@ -129,6 +165,9 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil 21, DeltaErrors.timestampAfterLatestCommit(dataPath.toString, 21, 20, 2).getMessage ) + // Valid queries with canReturnLastCommit=true and canReturnEarliestCommit=true + checkGetActiveCommitAtTimestamp(deltaFiles, 8, 2, canReturnEarliestCommit = true) + checkGetActiveCommitAtTimestamp(deltaFiles, 21, 2, canReturnLastCommit = true) } test("getActiveCommitAtTimestamp: truncated delta log with multi-part checkpoint") { @@ -148,6 +187,9 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil 31, DeltaErrors.timestampAfterLatestCommit(dataPath.toString, 31, 30, 3).getMessage ) + // Valid queries with 
canReturnLastCommit=true and canReturnEarliestCommit=true + checkGetActiveCommitAtTimestamp(deltaFiles, 8, 2, canReturnEarliestCommit = true) + checkGetActiveCommitAtTimestamp(deltaFiles, 31, 3, canReturnLastCommit = true) } test("getActiveCommitAtTimestamp: throws table not found exception") { @@ -156,7 +198,10 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil DeltaHistoryManager.getActiveCommitAtTimestamp( createMockFSListFromEngine(p => throw new FileNotFoundException(p)), logPath, - 0 + 0, + true, // mustBeRecreatable + false, // canReturnLastCommit + false // canReturnEarliestCommit ) ) // Empty _delta_log directory @@ -164,7 +209,10 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil DeltaHistoryManager.getActiveCommitAtTimestamp( createMockFSListFromEngine(p => Seq()), logPath, - 0 + 0, + true, // mustBeRecreatable + false, // canReturnLastCommit + false // canReturnEarliestCommit ) ) } @@ -205,4 +253,77 @@ class DeltaHistoryManagerSuite extends AnyFunSuite with MockFileSystemClientUtil "No recreatable commits found" ) } + + test("getActiveCommitAtTimestamp: when mustBeRecreatable=false") { + Seq(deltaFileStatuses(Seq(1L, 2L, 3L)), // w/o checkpoint + singularCheckpointFileStatuses(Seq(2L)) ++ deltaFileStatuses(Seq(1L, 2L, 3L)) // w/checkpoint + ).foreach { deltaFiles => + // Valid queries + checkGetActiveCommitAtTimestamp(deltaFiles, 10, 1, mustBeRecreatable = false) + checkGetActiveCommitAtTimestamp(deltaFiles, 11, 1, mustBeRecreatable = false) + checkGetActiveCommitAtTimestamp(deltaFiles, 20, 2, mustBeRecreatable = false) + checkGetActiveCommitAtTimestamp(deltaFiles, 21, 2, mustBeRecreatable = false) + checkGetActiveCommitAtTimestamp(deltaFiles, 30, 3, mustBeRecreatable = false) + // Invalid queries + checkGetActiveCommitAtTimestampError[RuntimeException]( + deltaFiles, + -1, + DeltaErrors.timestampBeforeFirstAvailableCommit(dataPath.toString, -1, 10, 1).getMessage, + mustBeRecreatable = false + ) + checkGetActiveCommitAtTimestampError[RuntimeException]( + deltaFiles, + 31, + DeltaErrors.timestampAfterLatestCommit(dataPath.toString, 31, 30, 3).getMessage, + mustBeRecreatable = false + ) + // Valid queries with canReturnLastCommit=true and canReturnEarliestCommit=true + checkGetActiveCommitAtTimestamp( + deltaFiles, 0, 1, mustBeRecreatable = false, canReturnEarliestCommit = true) + checkGetActiveCommitAtTimestamp( + deltaFiles, 31, 3, mustBeRecreatable = false, canReturnLastCommit = true) + } + } + + test("getActiveCommitAtTimestamp: mustBeRecreatable=false error cases") { + /* ---------- TABLE NOT FOUND --------- */ + // Non-existent path + intercept[TableNotFoundException]( + DeltaHistoryManager.getActiveCommitAtTimestamp( + createMockFSListFromEngine(p => throw new FileNotFoundException(p)), + logPath, + 0, + false, // mustBeRecreatable + false, // canReturnLastCommit + false // canReturnEarliestCommit + ) + ) + // Empty _delta_log directory + intercept[TableNotFoundException]( + DeltaHistoryManager.getActiveCommitAtTimestamp( + createMockFSListFromEngine(p => Seq()), + logPath, + 0, + true, // mustBeRecreatable + false, // canReturnLastCommit + false // canReturnEarliestCommit + ) + ) + /* ---------- CORRUPT LISTINGS --------- */ + // No commit files at all (only checkpoint files) + checkGetActiveCommitAtTimestampError[RuntimeException]( + singularCheckpointFileStatuses(Seq(1L)), + 25, + "No delta files found in the directory", + mustBeRecreatable = false + ) + // No delta files + 
checkGetActiveCommitAtTimestampError[RuntimeException]( + Seq("foo", "notdelta.parquet", "foo.json", "001.checkpoint.00f.oo0.parquet") + .map(FileStatus.of(_, 10, 10)), + 25, + "No delta files found in the directory", + mustBeRecreatable = false + ) + } } diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/DeltaLogActionUtilsSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/DeltaLogActionUtilsSuite.scala new file mode 100644 index 00000000000..3846fd034d3 --- /dev/null +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/DeltaLogActionUtilsSuite.scala @@ -0,0 +1,243 @@ +/* + * Copyright (2024) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.internal + +import java.io.FileNotFoundException + +import scala.collection.JavaConverters._ +import scala.reflect.ClassTag + +import io.delta.kernel.exceptions.{InvalidTableException, KernelException, TableNotFoundException} +import io.delta.kernel.internal.util.FileNames +import io.delta.kernel.utils.FileStatus +import org.scalatest.funsuite.AnyFunSuite +import io.delta.kernel.internal.DeltaLogActionUtils.{getCommitFilesForVersionRange, verifyDeltaVersions} +import io.delta.kernel.test.MockFileSystemClientUtils + +class DeltaLogActionUtilsSuite extends AnyFunSuite with MockFileSystemClientUtils { + + ////////////////////////////////////////////////////////////////////////////////// + // verifyDeltaVersions tests + ////////////////////////////////////////////////////////////////////////////////// + + def getCommitFiles(versions: Seq[Long]): java.util.List[FileStatus] = { + versions + .map(v => FileStatus.of(FileNames.deltaFile(logPath, v), 0, 0)) + .asJava + } + + test("verifyDeltaVersions") { + // Basic correct use case + verifyDeltaVersions( + getCommitFiles(Seq(1, 2, 3)), + 1, + 3, + dataPath + ) + // Only one version provided + verifyDeltaVersions( + getCommitFiles(Seq(1)), + 1, + 1, + dataPath + ) + // Non-contiguous versions + intercept[InvalidTableException] { + verifyDeltaVersions( + getCommitFiles(Seq(1, 3, 4)), + 1, + 4, + dataPath + ) + } + // End-version or start-version not right + intercept[KernelException] { + verifyDeltaVersions( + getCommitFiles(Seq(1, 2, 3)), + 0, + 3, + dataPath + ) + } + intercept[KernelException] { + verifyDeltaVersions( + getCommitFiles(Seq(1, 2, 3)), + 1, + 4, + dataPath + ) + } + // Empty versions + intercept[KernelException] { + verifyDeltaVersions( + getCommitFiles(Seq()), + 1, + 4, + dataPath + ) + } + // Unsorted or duplicates (shouldn't be possible) + intercept[InvalidTableException] { + verifyDeltaVersions( + getCommitFiles(Seq(1, 1, 2)), + 1, + 4, + dataPath + ) + } + intercept[InvalidTableException] { + verifyDeltaVersions( + getCommitFiles(Seq(1, 4, 3, 2)), + 1, + 2, + dataPath + ) + } + } + + ////////////////////////////////////////////////////////////////////////////////// + // getCommitFilesForVersionRange tests + 
////////////////////////////////////////////////////////////////////////////////// + + test("getCommitFilesForVersionRange: directory does not exist") { + intercept[TableNotFoundException] { + getCommitFilesForVersionRange( + createMockFSListFromEngine(_ => throw new FileNotFoundException()), + dataPath, + 0, + 1 + ) + } + } + + def testGetCommitFilesExpectedError[T <: Throwable]( + testName: String, + files: Seq[FileStatus], + startVersion: Long = 1, + endVersion: Long = 3, + expectedErrorMessageContains: String + )(implicit classTag: ClassTag[T]): Unit = { + test("getCommitFilesForVersionRange: " + testName) { + val e = intercept[T] { + getCommitFilesForVersionRange( + createMockFSListFromEngine(files), + dataPath, + startVersion, + endVersion + ) + } + assert(e.getMessage.contains(expectedErrorMessageContains)) + } + } + + testGetCommitFilesExpectedError[KernelException]( + testName = "empty directory", + files = Seq(), + expectedErrorMessageContains = "no log files found in the requested version range" + ) + + testGetCommitFilesExpectedError[KernelException]( + testName = "all versions less than startVersion", + files = deltaFileStatuses(Seq(0)), + expectedErrorMessageContains = "no log files found in the requested version range" + ) + + testGetCommitFilesExpectedError[KernelException]( + testName = "all versions greater than endVersion", + files = deltaFileStatuses(Seq(4, 5, 6)), + expectedErrorMessageContains = "no log files found in the requested version range" + ) + + testGetCommitFilesExpectedError[InvalidTableException]( + testName = "missing log files", + files = deltaFileStatuses(Seq(1, 3)), + expectedErrorMessageContains = "versions are not contiguous" + ) + + testGetCommitFilesExpectedError[KernelException]( + testName = "start version not available", + files = deltaFileStatuses(Seq(2, 3, 4, 5)), + expectedErrorMessageContains = "no log file found for version 1" + ) + + testGetCommitFilesExpectedError[KernelException]( + testName = "end version not available", + files = deltaFileStatuses(Seq(0, 1, 2)), + expectedErrorMessageContains = "no log file found for version 3" + ) + + testGetCommitFilesExpectedError[KernelException]( + testName = "invalid start version", + files = deltaFileStatuses(Seq(0, 1, 2)), + startVersion = -1, + expectedErrorMessageContains = "Invalid version range" + ) + + testGetCommitFilesExpectedError[KernelException]( + testName = "invalid end version", + files = deltaFileStatuses(Seq(0, 1, 2)), + startVersion = 3, + endVersion = 2, + expectedErrorMessageContains = "Invalid version range" + ) + + def testGetCommitFiles( + testName: String, + files: Seq[FileStatus], + startVersion: Long = 1, + endVersion: Long = 3, + expectedCommitFiles: Seq[FileStatus] + ): Unit = { + test("getCommitFilesForVersionRange: " + testName) { + assert( + getCommitFilesForVersionRange( + createMockFSListFromEngine(files), + dataPath, + startVersion, + endVersion + ).asScala sameElements expectedCommitFiles + ) + } + } + + testGetCommitFiles( + testName = "basic case", + files = deltaFileStatuses(Seq(0, 1, 2, 3, 4, 5)), + expectedCommitFiles = deltaFileStatuses(Seq(1, 2, 3)) + ) + + testGetCommitFiles( + testName = "basic case with checkpoint file", + files = deltaFileStatuses(Seq(0, 1, 2, 3, 4, 5)) ++ singularCheckpointFileStatuses(Seq(2)), + expectedCommitFiles = deltaFileStatuses(Seq(1, 2, 3)) + ) + + testGetCommitFiles( + testName = "basic case with non-log files", + files = deltaFileStatuses(Seq(0, 1, 2, 3, 4, 5)) ++ + deltaFileStatuses(Seq(2)) + .map(fs => 
FileStatus.of(fs.getPath + ".crc", fs.getSize, fs.getModificationTime)), + expectedCommitFiles = deltaFileStatuses(Seq(1, 2, 3)) + ) + + testGetCommitFiles( + testName = "version range size 1", + files = deltaFileStatuses(Seq(0, 1, 2, 3, 4, 5)), + startVersion = 0, + endVersion = 0, + expectedCommitFiles = deltaFileStatuses(Seq(0)) + ) +} diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/TableImplSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/TableImplSuite.scala new file mode 100644 index 00000000000..0c20e2a4f08 --- /dev/null +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/TableImplSuite.scala @@ -0,0 +1,123 @@ +/* + * Copyright (2024) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.internal + +import io.delta.kernel.test.{MockFileSystemClientUtils, MockListFromResolvePathFileSystemClient} +import io.delta.kernel.utils.FileStatus +import io.delta.kernel.Table +import io.delta.kernel.exceptions.KernelException +import org.scalatest.funsuite.AnyFunSuite + +class TableImplSuite extends AnyFunSuite with MockFileSystemClientUtils { + + def checkGetVersionBeforeOrAtTimestamp( + fileList: Seq[FileStatus], + timestamp: Long, + expectedVersion: Option[Long] = None, + expectedErrorMessageContains: Option[String] = None): Unit = { + // Check our inputs are as expected + assert(expectedVersion.isEmpty || expectedErrorMessageContains.isEmpty) + assert(expectedVersion.nonEmpty || expectedErrorMessageContains.nonEmpty) + + val engine = mockEngine(fileSystemClient = + new MockListFromResolvePathFileSystemClient(listFromProvider(fileList))) + val table = Table.forPath(engine, dataPath.toString) + + expectedVersion.foreach { v => + assert(table.asInstanceOf[TableImpl].getVersionBeforeOrAtTimestamp(engine, timestamp) == v) + } + expectedErrorMessageContains.foreach { s => + assert(intercept[KernelException] { + table.asInstanceOf[TableImpl].getVersionBeforeOrAtTimestamp(engine, timestamp) + }.getMessage.contains(s)) + } + } + + def checkGetVersionAtOrAfterTimestamp( + fileList: Seq[FileStatus], + timestamp: Long, + expectedVersion: Option[Long] = None, + expectedErrorMessageContains: Option[String] = None): Unit = { + // Check our inputs are as expected + assert(expectedVersion.isEmpty || expectedErrorMessageContains.isEmpty) + assert(expectedVersion.nonEmpty || expectedErrorMessageContains.nonEmpty) + + val engine = mockEngine(fileSystemClient = + new MockListFromResolvePathFileSystemClient(listFromProvider(fileList))) + val table = Table.forPath(engine, dataPath.toString) + + expectedVersion.foreach { v => + assert(table.asInstanceOf[TableImpl].getVersionAtOrAfterTimestamp(engine, timestamp) == v) + } + expectedErrorMessageContains.foreach { s => + assert(intercept[KernelException] { + table.asInstanceOf[TableImpl].getVersionAtOrAfterTimestamp(engine, timestamp) + }.getMessage.contains(s)) + } + } + + test("getVersionBeforeOrAtTimestamp: basic case from 0") { + val 
deltaFiles = deltaFileStatuses(Seq(0L, 1L)) + checkGetVersionBeforeOrAtTimestamp(deltaFiles, -1, + expectedErrorMessageContains = Some("is before the earliest available version 0")) // before 0 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 0, expectedVersion = Some(0)) // at 0 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 5, expectedVersion = Some(0)) // btw 0, 1 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 10, expectedVersion = Some(1)) // at 1 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 11, expectedVersion = Some(1)) // after 1 + } + + test("getVersionAtOrAfterTimestamp: basic case from 0") { + val deltaFiles = deltaFileStatuses(Seq(0L, 1L)) + checkGetVersionAtOrAfterTimestamp(deltaFiles, -1, expectedVersion = Some(0)) // before 0 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 0, expectedVersion = Some(0)) // at 0 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 5, expectedVersion = Some(1)) // btw 0, 1 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 10, expectedVersion = Some(1)) // at 1 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 11, + expectedErrorMessageContains = Some("is after the latest available version 1")) // after 1 + } + + test("getVersionBeforeOrAtTimestamp: w/ checkpoint + w/o checkpoint") { + Seq( + deltaFileStatuses(Seq(10L, 11L, 12L)) ++ singularCheckpointFileStatuses(Seq(10L)), + deltaFileStatuses(Seq(10L, 11L, 12L)) // checks that does not need to be recreatable + ).foreach { deltaFiles => + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 99, // before 10 + expectedErrorMessageContains = Some("is before the earliest available version 10")) + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 100, expectedVersion = Some(10)) // at 10 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 105, expectedVersion = Some(10)) // btw 10, 11 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 110, expectedVersion = Some(11)) // at 11 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 115, expectedVersion = Some(11)) // btw 11, 12 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 120, expectedVersion = Some(12)) // at 12 + checkGetVersionBeforeOrAtTimestamp(deltaFiles, 125, expectedVersion = Some(12)) // after 12 + } + } + + test("getVersionAtOrAfterTimestamp: w/ checkpoint + w/o checkpoint") { + Seq( + deltaFileStatuses(Seq(10L, 11L, 12L)) ++ singularCheckpointFileStatuses(Seq(10L)), + deltaFileStatuses(Seq(10L, 11L, 12L)) // checks that does not need to be recreatable + ).foreach { deltaFiles => + checkGetVersionAtOrAfterTimestamp(deltaFiles, 99, expectedVersion = Some(10)) // before 10 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 100, expectedVersion = Some(10)) // at 10 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 105, expectedVersion = Some(11)) // btw 10, 11 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 110, expectedVersion = Some(11)) // at 11 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 115, expectedVersion = Some(12)) // btw 11, 12 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 120, expectedVersion = Some(12)) // at 12 + checkGetVersionAtOrAfterTimestamp(deltaFiles, 125, + expectedErrorMessageContains = Some("is after the latest available version 12")) // after 12 + } + } +} diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/snapshot/MetadataCleanupSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/snapshot/MetadataCleanupSuite.scala new file mode 100644 index 00000000000..04afb67bd68 --- /dev/null +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/internal/snapshot/MetadataCleanupSuite.scala @@ -0,0 +1,308 @@ 
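// [Illustrative sketch, not part of this diff] The TableImplSuite cases above assert the
// "before or at" / "at or after" timestamp resolution against mock commit files whose
// modification time is `version * 10`. A minimal, self-contained restatement of that
// semantics, using hypothetical names and plain (version, timestamp) tuples instead of
// Kernel's classes:
object TimestampResolutionSketch {
  // commits: (version, commitTimestampMillis) pairs, sorted by version
  def versionBeforeOrAt(commits: Seq[(Long, Long)], ts: Long): Option[Long] =
    commits.filter { case (_, commitTs) => commitTs <= ts }.lastOption.map(_._1)

  def versionAtOrAfter(commits: Seq[(Long, Long)], ts: Long): Option[Long] =
    commits.find { case (_, commitTs) => commitTs >= ts }.map(_._1)

  def main(args: Array[String]): Unit = {
    val commits = Seq(0L -> 0L, 1L -> 10L) // mirrors deltaFileStatuses(Seq(0L, 1L)) above
    assert(versionBeforeOrAt(commits, 5) == Some(0L)) // rounds down between commits
    assert(versionAtOrAfter(commits, 5) == Some(1L))  // rounds up between commits
    assert(versionBeforeOrAt(commits, -1).isEmpty)    // "before the earliest available version"
    assert(versionAtOrAfter(commits, 11).isEmpty)     // "after the latest available version"
  }
}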
+/* + * Copyright (2023) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.delta.kernel.internal.snapshot + +import io.delta.kernel.internal.snapshot.MetadataCleanup.cleanupExpiredLogs +import io.delta.kernel.internal.util.ManualClock +import io.delta.kernel.test.{MockFileSystemClientUtils, MockListFromDeleteFileSystemClient} +import io.delta.kernel.utils.FileStatus +import org.scalatest.funsuite.AnyFunSuite + +/** + * Test suite for the metadata cleanup logic in the Delta log directory. It mocks the + * `FileSystemClient` to test the cleanup logic for various combinations of delta files and + * checkpoint files. Utility methods in `MockFileSystemClientUtils` are used to generate the + * log file statuses which usually have modification time equal to `version * 10`. + */ +class MetadataCleanupSuite extends AnyFunSuite with MockFileSystemClientUtils { + + import MetadataCleanupSuite._ + + /* ------------------- TESTS ------------------ */ + + // Simple case where the Delta log directory contains only delta files and no checkpoint files + Seq( + ( + "no files should be deleted even when some of them are expired", + DeletedFileList(), // expected deleted files - none of them should be deleted + 70, // current time + 30 // retention period + ), + ( + "no files should be deleted as none of them are expired", + DeletedFileList(), // expected deleted files - none of them should be deleted + 200, // current time + 200 // retention period + ), + ( + "no files should be deleted even when all of them are expired", + DeletedFileList(), // expected deleted files - none of them should be deleted + 200, // current time + 0 // retention period + ) + ).foreach { + case (testName, expectedDeletedFiles, currentTime, retentionPeriod) => + // _delta_log directory contents - contains only delta files + val logFiles = deltaFileStatuses(Seq(0, 1, 2, 3, 4, 5, 6)) + test(s"metadataCleanup: $testName: $currentTime, $retentionPeriod") { + cleanupAndVerify(logFiles, expectedDeletedFiles.fileList(), currentTime, retentionPeriod) + } + } + + // with various checkpoint types + Seq("classic", "multi-part", "v2", "hybrid").foreach { checkpointType => + // _delta_log directory contains a combination of delta files and checkpoint files + + val logFiles = deltaFileStatuses(Seq(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)) ++ + (checkpointType match { + case "classic" => + singularCheckpointFileStatuses(Seq(3, 6, 9, 12)) + case "multi-part" => + multiCheckpointFileStatuses(Seq(3, 6, 9, 12), multiPartCheckpointPartsSize) + case "v2" => + v2CPFileStatuses(Seq[Long](3, 6, 9, 12)) + case "hybrid" => + singularCheckpointFileStatuses(Seq(3)) ++ + multiCheckpointFileStatuses(Seq(6), numParts = multiPartCheckpointPartsSize) ++ + v2CPFileStatuses(Seq[Long](9)) ++ + singularCheckpointFileStatuses(Seq(12)) + }) + + // test cases + Seq( + ( + "delete expired delta files up to the checkpoint version, " + + "not all expired delta files are deleted", + Seq(0L, 1L, 2L), // expDeletedDeltaVersions, + Seq(),
// expDeletedCheckpointVersions, + 130, // current time + 80 // retention period + ), + ( + "expired delta files + expired checkpoint should be deleted", + Seq(0L, 1L, 2L, 3L, 4L, 5L), // expDeletedDeltaVersions, + Seq(3L), // expDeletedCheckpointVersions, + 130, // current time + 60 // retention period + ), + ( + "expired delta files + expired checkpoints should be deleted", + Seq(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), // expDeletedDeltaVersions, + Seq(3L, 6L), // expDeletedCheckpointVersions, + 130, // current time + 40 // retention period + ), + ( + "all delta/checkpoint files should be deleted except the last checkpoint file", + Seq(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L), // expDeletedDeltaVersions, + Seq(3L, 6L, 9L), // expDeletedCheckpointVersions, + 130, // current time + 0 // retention period + ), + ( + "no delta/checkpoint files should be deleted as none expired", + Seq(), // expDeletedDeltaVersions + Seq(), // expDeletedCheckpointVersions + 200, // current time + 200 // retention period + ) + ).foreach { + case (testName, expDeletedDeltaVersions, expDeletedCheckpointVersions, + currentTime, retentionPeriod) => + + val expectedDeletedFiles = DeletedFileList( + deltaVersions = expDeletedDeltaVersions, + classicCheckpointVersions = checkpointType match { + case "classic" => expDeletedCheckpointVersions + case "hybrid" => expDeletedCheckpointVersions.filter(Seq(3, 12).contains(_)) + case _ => Seq.empty + }, + multipartCheckpointVersions = checkpointType match { + case "multi-part" => expDeletedCheckpointVersions + case "hybrid" => expDeletedCheckpointVersions.filter(_ == 6) + case _ => Seq.empty + }, + v2CheckpointVersions = checkpointType match { + case "v2" => expDeletedCheckpointVersions + case "hybrid" => expDeletedCheckpointVersions.filter(_ == 9) + case _ => Seq.empty + } + ) + + test(s"metadataCleanup: $checkpointType: $testName: $currentTime, $retentionPeriod") { + cleanupAndVerify(logFiles, expectedDeletedFiles.fileList(), currentTime, retentionPeriod) + } + } + } + + test("first log entry is a checkpoint") { + val logFiles = multiCheckpointFileStatuses(Seq(25), multiPartCheckpointPartsSize) ++ + singularCheckpointFileStatuses(Seq(29)) ++ + deltaFileStatuses(Seq(25, 26, 27, 28, 29, 30, 31, 32)) + + Seq( + ( + 330, // current time + 50, // retention period + DeletedFileList() // expected deleted files - none of them should be deleted + ), + ( + 330, // current time + 30, // retention period + DeletedFileList( + deltaVersions = Seq(25, 26, 27, 28), + multipartCheckpointVersions = Seq(25) + ) + ), + ( + 330, // current time + 10, // retention period + DeletedFileList( + deltaVersions = Seq(25, 26, 27, 28), + multipartCheckpointVersions = Seq(25) + ) + ) + ).foreach { + case (currentTime, retentionPeriod, expectedDeletedFiles) => + cleanupAndVerify(logFiles, expectedDeletedFiles.fileList(), currentTime, retentionPeriod) + } + } + + /* ------------------- NEGATIVE TESTS ------------------ */ + test("metadataCleanup: invalid retention period") { + val e = intercept[IllegalArgumentException] { + cleanupExpiredLogs( + mockEngine(mockFsClient(Seq.empty)), + new ManualClock(100), + logPath, + -1 /* retentionPeriod */ + ) + } + + assert(e.getMessage.contains("Retention period must be non-negative")) + } + + test("incomplete checkpoints should not be considered") { + val logFiles = deltaFileStatuses(Seq(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)) ++ + multiCheckpointFileStatuses(Seq(3), multiPartCheckpointPartsSize) + // delete the third part of the checkpoint +
.filterNot(_.getPath.contains(s"%010d.%010d".format(2, 4))) ++ + multiCheckpointFileStatuses(Seq(6), multiPartCheckpointPartsSize) ++ + v2CPFileStatuses(Seq(9)) + + // test cases + Seq( + ( + Seq[Long](), // expDeletedDeltaVersions, + Seq[Long](), // expDeletedCheckpointVersions, + 130, // current time + 80 // retention period + ), + ( + Seq(0L, 1L, 2L, 3L, 4L, 5L), // expDeletedDeltaVersions, + Seq(3L), // expDeletedCheckpointVersions, + 130, // current time + 60 // retention period + ), + ( + Seq(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), // expDeletedDeltaVersions, + Seq(3L, 6L), // expDeletedCheckpointVersions, + 130, // current time + 20 // retention period + ) + ).foreach { + case (expDeletedDeltaVersions, expDeletedCheckpointVersions, + currentTime, retentionPeriod) => + + val expectedDeletedFiles = (deltaFileStatuses(expDeletedDeltaVersions) ++ + expDeletedCheckpointVersions.flatMap { + case v@3 => multiCheckpointFileStatuses(Seq(v), multiPartCheckpointPartsSize) + .filterNot(_.getPath.contains(s"%010d.%010d".format(2, 4))) + case v@6 => multiCheckpointFileStatuses(Seq(v), multiPartCheckpointPartsSize) + case v@9 => v2CPFileStatuses(Seq(v)) + }).map(_.getPath) + + cleanupAndVerify(logFiles, expectedDeletedFiles, currentTime, retentionPeriod) + } + } + + /* ------------------- HELPER UTILITIES/CONSTANTS ------------------ */ + /** + * Cleanup the metadata log files and verify the expected deleted files. + * + * @param logFiles List of log files in the _delta_log directory + * @param expectedDeletedFiles List of expected deleted file paths + * @param currentTimeMillis Current time in millis + * @param retentionPeriodMillis Retention period in millis + */ + def cleanupAndVerify( + logFiles: Seq[FileStatus], + expectedDeletedFiles: Seq[String], + currentTimeMillis: Long, + retentionPeriodMillis: Long): Unit = { + val fsClient = mockFsClient(logFiles) + val resultDeletedCount = cleanupExpiredLogs( + mockEngine(fsClient), + new ManualClock(currentTimeMillis), + logPath, + retentionPeriodMillis + ) + + assert(resultDeletedCount === expectedDeletedFiles.size) + assert(fsClient.getDeleteCalls.toSet === expectedDeletedFiles.toSet) + } +} + +object MetadataCleanupSuite extends MockFileSystemClientUtils { + /* ------------------- HELPER UTILITIES/CONSTANTS ------------------ */ + private val multiPartCheckpointPartsSize = 4 + + /** Case class containing the list of expected files in the deleted metadata log file list */ + case class DeletedFileList( + deltaVersions: Seq[Long] = Seq.empty, + classicCheckpointVersions: Seq[Long] = Seq.empty, + multipartCheckpointVersions: Seq[Long] = Seq.empty, + v2CheckpointVersions: Seq[Long] = Seq.empty) { + + def fileList(): Seq[String] = { + (deltaFileStatuses(deltaVersions) ++ + singularCheckpointFileStatuses(classicCheckpointVersions) ++ + multiCheckpointFileStatuses(multipartCheckpointVersions, multiPartCheckpointPartsSize) ++ + v2CPFileStatuses(v2CheckpointVersions) + ).sortBy(_.getPath).map(_.getPath) + } + } + + def mockFsClient(logFiles: Seq[FileStatus]): MockListFromDeleteFileSystemClient = { + new MockListFromDeleteFileSystemClient(logFiles) + } + + def v2CPFileStatuses(versions: Seq[Long]): Seq[FileStatus] = { + // Replace the UUID with a standard UUID to make the test deterministic + val standardUUID = "123e4567-e89b-12d3-a456-426614174000" + val uuidPattern = + "[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}".r + + v2CheckpointFileStatuses( + versions.map(v => (v, true, 20)), // to (version, useUUID, numSidecars) + 
"parquet" + ).map(_._1) + .map(f => FileStatus.of( + uuidPattern.replaceAllIn(f.getPath, standardUUID), + f.getSize, + f.getModificationTime)) + } +} diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockEngineUtils.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockEngineUtils.scala index 671272bdabb..c7eb9caf7f9 100644 --- a/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockEngineUtils.scala +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockEngineUtils.scala @@ -159,4 +159,7 @@ trait BaseMockFileSystemClient extends FileSystemClient { override def mkdirs(path: String): Boolean = throw new UnsupportedOperationException("not supported in this test suite") + + override def delete(path: String): Boolean = + throw new UnsupportedOperationException("not supported in this test suite") } diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockFileSystemClientUtils.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockFileSystemClientUtils.scala index 00f42d0638a..0e8ed2c56b9 100644 --- a/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockFileSystemClientUtils.scala +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/test/MockFileSystemClientUtils.scala @@ -141,3 +141,48 @@ class MockListFromFileSystemClient(listFromProvider: String => Seq[FileStatus]) def getListFromCalls: Seq[String] = listFromCalls } + +/** + * A mock [[FileSystemClient]] that answers `listFrom` calls from a given content provider and + * implements the identity function for `resolvePath` calls. + * + * It also maintains metrics on number of times `listFrom` is called and arguments for each call. + */ +class MockListFromResolvePathFileSystemClient(listFromProvider: String => Seq[FileStatus]) + extends BaseMockFileSystemClient { + private var listFromCalls: Seq[String] = Seq.empty + + override def listFrom(filePath: String): CloseableIterator[FileStatus] = { + listFromCalls = listFromCalls :+ filePath + toCloseableIterator(listFromProvider(filePath).iterator.asJava) + } + + override def resolvePath(path: String): String = path + + def getListFromCalls: Seq[String] = listFromCalls +} + +/** + * A mock [[FileSystemClient]] that answers `listFrom` call from the given list of file statuses + * and tracks the delete calls. + * @param listContents List of file statuses to be returned by `listFrom` call. 
+ */ +class MockListFromDeleteFileSystemClient(listContents: Seq[FileStatus]) + extends BaseMockFileSystemClient { + private val listOfFiles: Seq[String] = listContents.map(_.getPath).toSeq + private var isListFromAlreadyCalled = false + private var deleteCalls: Seq[String] = Seq.empty + + override def listFrom(filePath: String): CloseableIterator[FileStatus] = { + assert(!isListFromAlreadyCalled, "listFrom should be called only once") + isListFromAlreadyCalled = true + toCloseableIterator(listContents.sortBy(_.getPath).asJava.iterator()) + } + + override def delete(path: String): Boolean = { + deleteCalls = deleteCalls :+ path + listOfFiles.contains(path) + } + + def getDeleteCalls: Seq[String] = deleteCalls +} diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala index 01c349a222f..2e4f8c29947 100644 --- a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/CollationIdentifierSuite.scala @@ -33,11 +33,11 @@ class CollationIdentifierSuite extends AnyFunSuite { ), ( s"$PROVIDER_ICU.sr_Cyrl_SRB", - new CollationIdentifier(PROVIDER_ICU, "sr_Cyrl_SRB") + CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB") ), ( s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1", - new CollationIdentifier(PROVIDER_ICU, "sr_Cyrl_SRB", Optional.of("75.1")) + CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1") ) ).foreach { case(stringIdentifier, collationIdentifier) => @@ -65,11 +65,11 @@ class CollationIdentifierSuite extends AnyFunSuite { s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME" ), ( - new CollationIdentifier(PROVIDER_ICU, "sr_Cyrl_SRB"), + CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"), s"$PROVIDER_ICU.SR_CYRL_SRB" ), ( - new CollationIdentifier(PROVIDER_ICU, "sr_Cyrl_SRB", Optional.of("75.1")), + CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"), s"$PROVIDER_ICU.SR_CYRL_SRB" ) ).foreach { @@ -85,11 +85,11 @@ class CollationIdentifierSuite extends AnyFunSuite { s"$PROVIDER_SPARK.$DEFAULT_COLLATION_NAME" ), ( - new CollationIdentifier(PROVIDER_ICU, "sr_Cyrl_SRB"), + CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB"), s"$PROVIDER_ICU.SR_CYRL_SRB" ), ( - new CollationIdentifier(PROVIDER_ICU, "sr_Cyrl_SRB", Optional.of("75.1")), + CollationIdentifier.fromString(s"$PROVIDER_ICU.sr_Cyrl_SRB.75.1"), s"$PROVIDER_ICU.SR_CYRL_SRB.75.1" ) ).foreach { @@ -98,4 +98,3 @@ class CollationIdentifierSuite extends AnyFunSuite { } } } - diff --git a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala index 4441d4dcdf7..d6acfa47e93 100644 --- a/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala +++ b/kernel/kernel-api/src/test/scala/io/delta/kernel/types/StringTypeSuite.scala @@ -19,6 +19,7 @@ import org.scalatest.funsuite.AnyFunSuite class StringTypeSuite extends AnyFunSuite { test("check equals") { + // Testcase: (instance1, instance2, expected value for `instance1 == instance2`) Seq( ( StringType.STRING, diff --git a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultCommitCoordinatorClientHandler.java b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultCommitCoordinatorClientHandler.java index 47f407bb4d9..38fbd7e48dd 100644 --- 
a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultCommitCoordinatorClientHandler.java +++ b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultCommitCoordinatorClientHandler.java @@ -30,9 +30,11 @@ import io.delta.storage.LogStore; import io.delta.storage.commit.CommitCoordinatorClient; import io.delta.storage.commit.CommitFailedException; +import io.delta.storage.commit.TableDescriptor; import java.io.IOException; import java.util.Iterator; import java.util.Map; +import java.util.Optional; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; @@ -89,8 +91,11 @@ public Map registerTable( long currentVersion, AbstractMetadata currentMetadata, AbstractProtocol currentProtocol) { + // TODO: Introduce table identifier concept in Table API in Kernel and plumb the + // table identifier into `CommitCoordinatorClient` in all APIs. return commitCoordinatorClient.registerTable( new Path(logPath), + Optional.empty() /* table identifier */, currentVersion, StorageKernelAPIAdapter.toStorageAbstractMetadata(currentMetadata), StorageKernelAPIAdapter.toStorageAbstractProtocol(currentProtocol)); @@ -111,8 +116,7 @@ public CommitResponse commit( commitCoordinatorClient.commit( logStore, hadoopConf, - path, - tableConf, + new TableDescriptor(path, Optional.empty() /* table identifier */, tableConf), commitVersion, new Iterator() { @Override @@ -134,8 +138,9 @@ public String next() { @Override public GetCommitsResponse getCommits( String logPath, Map tableConf, Long startVersion, Long endVersion) { + TableDescriptor tableDesc = new TableDescriptor(new Path(logPath), Optional.empty(), tableConf); return StorageKernelAPIAdapter.toKernelAPIGetCommitsResponse( - commitCoordinatorClient.getCommits(new Path(logPath), tableConf, startVersion, endVersion)); + commitCoordinatorClient.getCommits(tableDesc, startVersion, endVersion)); } @Override @@ -144,8 +149,10 @@ public void backfillToVersion( throws IOException { Path path = new Path(logPath); LogStore logStore = LogStoreProvider.getLogStore(hadoopConf, path.toUri().getScheme()); + TableDescriptor tableDesc = + new TableDescriptor(path, Optional.empty() /* table identifier */, tableConf); commitCoordinatorClient.backfillToVersion( - logStore, hadoopConf, path, tableConf, version, lastKnownBackfilledVersion); + logStore, hadoopConf, tableDesc, version, lastKnownBackfilledVersion); } @Override diff --git a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultFileSystemClient.java b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultFileSystemClient.java index e80923554a1..c226e61a5ad 100644 --- a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultFileSystemClient.java +++ b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/engine/DefaultFileSystemClient.java @@ -99,6 +99,13 @@ public boolean mkdirs(String path) throws IOException { return fs.mkdirs(pathObject); } + @Override + public boolean delete(String path) throws IOException { + Path pathObject = new Path(path); + FileSystem fs = pathObject.getFileSystem(hadoopConf); + return fs.delete(pathObject, false); + } + private ByteArrayInputStream getStream(String filePath, int offset, int size) { Path path = new Path(filePath); try { diff --git a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java
b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java index bf7c98fdc84..b59db8689ab 100644 --- a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java +++ b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/DefaultExpressionUtils.java @@ -35,19 +35,6 @@ class DefaultExpressionUtils { static final Comparator BIGDECIMAL_COMPARATOR = Comparator.naturalOrder(); - static final Comparator STRING_COMPARATOR = - (leftOp, rightOp) -> { - byte[] leftBytes = leftOp.getBytes(StandardCharsets.UTF_8); - byte[] rightBytes = rightOp.getBytes(StandardCharsets.UTF_8); - int i = 0; - while (i < leftBytes.length && i < rightBytes.length) { - if (leftBytes[i] != rightBytes[i]) { - return Byte.toUnsignedInt(leftBytes[i]) - Byte.toUnsignedInt(rightBytes[i]); - } - i++; - } - return Integer.compare(leftBytes.length, rightBytes.length); - }; static final Comparator BINARY_COMPARTOR = (leftOp, rightOp) -> { int i = 0; @@ -59,6 +46,12 @@ class DefaultExpressionUtils { } return Integer.compare(leftOp.length, rightOp.length); }; + static final Comparator STRING_COMPARATOR = + (leftOp, rightOp) -> { + byte[] leftBytes = leftOp.getBytes(StandardCharsets.UTF_8); + byte[] rightBytes = rightOp.getBytes(StandardCharsets.UTF_8); + return BINARY_COMPARTOR.compare(leftBytes, rightBytes); + }; private DefaultExpressionUtils() {} diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableReadsSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableReadsSuite.scala index 9638be9e290..36f88fd57bc 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableReadsSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/DeltaTableReadsSuite.scala @@ -18,12 +18,15 @@ package io.delta.kernel.defaults import io.delta.golden.GoldenTableUtils.goldenTablePath import io.delta.kernel.exceptions.{InvalidTableException, KernelException, TableNotFoundException} import io.delta.kernel.defaults.utils.{TestRow, TestUtils} +import io.delta.kernel.internal.TableImpl import io.delta.kernel.internal.fs.Path import io.delta.kernel.internal.util.InternalUtils.daysSinceEpoch import io.delta.kernel.internal.util.{DateTimeConstants, FileNames} import io.delta.kernel.types.{LongType, StructType} import io.delta.kernel.Table import org.apache.hadoop.shaded.org.apache.commons.io.FileUtils +import org.apache.spark.sql.delta.{DeltaLog, DeltaOperations} +import org.apache.spark.sql.delta.actions.{AddFile, Metadata} import org.apache.spark.sql.functions.col import org.scalatest.funsuite.AnyFunSuite @@ -859,4 +862,141 @@ class DeltaTableReadsSuite extends AnyFunSuite with TestUtils { } } } + + /////////////////////////////////////////////////////////////////////////////////////////////// + // getVersionBeforeOrAtTimestamp + getVersionAtOrAfterTimestamp tests + // (more in TableImplSuite and DeltaHistoryManagerSuite) + ////////////////////////////////////////////////////////////////////////////////////////////// + + // Copied from Standalone DeltaLogSuite + test("getVersionBeforeOrAtTimestamp and getVersionAtOrAfterTimestamp") { + // Note: + // - all Xa test cases will test getVersionBeforeOrAtTimestamp + // - all Xb test cases will test getVersionAtOrAfterTimestamp + withTempDir { dir => + val log = DeltaLog.forTable(spark, dir.getCanonicalPath) + val tableImpl = Table.forPath(defaultEngine, 
dir.getCanonicalPath).asInstanceOf[TableImpl] + + // ========== case 0: delta table does not exist ========== + intercept[TableNotFoundException] { + tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, System.currentTimeMillis()) + } + intercept[TableNotFoundException] { + tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, System.currentTimeMillis()) + } + + // Setup part 1 of 2: create log files + (0 to 2).foreach { i => + val files = AddFile(i.toString, Map.empty, 1, 1, true) :: Nil + val metadata = if (i == 0) Metadata() :: Nil else Nil + log.startTransaction().commit( metadata ++ files, DeltaOperations.ManualUpdate) + } + + // Setup part 2 of 2: edit lastModified times + val logPath = new Path(dir.getCanonicalPath, "_delta_log") + + val delta0 = new File(FileNames.deltaFile(logPath, 0)) + val delta1 = new File(FileNames.deltaFile(logPath, 1)) + val delta2 = new File(FileNames.deltaFile(logPath, 2)) + delta0.setLastModified(1000) + delta1.setLastModified(2000) + delta2.setLastModified(3000) + + // ========== case 1: before first commit ========== + // case 1a + val e1 = intercept[KernelException] { + tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, 500) + }.getMessage + assert(e1.contains("is before the earliest available version 0")) + // case 1b + assert(tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, 500) == 0) + + // ========== case 2: at first commit ========== + // case 2a + assert(tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, 1000) == 0) + // case 2b + assert(tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, 1000) == 0) + + // ========== case 3: between two normal commits ========== + // case 3a + assert(tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, 1500) == 0) // round down to v0 + // case 3b + assert(tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, 1500) == 1) // round up to v1 + + // ========== case 4: at last commit ========== + // case 4a + assert(tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, 3000) == 2) + // case 4b + assert(tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, 3000) == 2) + + // ========== case 5: after last commit ========== + // case 5a + assert(tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, 4000) == 2) + // case 5b + val e2 = intercept[KernelException] { + tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, 4000) + }.getMessage + assert(e2.contains("is after the latest available version 2")) + } + } + + // Copied from Standalone DeltaLogSuite + test("getVersionBeforeOrAtTimestamp and getVersionAtOrAfterTimestamp - recoverability") { + withTempDir { dir => + // local file system truncates to seconds + val nowEpochMs = System.currentTimeMillis() / 1000 * 1000 + + val logPath = new Path(dir.getCanonicalPath, "_delta_log") + + val log = DeltaLog.forTable(spark, dir.getCanonicalPath) + val tableImpl = Table.forPath(defaultEngine, dir.getCanonicalPath).asInstanceOf[TableImpl] + + (0 to 35).foreach { i => + val files = AddFile(i.toString, Map.empty, 1, 1, true) :: Nil + val metadata = if (i == 0) Metadata() :: Nil else Nil + log.startTransaction().commit(metadata ++ files, DeltaOperations.ManualUpdate) + } + + (0 to 35).foreach { i => + val delta = new File(FileNames.deltaFile(logPath, i)) + if (i >= 25) { + delta.setLastModified(nowEpochMs + i * 1000) + } else { + assert(delta.delete()) + } + } + + // A checkpoint exists at version 30, so all versions [30, 35] are recoverable. 
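+ // (Illustrative aside, not part of this diff; it assumes checkpoints were written at
+ // versions 10, 20 and 30 via Spark's default checkpoint interval of 10, while only
+ // deltas 25-35 survive the deletion above.) "Available" vs "recreatable" can be sketched as:
+ //   val availableDeltas = (25L to 35L).toSeq
+ //   val checkpoints = Seq(10L, 20L, 30L)
+ //   val earliestAvailable = availableDeltas.min // 25
+ //   val earliestRecreatable = checkpoints.sorted.find { cp =>
+ //     ((cp + 1) to availableDeltas.max).forall(availableDeltas.contains)
+ //   } // Some(30): only from the version-30 checkpoint is the delta chain contiguous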
+ // Nonetheless, getVersionBeforeOrAtTimestamp and getVersionAtOrAfterTimestamp do not + // require that the version is recoverable, so we should still be able to get back versions + // [25-29] + + (25 to 34).foreach { i => + if (i == 25) { + assertThrows[KernelException] { + tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, nowEpochMs + i * 1000 - 1) + } + } else { + assert(tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, nowEpochMs + i * 1000 - 1) + == i - 1) + } + + assert( + tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, nowEpochMs + i * 1000 - 1) == i) + + assert(tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, nowEpochMs + i * 1000) == i) + assert(tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, nowEpochMs + i * 1000) == i) + + assert( + tableImpl.getVersionBeforeOrAtTimestamp(defaultEngine, nowEpochMs + i * 1000 + 1)== i) + + if (i == 35) { + tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, nowEpochMs + i * 1000 + 1) + } else { + assert(tableImpl.getVersionAtOrAfterTimestamp(defaultEngine, nowEpochMs + i * 1000 + 1) + == i + 1) + } + } + } + } } diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala new file mode 100644 index 00000000000..5d6950d856e --- /dev/null +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/TableChangesSuite.scala @@ -0,0 +1,589 @@ +/* + * Copyright (2024) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package io.delta.kernel.defaults + +import java.io.File + +import scala.collection.JavaConverters._ + +import io.delta.golden.GoldenTableUtils.goldenTablePath +import io.delta.kernel.data.Row +import io.delta.kernel.data.ColumnarBatch +import io.delta.kernel.defaults.utils.TestUtils +import io.delta.kernel.utils.CloseableIterator +import io.delta.kernel.internal.DeltaLogActionUtils.DeltaAction +import io.delta.kernel.internal.actions.{AddCDCFile, AddFile, CommitInfo, Metadata, Protocol, RemoveFile} +import io.delta.kernel.internal.util.{FileNames, VectorUtils} +import io.delta.kernel.Table +import io.delta.kernel.exceptions.{KernelException, TableNotFoundException} +import io.delta.kernel.internal.TableImpl +import io.delta.kernel.internal.fs.Path + +import org.apache.spark.sql.delta.actions.{Action => SparkAction, AddCDCFile => SparkAddCDCFile, AddFile => SparkAddFile, CommitInfo => SparkCommitInfo, Metadata => SparkMetadata, Protocol => SparkProtocol, RemoveFile => SparkRemoveFile} +import org.apache.spark.sql.delta.DeltaLog +import org.scalatest.funsuite.AnyFunSuite +import org.apache.spark.sql.functions.col + +class TableChangesSuite extends AnyFunSuite with TestUtils { + + /* actionSet including all currently supported actions */ + val FULL_ACTION_SET: Set[DeltaAction] = DeltaAction.values().toSet + + ////////////////////////////////////////////////////////////////////////////////// + // TableImpl.getChangesByVersion tests + ////////////////////////////////////////////////////////////////////////////////// + + /** + * For the given parameters, read the table changes from Kernel using + * TableImpl.getChangesByVersion and compare results with Spark + */ + def testGetChangesVsSpark( + tablePath: String, + startVersion: Long, + endVersion: Long, + actionSet: Set[DeltaAction]): Unit = { + + val sparkChanges = DeltaLog.forTable(spark, tablePath) + .getChanges(startVersion) + .filter(_._1 <= endVersion) // Spark API does not have endVersion + + val kernelChanges = Table.forPath(defaultEngine, tablePath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, startVersion, endVersion, actionSet.asJava) + .toSeq + + // Check schema is as expected (version + timestamp column + the actions requested) + kernelChanges.foreach { batch => + batch.getSchema.fields().asScala sameElements + (Seq("version", "timestamp") ++ actionSet.map(_.colName)) + } + + compareActions(kernelChanges, pruneSparkActionsByActionSet(sparkChanges, actionSet)) + } + + // Golden table from Delta Standalone test + test("getChanges - golden table deltalog-getChanges valid queries") { + withGoldenTable("deltalog-getChanges") { tablePath => + // request subset of actions + testGetChangesVsSpark( + tablePath, + 0, + 2, + Set(DeltaAction.REMOVE) + ) + testGetChangesVsSpark( + tablePath, + 0, + 2, + Set(DeltaAction.ADD) + ) + testGetChangesVsSpark( + tablePath, + 0, + 2, + Set(DeltaAction.ADD, DeltaAction.REMOVE, DeltaAction.METADATA, DeltaAction.PROTOCOL) + ) + // request full actions, various versions + testGetChangesVsSpark( + tablePath, + 0, + 2, + FULL_ACTION_SET + ) + testGetChangesVsSpark( + tablePath, + 1, + 2, + FULL_ACTION_SET + ) + testGetChangesVsSpark( + tablePath, + 0, + 0, + FULL_ACTION_SET + ) + } + } + + test("getChanges - returns correct timestamps") { + withTempDir { tempDir => + + def generateCommits(path: String, commits: Long*): Unit = { + commits.zipWithIndex.foreach { case (ts, i) => + spark.range(i*10, i*10 + 10).write.format("delta").mode("append").save(path) + val file = new 
File(FileNames.deltaFile(new Path(path, "_delta_log"), i)) + file.setLastModified(ts) + } + } + + val start = 1540415658000L + val minuteInMilliseconds = 60000L + generateCommits(tempDir.getCanonicalPath, start, start + 20 * minuteInMilliseconds, + start + 40 * minuteInMilliseconds) + val versionToTimestamp: Map[Long, Long] = Map( + 0L -> start, + 1L -> (start + 20 * minuteInMilliseconds), + 2L -> (start + 40 * minuteInMilliseconds) + ) + + // Check the timestamps are returned correctly + Table.forPath(defaultEngine, tempDir.getCanonicalPath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 2, Set(DeltaAction.ADD).asJava) + .toSeq + .flatMap(_.getRows.toSeq) + .foreach { row => + val version = row.getLong(0) + val timestamp = row.getLong(1) + assert(timestamp == versionToTimestamp(version), + f"Expected timestamp ${versionToTimestamp(version)} for version $version but" + + f"Kernel returned timestamp $timestamp") + } + + // Check contents as well + testGetChangesVsSpark( + tempDir.getCanonicalPath, + 0, + 2, + FULL_ACTION_SET + ) + } + } + + test("getChanges - empty _delta_log folder") { + withTempDir { tempDir => + new File(tempDir, "delta_log").mkdirs() + intercept[TableNotFoundException] { + Table.forPath(defaultEngine, tempDir.getCanonicalPath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 2, FULL_ACTION_SET.asJava) + } + } + } + + test("getChanges - empty folder no _delta_log dir") { + withTempDir { tempDir => + intercept[TableNotFoundException] { + Table.forPath(defaultEngine, tempDir.getCanonicalPath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 2, FULL_ACTION_SET.asJava) + } + } + } + + test("getChanges - non-empty folder not a delta table") { + withTempDir { tempDir => + spark.range(20).write.format("parquet").mode("overwrite").save(tempDir.getCanonicalPath) + intercept[TableNotFoundException] { + Table.forPath(defaultEngine, tempDir.getCanonicalPath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 2, FULL_ACTION_SET.asJava) + } + } + } + + test("getChanges - directory does not exist") { + intercept[TableNotFoundException] { + Table.forPath(defaultEngine, "/fake/table/path") + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 2, FULL_ACTION_SET.asJava) + } + } + + test("getChanges - golden table deltalog-getChanges invalid queries") { + withGoldenTable("deltalog-getChanges") { tablePath => + def getChangesByVersion( + startVersion: Long, endVersion: Long): CloseableIterator[ColumnarBatch] = { + Table.forPath(defaultEngine, tablePath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, startVersion, endVersion, FULL_ACTION_SET.asJava) + } + + // startVersion after latest available version + assert(intercept[KernelException]{ + getChangesByVersion(3, 8) + }.getMessage.contains("no log files found in the requested version range")) + + // endVersion larger than latest available version + assert(intercept[KernelException]{ + getChangesByVersion(0, 8) + }.getMessage.contains("no log file found for version 8")) + + // invalid start version + assert(intercept[KernelException]{ + getChangesByVersion(-1, 2) + }.getMessage.contains("Invalid version range")) + + // invalid end version + assert(intercept[KernelException]{ + getChangesByVersion(2, 1) + }.getMessage.contains("Invalid version range")) + } + } + + test("getChanges - with truncated log") { + withTempDir { tempDir => + // PREPARE TEST TABLE + val tablePath = tempDir.getCanonicalPath + // Write versions [0, 10] (inclusive) including a checkpoint + (0 to 10).foreach { i => + 
spark.range(i*10, i*10 + 10).write + .format("delta") + .mode("append") + .save(tablePath) + } + val log = org.apache.spark.sql.delta.DeltaLog.forTable( + spark, new org.apache.hadoop.fs.Path(tablePath)) + val deltaCommitFileProvider = org.apache.spark.sql.delta.util.DeltaCommitFileProvider( + log.unsafeVolatileSnapshot) + // Delete the log files for versions 0-9, truncating the table history to version 10 + (0 to 9).foreach { i => + val jsonFile = deltaCommitFileProvider.deltaFile(i) + new File(new org.apache.hadoop.fs.Path(log.logPath, jsonFile).toUri).delete() + } + // Create version 11 that overwrites the whole table + spark.range(50).write + .format("delta") + .mode("overwrite") + .save(tablePath) + // Create version 12 that appends new data + spark.range(10).write + .format("delta") + .mode("append") + .save(tablePath) + + // TEST ERRORS + // endVersion before earliest available version + assert(intercept[KernelException] { + Table.forPath(defaultEngine, tablePath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 9, FULL_ACTION_SET.asJava) + }.getMessage.contains("no log files found in the requested version range")) + + // startVersion less than the earliest available version + assert(intercept[KernelException] { + Table.forPath(defaultEngine, tablePath) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 5, 11, FULL_ACTION_SET.asJava) + }.getMessage.contains("no log file found for version 5")) + + // TEST VALID CASES + testGetChangesVsSpark( + tablePath, + 10, + 12, + FULL_ACTION_SET + ) + testGetChangesVsSpark( + tablePath, + 11, + 12, + FULL_ACTION_SET + ) + } + } + + test("getChanges - table with a lot of changes") { + withTempDir { tempDir => + spark.sql( + f""" + |CREATE TABLE delta.`${tempDir.getCanonicalPath}` (id LONG, month LONG) + |USING DELTA + |PARTITIONED BY (month) + |TBLPROPERTIES (delta.enableChangeDataFeed = true) + |""".stripMargin) + spark.range(100).withColumn("month", col("id") % 12 + 1) + .write + .format("delta") + .mode("append") + .save(tempDir.getCanonicalPath) + spark.sql( // cdc actions + f""" + |UPDATE delta.`${tempDir.getCanonicalPath}` SET month = 1 WHERE id < 10 + |""".stripMargin) + spark.sql( + f""" + |DELETE FROM delta.`${tempDir.getCanonicalPath}` WHERE month = 12 + |""".stripMargin) + spark.sql( + f""" + |DELETE FROM delta.`${tempDir.getCanonicalPath}` WHERE id = 52 + |""".stripMargin) + spark.range(100, 150).withColumn("month", col("id") % 12) + .write + .format("delta") + .mode("overwrite") + .save(tempDir.getCanonicalPath) + spark.sql( // change metadata + f""" + |ALTER TABLE delta.`${tempDir.getCanonicalPath}` + |ADD CONSTRAINT validMonth CHECK (month <= 12) + |""".stripMargin) + + // Check all actions are correctly retrieved + testGetChangesVsSpark( + tempDir.getCanonicalPath, + 0, + 6, + FULL_ACTION_SET + ) + // Check some subset of actions + testGetChangesVsSpark( + tempDir.getCanonicalPath, + 0, + 6, + Set(DeltaAction.ADD) + ) + } + } + + test("getChanges - fails when protocol is not readable by Kernel") { + // Existing tests suffice to check if the protocol column is present/dropped correctly + // We test our protocol checks for table features in TableFeaturesSuite + // Min reader version is too high + assert(intercept[KernelException] { + // Use toSeq because we need to consume the iterator to force the exception + Table.forPath(defaultEngine, goldenTablePath("deltalog-invalid-protocol-version")) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 0, FULL_ACTION_SET.asJava).toSeq + }.getMessage.contains("Unsupported 
Delta protocol reader version")) + // We still get an error if we don't request the protocol file action + assert(intercept[KernelException] { + Table.forPath(defaultEngine, goldenTablePath("deltalog-invalid-protocol-version")) + .asInstanceOf[TableImpl] + .getChanges(defaultEngine, 0, 0, Set(DeltaAction.ADD).asJava).toSeq + }.getMessage.contains("Unsupported Delta protocol reader version")) + } + + ////////////////////////////////////////////////////////////////////////////////// + // Helpers to compare actions returned between Kernel and Spark + ////////////////////////////////////////////////////////////////////////////////// + + // Standardize actions with case classes, keeping just a few fields to compare + trait StandardAction + + case class StandardRemove( + path: String, + dataChange: Boolean, + partitionValues: Map[String, String]) extends StandardAction + + case class StandardAdd( + path: String, + partitionValues: Map[String, String], + size: Long, + modificationTime: Long, + dataChange: Boolean) extends StandardAction + + case class StandardMetadata( + id: String, + schemaString: String, + partitionColumns: Seq[String], + configuration: Map[String, String]) extends StandardAction + + case class StandardProtocol( + minReaderVersion: Int, + minWriterVersion: Int, + readerFeatures: Set[String], + writerFeatures: Set[String]) extends StandardAction + + case class StandardCommitInfo( + operation: String, + operationMetrics: Map[String, String]) extends StandardAction + + case class StandardCdc( + path: String, + partitionValues: Map[String, String], + size: Long, + tags: Map[String, String]) extends StandardAction + + def standardizeKernelAction(row: Row): Option[StandardAction] = { + val actionIdx = (2 until row.getSchema.length()).find(!row.isNullAt(_)).getOrElse( + return None + ) + + row.getSchema.at(actionIdx).getName match { + case DeltaAction.REMOVE.colName => + val removeRow = row.getStruct(actionIdx) + val partitionValues: Map[String, String] = { // partitionValues is nullable for removes + if (removeRow.isNullAt(RemoveFile.FULL_SCHEMA.indexOf("partitionValues"))) { + null + } else { + VectorUtils.toJavaMap[String, String]( + removeRow.getMap(RemoveFile.FULL_SCHEMA.indexOf("partitionValues"))).asScala.toMap + } + } + Some(StandardRemove( + removeRow.getString(RemoveFile.FULL_SCHEMA.indexOf("path")), + removeRow.getBoolean(RemoveFile.FULL_SCHEMA.indexOf("dataChange")), + partitionValues + )) + + case DeltaAction.ADD.colName => + val addRow = row.getStruct(actionIdx) + Some(StandardAdd( + addRow.getString(AddFile.FULL_SCHEMA.indexOf("path")), + VectorUtils.toJavaMap[String, String]( + addRow.getMap(AddFile.FULL_SCHEMA.indexOf("partitionValues"))).asScala.toMap, + addRow.getLong(AddFile.FULL_SCHEMA.indexOf("size")), + addRow.getLong(AddFile.FULL_SCHEMA.indexOf("modificationTime")), + addRow.getBoolean(AddFile.FULL_SCHEMA.indexOf("dataChange")) + )) + + case DeltaAction.METADATA.colName => + val metadataRow = row.getStruct(actionIdx) + Some(StandardMetadata( + metadataRow.getString(Metadata.FULL_SCHEMA.indexOf("id")), + metadataRow.getString(Metadata.FULL_SCHEMA.indexOf("schemaString")), + VectorUtils.toJavaList( + metadataRow.getArray(Metadata.FULL_SCHEMA.indexOf("partitionColumns"))).asScala, + VectorUtils.toJavaMap[String, String]( + metadataRow.getMap(Metadata.FULL_SCHEMA.indexOf("configuration"))).asScala.toMap + )) + + case DeltaAction.PROTOCOL.colName => + val protocolRow = row.getStruct(actionIdx) + val readerFeatures = + if 
(protocolRow.isNullAt(Protocol.FULL_SCHEMA.indexOf("readerFeatures"))) { + Seq() + } else { + VectorUtils.toJavaList( + protocolRow.getArray(Protocol.FULL_SCHEMA.indexOf("readerFeatures"))).asScala + } + val writerFeatures = + if (protocolRow.isNullAt(Protocol.FULL_SCHEMA.indexOf("writerFeatures"))) { + Seq() + } else { + VectorUtils.toJavaList( + protocolRow.getArray(Protocol.FULL_SCHEMA.indexOf("writerFeatures"))).asScala + } + + Some(StandardProtocol( + protocolRow.getInt(Protocol.FULL_SCHEMA.indexOf("minReaderVersion")), + protocolRow.getInt(Protocol.FULL_SCHEMA.indexOf("minWriterVersion")), + readerFeatures.toSet, + writerFeatures.toSet + )) + + case DeltaAction.COMMITINFO.colName => + val commitInfoRow = row.getStruct(actionIdx) + val operationIdx = CommitInfo.FULL_SCHEMA.indexOf("operation") + val operationMetricsIdx = CommitInfo.FULL_SCHEMA.indexOf("operationMetrics") + + Some(StandardCommitInfo( + if (commitInfoRow.isNullAt(operationIdx)) null else commitInfoRow.getString(operationIdx), + if (commitInfoRow.isNullAt(operationMetricsIdx)) { + Map.empty + } else { + VectorUtils.toJavaMap[String, String]( + commitInfoRow.getMap(operationMetricsIdx)).asScala.toMap + } + )) + + case DeltaAction.CDC.colName => + val cdcRow = row.getStruct(actionIdx) + val tags: Map[String, String] = { + if (cdcRow.isNullAt(AddCDCFile.FULL_SCHEMA.indexOf("tags"))) { + null + } else { + VectorUtils.toJavaMap[String, String]( + cdcRow.getMap(AddCDCFile.FULL_SCHEMA.indexOf("tags"))).asScala.toMap + } + } + Some(StandardCdc( + cdcRow.getString(AddCDCFile.FULL_SCHEMA.indexOf("path")), + VectorUtils.toJavaMap[String, String]( + cdcRow.getMap(AddCDCFile.FULL_SCHEMA.indexOf("partitionValues"))).asScala.toMap, + cdcRow.getLong(AddCDCFile.FULL_SCHEMA.indexOf("size")), + tags + )) + + case _ => + throw new RuntimeException("Encountered an action that hasn't been added as an option yet") + } + } + + def standardizeSparkAction(action: SparkAction): Option[StandardAction] = action match { + case remove: SparkRemoveFile => + Some(StandardRemove(remove.path, remove.dataChange, remove.partitionValues)) + case add: SparkAddFile => + Some(StandardAdd( + add.path, add.partitionValues, add.size, add.modificationTime, add.dataChange)) + case metadata: SparkMetadata => + Some(StandardMetadata( + metadata.id, metadata.schemaString, metadata.partitionColumns, metadata.configuration)) + case protocol: SparkProtocol => + Some(StandardProtocol( + protocol.minReaderVersion, + protocol.minWriterVersion, + protocol.readerFeatures.getOrElse(Set.empty), + protocol.writerFeatures.getOrElse(Set.empty) + )) + case commitInfo: SparkCommitInfo => + Some(StandardCommitInfo( + commitInfo.operation, + commitInfo.operationMetrics.getOrElse(Map.empty) + )) + case cdc: SparkAddCDCFile => + Some(StandardCdc(cdc.path, cdc.partitionValues, cdc.size, cdc.tags)) + case _ => None + } + + /** + * When we query the Spark actions using DeltaLog::getChanges ALL action types are returned. Since + * Kernel only returns actions in the provided `actionSet`, this function prunes the Spark actions to + * match `actionSet`.
+ */ + def pruneSparkActionsByActionSet( + sparkActions: Iterator[(Long, Seq[SparkAction])], + actionSet: Set[DeltaAction]): Iterator[(Long, Seq[SparkAction])] = { + sparkActions.map { case (version, actions) => + (version, + actions.filter { + case _: SparkRemoveFile => actionSet.contains(DeltaAction.REMOVE) + case _: SparkAddFile => actionSet.contains(DeltaAction.ADD) + case _: SparkMetadata => actionSet.contains(DeltaAction.METADATA) + case _: SparkProtocol => actionSet.contains(DeltaAction.PROTOCOL) + case _: SparkCommitInfo => actionSet.contains(DeltaAction.COMMITINFO) + case _: SparkAddCDCFile => actionSet.contains(DeltaAction.CDC) + case _ => false + } + ) + } + } + + def compareActions( + kernelActions: Seq[ColumnarBatch], + sparkActions: Iterator[(Long, Seq[SparkAction])]): Unit = { + + val standardKernelActions: Seq[(Long, StandardAction)] = { + kernelActions.flatMap(_.getRows.toSeq) + .map(row => (row.getLong(0), standardizeKernelAction(row))) + .filter(_._2.nonEmpty) + .map(t => (t._1, t._2.get)) + } + + val standardSparkActions: Seq[(Long, StandardAction)] = + sparkActions.flatMap { case (version, actions) => + actions.map(standardizeSparkAction(_)).flatten.map((version, _)) + }.toSeq + + assert(standardKernelActions sameElements standardSparkActions, + f"Kernel actions did not match Spark actions.\n" + + f"Kernel actions: $standardKernelActions\n" + + f"Spark actions: $standardSparkActions" + ) + } +} diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CommitCoordinatorProviderSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CommitCoordinatorProviderSuite.scala index 000cf095762..9feb2ea04ba 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CommitCoordinatorProviderSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CommitCoordinatorProviderSuite.scala @@ -21,7 +21,7 @@ import io.delta.kernel.defaults.DeltaTableWriteSuiteBase import io.delta.kernel.defaults.utils.TestUtils import io.delta.kernel.internal.actions.Metadata import io.delta.kernel.internal.TableConfig -import io.delta.storage.commit.{Commit, CommitCoordinatorClient, CommitResponse, GetCommitsResponse, UpdatedActions} +import io.delta.storage.commit.{Commit, CommitCoordinatorClient, CommitResponse, GetCommitsResponse, TableDescriptor, TableIdentifier, UpdatedActions} import io.delta.storage.LogStore import io.delta.storage.commit.actions.{AbstractMetadata, AbstractProtocol} import org.apache.hadoop.conf.Configuration @@ -29,7 +29,7 @@ import org.apache.hadoop.fs.{FileStatus, Path} import org.scalatest.funsuite.AnyFunSuite import java.{lang, util} -import java.util.Collections +import java.util.{Collections, Optional} import scala.collection.convert.ImplicitConversions.`map AsScala` import scala.collection.JavaConverters._ @@ -88,17 +88,18 @@ class CommitCoordinatorProviderSuite extends AnyFunSuite with TestUtils { assert( obj1.registerTable("logPath", 1, null, null) === - obj2.registerTable(new Path("logPath"), 1, null, null)) + obj2.registerTable(new Path("logPath"), Optional.empty(), 1, null, null)) + val tableDesc = + new TableDescriptor(new Path("logPath"), Optional.empty(), Collections.emptyMap()) assert( obj1.getCommits("logPath", Collections.emptyMap(), 1, 2).getLatestTableVersion === - obj2.getCommits( - new Path("logPath"), Collections.emptyMap(), 1, 2).getLatestTableVersion) + 
obj2.getCommits(tableDesc, 1, 2).getLatestTableVersion) assert( obj1.commit("logPath", Collections.emptyMap(), 1, null, null).getCommit.getVersion === obj2 - .commit(null, null, new Path("logPath"), Collections.emptyMap(), 1, null, null) + .commit(null, null, tableDesc, 1, null, null) .getCommit .getVersion) @@ -127,6 +128,7 @@ class CommitCoordinatorProviderSuite extends AnyFunSuite with TestUtils { protected trait TestCommitCoordinatorClientBase extends CommitCoordinatorClient { override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): util.Map[String, String] = { @@ -136,8 +138,7 @@ protected trait TestCommitCoordinatorClientBase extends CommitCoordinatorClient override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { @@ -145,8 +146,7 @@ protected trait TestCommitCoordinatorClientBase extends CommitCoordinatorClient } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, startVersion: lang.Long, endVersion: lang.Long = null): GetCommitsResponse = new GetCommitsResponse(Collections.emptyList(), -1) @@ -154,8 +154,7 @@ protected trait TestCommitCoordinatorClientBase extends CommitCoordinatorClient override def backfillToVersion( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersion: lang.Long): Unit = {} @@ -221,6 +220,7 @@ class TestCommitCoordinatorClient4 extends TestCommitCoordinatorClientBase { fileStatus.setPath(new Path("logPath")) override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): util.Map[String, String] = { @@ -228,8 +228,7 @@ class TestCommitCoordinatorClient4 extends TestCommitCoordinatorClientBase { } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, startVersion: lang.Long, endVersion: lang.Long = null): GetCommitsResponse = { new GetCommitsResponse( @@ -239,8 +238,7 @@ class TestCommitCoordinatorClient4 extends TestCommitCoordinatorClientBase { override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { @@ -250,12 +248,12 @@ class TestCommitCoordinatorClient4 extends TestCommitCoordinatorClientBase { override def backfillToVersion( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersion: lang.Long): Unit = { throw new UnsupportedOperationException( - "BackfillToVersion not implemented in TestCommitCoordinatorClient for %s".format(logPath)) + "BackfillToVersion not implemented in TestCommitCoordinatorClient" + + " for %s".format(tableDesc.getLogPath)) } } diff --git 
a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsSuite.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsSuite.scala index 7a0091d093e..2709fef5a3d 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsSuite.scala +++ b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsSuite.scala @@ -33,7 +33,7 @@ import io.delta.kernel.internal.util.{CoordinatedCommitsUtils, FileNames} import io.delta.kernel.internal.util.Preconditions.checkArgument import io.delta.kernel.internal.util.Utils.{closeCloseables, singletonCloseableIterator, toCloseableIterator} import io.delta.kernel.utils.{CloseableIterator, FileStatus} -import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse, InMemoryCommitCoordinator, UpdatedActions, CoordinatedCommitsUtils => CCU} +import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse, InMemoryCommitCoordinator, TableDescriptor, TableIdentifier, UpdatedActions, CoordinatedCommitsUtils => CCU} import io.delta.storage.commit.actions.{AbstractMetadata, AbstractProtocol} import io.delta.storage.LogStore import org.apache.hadoop.conf.Configuration @@ -331,31 +331,29 @@ object TestCommitCoordinator { class TestCommitCoordinatorClient extends InMemoryCommitCoordinator(10) { override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): util.Map[String, String] = { - super.registerTable(logPath, currentVersion, currentMetadata, currentProtocol) + super.registerTable(logPath, tableIdentifier, currentVersion, currentMetadata, currentProtocol) TestCommitCoordinator.EXP_TABLE_CONF } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, startVersion: lang.Long, endVersion: lang.Long = null): GetCommitsResponse = { - checkArgument(coordinatedCommitsTableConf == TestCommitCoordinator.EXP_TABLE_CONF) - super.getCommits(logPath, coordinatedCommitsTableConf, startVersion, endVersion) + checkArgument(tableDesc.getTableConf == TestCommitCoordinator.EXP_TABLE_CONF) + super.getCommits(tableDesc, startVersion, endVersion) } override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { - checkArgument(coordinatedCommitsTableConf == TestCommitCoordinator.EXP_TABLE_CONF) - super.commit(logStore, hadoopConf, logPath, coordinatedCommitsTableConf, - commitVersion, actions, updatedActions) + checkArgument(tableDesc.getTableConf == TestCommitCoordinator.EXP_TABLE_CONF) + super.commit(logStore, hadoopConf, tableDesc, commitVersion, actions, updatedActions) } } diff --git a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsTestUtils.scala b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsTestUtils.scala index 16806615e58..61379564770 100644 --- a/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsTestUtils.scala +++ 
b/kernel/kernel-defaults/src/test/scala/io/delta/kernel/defaults/internal/coordinatedcommits/CoordinatedCommitsTestUtils.scala @@ -18,7 +18,7 @@ package io.delta.kernel.defaults.internal.coordinatedcommits import io.delta.kernel.data.Row import java.{lang, util} -import io.delta.storage.commit.{CommitCoordinatorClient, InMemoryCommitCoordinator, Commit => StorageCommit, CommitResponse => StorageCommitResponse, GetCommitsResponse => StorageGetCommitsResponse, UpdatedActions => StorageUpdatedActions} +import io.delta.storage.commit.{CommitCoordinatorClient, InMemoryCommitCoordinator, Commit => StorageCommit, CommitResponse => StorageCommitResponse, GetCommitsResponse => StorageGetCommitsResponse, TableDescriptor, TableIdentifier, UpdatedActions => StorageUpdatedActions} import io.delta.kernel.defaults.internal.logstore.LogStoreProvider import io.delta.kernel.engine.{CommitCoordinatorClientHandler, Engine} import io.delta.kernel.internal.actions.{CommitInfo, Format, Metadata, Protocol} @@ -67,7 +67,9 @@ trait CoordinatedCommitsTestUtils { null, Collections.emptyMap(), true, - null) + null, + Collections.emptyMap() + ) } def commit( @@ -203,44 +205,34 @@ class TrackingCommitCoordinatorClient(delegatingCommitCoordinatorClient: InMemor override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: util.Iterator[String], updatedActions: StorageUpdatedActions): StorageCommitResponse = recordOperation("commit") { delegatingCommitCoordinatorClient.commit( logStore, hadoopConf, - logPath, - coordinatedCommitsTableConf, + tableDesc, commitVersion, actions, updatedActions) } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, startVersion: lang.Long, endVersion: lang.Long = null): StorageGetCommitsResponse = recordOperation("getCommits") { - delegatingCommitCoordinatorClient.getCommits( - logPath, coordinatedCommitsTableConf, startVersion, endVersion) + delegatingCommitCoordinatorClient.getCommits(tableDesc, startVersion, endVersion) } override def backfillToVersion( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersion: lang.Long): Unit = recordOperation("backfillToVersion") { delegatingCommitCoordinatorClient.backfillToVersion( - logStore, - hadoopConf, - logPath, - coordinatedCommitsTableConf, - version, - lastKnownBackfilledVersion) + logStore, hadoopConf, tableDesc, version, lastKnownBackfilledVersion) } override def semanticEquals(other: CommitCoordinatorClient): Boolean = this == other @@ -253,11 +245,12 @@ class TrackingCommitCoordinatorClient(delegatingCommitCoordinatorClient: InMemor override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): util.Map[String, String] = recordOperation("registerTable") { delegatingCommitCoordinatorClient.registerTable( - logPath, currentVersion, currentMetadata, currentProtocol) + logPath, tableIdentifier, currentVersion, currentMetadata, currentProtocol) } } diff --git a/setup.py b/setup.py index 323a93d52fe..b810a968022 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ def run(self): 'delta': ['py.typed'], }, install_requires=[ - 'pyspark>=3.5.2,<3.6.0', + 'pyspark>=3.5.3,<3.6.0', 
'importlib_metadata>=1.0.0', ], python_requires='>=3.6', diff --git a/sharing/src/main/scala/io/delta/sharing/spark/DeltaSharingFileIndex.scala b/sharing/src/main/scala/io/delta/sharing/spark/DeltaSharingFileIndex.scala index e2efeb65daa..0e9e45c0b04 100644 --- a/sharing/src/main/scala/io/delta/sharing/spark/DeltaSharingFileIndex.scala +++ b/sharing/src/main/scala/io/delta/sharing/spark/DeltaSharingFileIndex.scala @@ -212,12 +212,7 @@ case class DeltaSharingFileIndex( partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): TahoeLogFileIndex = { val deltaLog = fetchFilesAndConstructDeltaLog(partitionFilters, dataFilters, None) - new TahoeLogFileIndex( - params.spark, - deltaLog, - deltaLog.dataPath, - deltaLog.unsafeVolatileSnapshot - ) + TahoeLogFileIndex(params.spark, deltaLog) } override def listFiles( diff --git a/sharing/src/test/scala/io/delta/sharing/spark/DeltaSharingTestSparkUtils.scala b/sharing/src/test/scala/io/delta/sharing/spark/DeltaSharingTestSparkUtils.scala index 8f963c83c81..18b2396a4d8 100644 --- a/sharing/src/test/scala/io/delta/sharing/spark/DeltaSharingTestSparkUtils.scala +++ b/sharing/src/test/scala/io/delta/sharing/spark/DeltaSharingTestSparkUtils.scala @@ -85,7 +85,10 @@ trait DeltaSharingTestSparkUtils extends DeltaSQLTestUtils { protected def createSimpleTable(tableName: String, enableCdf: Boolean): Unit = { val tablePropertiesStr = if (enableCdf) { - "TBLPROPERTIES (delta.enableChangeDataFeed = true)" + """TBLPROPERTIES ( + |delta.minReaderVersion=1, + |delta.minWriterVersion=4, + |delta.enableChangeDataFeed = true)""".stripMargin } else { "" } diff --git a/spark/src/main/java/io/delta/dynamodbcommitcoordinator/CoordinatedCommitsUtils.java b/spark/src/main/java/io/delta/dynamodbcommitcoordinator/CoordinatedCommitsUtils.java deleted file mode 100644 index 492917aaca1..00000000000 --- a/spark/src/main/java/io/delta/dynamodbcommitcoordinator/CoordinatedCommitsUtils.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright (2021) The Delta Lake Project Authors. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.delta.dynamodbcommitcoordinator; - -import io.delta.storage.commit.actions.AbstractMetadata; -import io.delta.storage.commit.UpdatedActions; -import org.apache.hadoop.fs.Path; - -import java.util.UUID; - -public class CoordinatedCommitsUtils { - - private CoordinatedCommitsUtils() {} - - /** The subdirectory in which to store the unbackfilled commit files. */ - final static String COMMIT_SUBDIR = "_commits"; - - /** The configuration key for the coordinated commits owner. */ - private static final String COORDINATED_COMMITS_COORDINATOR_CONF_KEY = - "delta.coordinatedCommits.commitCoordinator-preview"; - - /** - * Creates a new unbackfilled delta file path for the given commit version. - * The path is of the form `tablePath/_delta_log/_commits/00000000000000000001.uuid.json`. 
- */ - public static Path generateUnbackfilledDeltaFilePath( - Path logPath, - long version) { - String uuid = UUID.randomUUID().toString(); - Path basePath = new Path(logPath, COMMIT_SUBDIR); - return new Path(basePath, String.format("%020d.%s.json", version, uuid)); - } - - /** - * Returns the path to the backfilled delta file for the given commit version. - * The path is of the form `tablePath/_delta_log/00000000000000000001.json`. - */ - public static Path getBackfilledDeltaFilePath( - Path logPath, - Long version) { - return new Path(logPath, String.format("%020d.json", version)); - } - - private static String getCoordinator(AbstractMetadata metadata) { - return metadata - .getConfiguration() - .getOrDefault(COORDINATED_COMMITS_COORDINATOR_CONF_KEY, ""); - } - - /** - * Returns true if the commit is a coordinated commits to filesystem conversion. - */ - public static boolean isCoordinatedCommitsToFSConversion( - Long commitVersion, - UpdatedActions updatedActions) { - boolean oldMetadataHasCoordinatedCommits = - !getCoordinator(updatedActions.getOldMetadata()).isEmpty(); - boolean newMetadataHasCoordinatedCommits = - !getCoordinator(updatedActions.getNewMetadata()).isEmpty(); - return oldMetadataHasCoordinatedCommits && !newMetadataHasCoordinatedCommits && commitVersion > 0; - } -} diff --git a/spark/src/main/java/io/delta/dynamodbcommitcoordinator/DynamoDBCommitCoordinatorClient.java b/spark/src/main/java/io/delta/dynamodbcommitcoordinator/DynamoDBCommitCoordinatorClient.java index b4206fd1c9b..f61a980d10c 100644 --- a/spark/src/main/java/io/delta/dynamodbcommitcoordinator/DynamoDBCommitCoordinatorClient.java +++ b/spark/src/main/java/io/delta/dynamodbcommitcoordinator/DynamoDBCommitCoordinatorClient.java @@ -32,10 +32,7 @@ import org.slf4j.LoggerFactory; import java.io.*; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; +import java.util.*; /** * A commit coordinator client that uses DynamoDB as the commit coordinator. 
The table schema is as follows: @@ -350,11 +347,11 @@ DynamoDBTableEntryConstants.TABLE_LATEST_TIMESTAMP, new AttributeValueUpdate() public CommitResponse commit( LogStore logStore, Configuration hadoopConf, - Path logPath, - Map coordinatedCommitsTableConf, + TableDescriptor tableDesc, long commitVersion, Iterator actions, UpdatedActions updatedActions) throws CommitFailedException { + Path logPath = tableDesc.getLogPath(); if (commitVersion == 0) { throw new CommitFailedException( false /* retryable */, @@ -375,7 +372,7 @@ public CommitResponse commit( commitVersion, commitPath); CommitResponse res = commitToCoordinator( logPath, - coordinatedCommitsTableConf, + tableDesc.getTableConf(), commitVersion, commitFileStatus, inCommitTimestamp, @@ -393,8 +390,7 @@ public CommitResponse commit( backfillToVersion( logStore, hadoopConf, - logPath, - coordinatedCommitsTableConf, + tableDesc, commitVersion, null /* lastKnownBackfilledVersion */); } @@ -420,7 +416,7 @@ private GetCommitsResultInternal getCommitsImpl( Long.parseLong(item.get(DynamoDBTableEntryConstants.TABLE_LATEST_VERSION).getN()); AttributeValue allStoredCommits = item.get(DynamoDBTableEntryConstants.COMMITS); ArrayList commits = new ArrayList<>(); - Path unbackfilledCommitsPath = new Path(logPath, CoordinatedCommitsUtils.COMMIT_SUBDIR); + Path unbackfilledCommitsPath = CoordinatedCommitsUtils.commitDirPath(logPath); for(AttributeValue attr: allStoredCommits.getL()) { java.util.Map commitMap = attr.getM(); long commitVersion = @@ -456,13 +452,12 @@ private GetCommitsResultInternal getCommitsImpl( @Override public GetCommitsResponse getCommits( - Path logPath, - Map coordinatedCommitsTableConf, + TableDescriptor tableDesc, Long startVersion, Long endVersion) { try { GetCommitsResultInternal res = - getCommitsImpl(logPath, coordinatedCommitsTableConf, startVersion, endVersion); + getCommitsImpl(tableDesc.getLogPath(), tableDesc.getTableConf(), startVersion, endVersion); long latestTableVersionToReturn = res.response.getLatestTableVersion(); if (!res.hasAcceptedCommits) { /* @@ -533,16 +528,16 @@ private void validateBackfilledFileExists( public void backfillToVersion( LogStore logStore, Configuration hadoopConf, - Path logPath, - Map coordinatedCommitsTableConf, + TableDescriptor tableDesc, long version, Long lastKnownBackfilledVersion) throws IOException { LOG.info("Backfilling all unbackfilled commits."); + Path logPath = tableDesc.getLogPath(); GetCommitsResponse resp; try { resp = getCommitsImpl( logPath, - coordinatedCommitsTableConf, + tableDesc.getTableConf(), lastKnownBackfilledVersion, null).response; } catch (IOException e) { @@ -582,7 +577,7 @@ public void backfillToVersion( .withTableName(coordinatedCommitsTableName) .addKeyEntry( DynamoDBTableEntryConstants.TABLE_ID, - new AttributeValue().withS(getTableId(coordinatedCommitsTableConf))) + new AttributeValue().withS(getTableId(tableDesc.getTableConf()))) .addAttributeUpdatesEntry( DynamoDBTableEntryConstants.COMMITS, new AttributeValueUpdate() @@ -624,6 +619,7 @@ public void backfillToVersion( @Override public Map registerTable( Path logPath, + Optional tableIdentifier, long currentVersion, AbstractMetadata currentMetadata, AbstractProtocol currentProtocol) { diff --git a/spark/src/main/resources/error/delta-error-classes.json b/spark/src/main/resources/error/delta-error-classes.json index c38f89459ef..2fc9ddf763d 100644 --- a/spark/src/main/resources/error/delta-error-classes.json +++ b/spark/src/main/resources/error/delta-error-classes.json @@ -207,6 +207,12 @@ ], 
"sqlState" : "42809" }, + "DELTA_CANNOT_MODIFY_COORDINATED_COMMITS_DEPENDENCIES" : { + "message" : [ + " cannot override or unset in-commit timestamp table properties because coordinated commits is enabled in this table and depends on them. Please remove them (\"delta.enableInCommitTimestamps\", \"delta.inCommitTimestampEnablementVersion\", \"delta.inCommitTimestampEnablementTimestamp\") from the TBLPROPERTIES clause and then retry the command again." + ], + "sqlState" : "42616" + }, "DELTA_CANNOT_MODIFY_TABLE_PROPERTY" : { "message" : [ "The Delta table configuration cannot be specified by the user" @@ -267,6 +273,12 @@ ], "sqlState" : "22003" }, + "DELTA_CANNOT_SET_COORDINATED_COMMITS_DEPENDENCIES" : { + "message" : [ + " cannot set in-commit timestamp table properties together with coordinated commits, because the latter depends on the former and sets the former internally. Please remove them (\"delta.enableInCommitTimestamps\", \"delta.inCommitTimestampEnablementVersion\", \"delta.inCommitTimestampEnablementTimestamp\") from the TBLPROPERTIES clause and then retry the command again." + ], + "sqlState" : "42616" + }, "DELTA_CANNOT_SET_LOCATION_MULTIPLE_TIMES" : { "message" : [ "Can't set location multiple times. Found " @@ -279,6 +291,12 @@ ], "sqlState" : "42613" }, + "DELTA_CANNOT_UNSET_COORDINATED_COMMITS_CONFS" : { + "message" : [ + "ALTER cannot unset coordinated commits configurations. To downgrade a table from coordinated commits, please try again using `ALTER TABLE [table-name] DROP FEATURE 'coordinatedCommits-preview'`." + ], + "sqlState" : "42616" + }, "DELTA_CANNOT_UPDATE_ARRAY_FIELD" : { "message" : [ "Cannot update %1$s field %2$s type: update the element by updating %2$s.element" @@ -381,9 +399,9 @@ }, "DELTA_CLUSTERING_COLUMNS_DATATYPE_NOT_SUPPORTED" : { "message" : [ - "Clustering requires the data types of clustering columns to support data skipping. However the following column(s) '' have unsupported data types for data skipping in Delta." + "CLUSTER BY is not supported because the following column(s): don't support data skipping." ], - "sqlState" : "22000" + "sqlState" : "0A000" }, "DELTA_CLUSTERING_COLUMNS_MISMATCH" : { "message" : [ @@ -1156,6 +1174,12 @@ "" ] }, + "UNSUPPORTED_PARTITION_DATA_TYPE" : { + "message" : [ + "IcebergCompatV does not support the data type for partition columns in your schema. Your partition schema:", + "" + ] + }, "VERSION_MUTUAL_EXCLUSIVE" : { "message" : [ "Only one IcebergCompat version can be enabled, please explicitly disable all other IcebergCompat versions that are not needed." 
diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/Checkpoints.scala b/spark/src/main/scala/org/apache/spark/sql/delta/Checkpoints.scala index 14e4d0af0f5..844611f1968 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/Checkpoints.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/Checkpoints.scala @@ -41,6 +41,7 @@ import org.apache.spark.TaskContext import org.apache.spark.internal.MDC import org.apache.spark.paths.SparkPath import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Cast, ElementAt, Literal} import org.apache.spark.sql.execution.SQLExecution @@ -297,13 +298,15 @@ trait Checkpoints extends DeltaLogging { * Note that this function captures and logs all exceptions, since the checkpoint shouldn't fail * the overall commit operation. */ - def checkpoint(snapshotToCheckpoint: Snapshot): Unit = recordDeltaOperation( - this, "delta.checkpoint") { + def checkpoint( + snapshotToCheckpoint: Snapshot, + tableIdentifierOpt: Option[TableIdentifier] = None): Unit = + recordDeltaOperation(this, "delta.checkpoint") { withCheckpointExceptionHandling(snapshotToCheckpoint.deltaLog, "delta.checkpoint.sync.error") { if (snapshotToCheckpoint.version < 0) { throw DeltaErrors.checkpointNonExistTable(dataPath) } - checkpointAndCleanUpDeltaLog(snapshotToCheckpoint) + checkpointAndCleanUpDeltaLog(snapshotToCheckpoint, tableIdentifierOpt) } } @@ -323,8 +326,9 @@ trait Checkpoints extends DeltaLogging { } def checkpointAndCleanUpDeltaLog( - snapshotToCheckpoint: Snapshot): Unit = { - val lastCheckpointInfo = writeCheckpointFiles(snapshotToCheckpoint) + snapshotToCheckpoint: Snapshot, + tableIdentifierOpt: Option[TableIdentifier] = None): Unit = { + val lastCheckpointInfo = writeCheckpointFiles(snapshotToCheckpoint, tableIdentifierOpt) writeLastCheckpointFile( snapshotToCheckpoint.deltaLog, lastCheckpointInfo, LastCheckpointInfo.checksumEnabled(spark)) doLogCleanup(snapshotToCheckpoint) @@ -346,7 +350,9 @@ trait Checkpoints extends DeltaLogging { } } - protected def writeCheckpointFiles(snapshotToCheckpoint: Snapshot): LastCheckpointInfo = { + protected def writeCheckpointFiles( + snapshotToCheckpoint: Snapshot, + tableIdentifierOpt: Option[TableIdentifier] = None): LastCheckpointInfo = { // With Coordinated-Commits, commit files are not guaranteed to be backfilled immediately in the // _delta_log dir. 
While it is possible to compute a checkpoint file without backfilling, // writing the checkpoint file in the log directory before backfilling the relevant commits @@ -361,7 +367,7 @@ trait Checkpoints extends DeltaLogging { // 00015.json // 00016.json // 00018.checkpoint.parquet - snapshotToCheckpoint.ensureCommitFilesBackfilled() + snapshotToCheckpoint.ensureCommitFilesBackfilled(tableIdentifierOpt) Checkpoints.writeCheckpoint(spark, this, snapshotToCheckpoint) } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/ColumnWithDefaultExprUtils.scala b/spark/src/main/scala/org/apache/spark/sql/delta/ColumnWithDefaultExprUtils.scala index 8e8c8ce90f5..1874c64b0a2 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/ColumnWithDefaultExprUtils.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/ColumnWithDefaultExprUtils.scala @@ -24,16 +24,14 @@ import org.apache.spark.sql.delta.commands.cdc.CDCReader import org.apache.spark.sql.delta.constraints.{Constraint, Constraints} import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.schema.SchemaUtils -import org.apache.spark.sql.delta.sources.{DeltaSourceUtils, DeltaSQLConf} +import org.apache.spark.sql.delta.sources.{DeltaSourceUtils, DeltaSQLConf, DeltaStreamUtils} -import org.apache.spark.sql.{Column, DataFrame, Dataset, Encoder} -import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.{Column, DataFrame} import org.apache.spark.sql.catalyst.expressions.EqualNullSafe import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns._ import org.apache.spark.sql.execution.QueryExecution -import org.apache.spark.sql.execution.streaming.{IncrementalExecution, IncrementalExecutionShims, StreamExecution} -import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType} /** @@ -179,7 +177,7 @@ object ColumnWithDefaultExprUtils extends DeltaLogging { val newData = queryExecution match { case incrementalExecution: IncrementalExecution => - selectFromStreamingDataFrame(incrementalExecution, data, selectExprs: _*) + DeltaStreamUtils.selectFromStreamingDataFrame(incrementalExecution, data, selectExprs: _*) case _ => data.select(selectExprs: _*) } recordDeltaEvent(deltaLog, "delta.generatedColumns.write") @@ -222,30 +220,4 @@ object ColumnWithDefaultExprUtils extends DeltaLogging { schema } } - - /** - * Select `cols` from a micro batch DataFrame. Directly calling `select` won't work because it - * will create a `QueryExecution` rather than inheriting `IncrementalExecution` from - * the micro batch DataFrame. A streaming micro batch DataFrame to execute should use - * `IncrementalExecution`. - */ - private def selectFromStreamingDataFrame( - incrementalExecution: IncrementalExecution, - df: DataFrame, - cols: Column*): DataFrame = { - val newMicroBatch = df.select(cols: _*) - val newIncrementalExecution = IncrementalExecutionShims.newInstance( - newMicroBatch.sparkSession, - newMicroBatch.queryExecution.logical, - incrementalExecution) - newIncrementalExecution.executedPlan // Force the lazy generation of execution plan - - - // Use reflection to call the private constructor. 
- val constructor = - classOf[Dataset[_]].getConstructor(classOf[QueryExecution], classOf[Encoder[_]]) - constructor.newInstance( - newIncrementalExecution, - ExpressionEncoder(newIncrementalExecution.analyzed.schema)).asInstanceOf[DataFrame] - } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaColumnMapping.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaColumnMapping.scala index 2ce3cc1b4c3..11ea6e513da 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaColumnMapping.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaColumnMapping.scala @@ -48,6 +48,17 @@ trait DeltaColumnMappingBase extends DeltaLogging { val PARQUET_MAP_KEY_FIELD_NAME = "key" val PARQUET_MAP_VALUE_FIELD_NAME = "value" + /** + * The set of column mapping metadata keys attached to each column in the schema. + */ + val COLUMN_MAPPING_METADATA_KEYS: Set[String] = Set( + COLUMN_MAPPING_METADATA_ID_KEY, + COLUMN_MAPPING_PHYSICAL_NAME_KEY, + COLUMN_MAPPING_METADATA_NESTED_IDS_KEY, + PARQUET_FIELD_ID_METADATA_KEY, + PARQUET_FIELD_NESTED_IDS_METADATA_KEY + ) + /** * This list of internal columns (and only this list) is allowed to have missing * column mapping metadata such as field id and physical name because @@ -116,6 +127,7 @@ trait DeltaColumnMappingBase extends DeltaLogging { * - upgrading to the column mapping Protocol through configurations */ def verifyAndUpdateMetadataChange( + spark: SparkSession, deltaLog: DeltaLog, oldProtocol: Protocol, oldMetadata: Metadata, @@ -136,8 +148,34 @@ trait DeltaColumnMappingBase extends DeltaLogging { oldMappingMode.name, newMappingMode.name) } - val updatedMetadata = updateColumnMappingMetadata( - oldMetadata, newMetadata, isChangingModeOnExistingTable, isOverwriteSchema) + var updatedMetadata = newMetadata + + // If column mapping is disabled, we need to strip any column mapping metadata from the schema, + // because Delta code will use them even when column mapping is not enabled. However, we cannot + // strip column mapping metadata that already exists in the old schema, because this would break + // the table.
+ if (newMappingMode == NoMapping && + schemaHasColumnMappingMetadata(newMetadata.schema)) { + val addsColumnMappingMetadata = !schemaHasColumnMappingMetadata(oldMetadata.schema) + if (addsColumnMappingMetadata && + spark.conf.get(DeltaSQLConf.DELTA_COLUMN_MAPPING_STRIP_METADATA)) { + recordDeltaEvent(deltaLog, opType = "delta.columnMapping.stripMetadata") + val strippedSchema = dropColumnMappingMetadata(newMetadata.schema) + updatedMetadata = newMetadata.copy(schemaString = strippedSchema.json) + } else { + recordDeltaEvent( + deltaLog, + opType = "delta.columnMapping.updateSchema.metadataPresentButFeatureDisabled", + data = Map( + "addsColumnMappingMetadata" -> addsColumnMappingMetadata.toString, + "isCreatingNewTable" -> isCreatingNewTable.toString, + "isOverwriteSchema" -> isOverwriteSchema.toString) + ) + } + } + + updatedMetadata = updateColumnMappingMetadata( + oldMetadata, updatedMetadata, isChangingModeOnExistingTable, isOverwriteSchema) // record column mapping table creation/upgrade if (newMappingMode != NoMapping) { @@ -455,16 +493,12 @@ trait DeltaColumnMappingBase extends DeltaLogging { def dropColumnMappingMetadata(schema: StructType): StructType = { SchemaMergingUtils.transformColumns(schema) { (_, field, _) => - field.copy( - metadata = new MetadataBuilder() - .withMetadata(field.metadata) - .remove(COLUMN_MAPPING_METADATA_ID_KEY) - .remove(COLUMN_MAPPING_METADATA_NESTED_IDS_KEY) - .remove(COLUMN_MAPPING_PHYSICAL_NAME_KEY) - .remove(PARQUET_FIELD_ID_METADATA_KEY) - .remove(PARQUET_FIELD_NESTED_IDS_METADATA_KEY) - .build() - ) + var strippedMetadataBuilder = new MetadataBuilder().withMetadata(field.metadata) + for (key <- COLUMN_MAPPING_METADATA_KEYS) { + strippedMetadataBuilder = strippedMetadataBuilder.remove(key) + } + val strippedMetadata = strippedMetadataBuilder.build() + field.copy(metadata = strippedMetadata) } } @@ -784,6 +818,15 @@ trait DeltaColumnMappingBase extends DeltaLogging { (transform(schema, new MetadataBuilder(), Seq.empty), currFieldId) } + + /** + * Returns whether the schema contains any metadata reserved for column mapping. 
+ */ + def schemaHasColumnMappingMetadata(schema: StructType): Boolean = { + SchemaMergingUtils.explode(schema).exists { case (_, col) => + COLUMN_MAPPING_METADATA_KEYS.exists(k => col.metadata.contains(k)) + } + } } object DeltaColumnMapping extends DeltaColumnMappingBase diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala index 3bd26c3ab7f..58d7105bae0 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaErrors.scala @@ -1634,14 +1634,14 @@ trait DeltaErrorsBase messageParameters = Array(option, operation)) } - def foundMapTypeColumnException(key: String, value: String, schema: StructType): Throwable = { + def foundMapTypeColumnException(key: String, value: String, schema: DataType): Throwable = { new DeltaAnalysisException( errorClass = "DELTA_FOUND_MAP_TYPE_COLUMN", - messageParameters = Array(key, value, schema.treeString) + messageParameters = Array(key, value, dataTypeToString(schema)) ) } - def columnNotInSchemaException(column: String, schema: StructType): Throwable = { - nonExistentColumnInSchema(column, schema.treeString) + def columnNotInSchemaException(column: String, schema: DataType): Throwable = { + nonExistentColumnInSchema(column, dataTypeToString(schema)) } def metadataAbsentException(): Throwable = { @@ -2690,10 +2690,14 @@ trait DeltaErrorsBase def incorrectArrayAccessByName( rightName: String, wrongName: String, - schema: StructType): Throwable = { + schema: DataType): Throwable = { new DeltaAnalysisException( errorClass = "DELTA_INCORRECT_ARRAY_ACCESS_BY_NAME", - messageParameters = Array(rightName, wrongName, schema.treeString) + messageParameters = Array( + rightName, + wrongName, + dataTypeToString(schema) + ) ) } @@ -2701,14 +2705,14 @@ trait DeltaErrorsBase columnPath: String, other: DataType, column: Seq[String], - schema: StructType): Throwable = { + schema: DataType): Throwable = { new DeltaAnalysisException( errorClass = "DELTA_COLUMN_PATH_NOT_NESTED", messageParameters = Array( s"$columnPath", s"$other", s"${SchemaUtils.prettyFieldName(column)}", - schema.treeString + dataTypeToString(schema) ) ) } @@ -3250,6 +3254,15 @@ trait DeltaErrorsBase ) } + def icebergCompatUnsupportedPartitionDataTypeException( + version: Int, dataType: DataType, schema: StructType): Throwable = { + new DeltaUnsupportedOperationException( + errorClass = "DELTA_ICEBERG_COMPAT_VIOLATION.UNSUPPORTED_PARTITION_DATA_TYPE", + messageParameters = Array(version.toString, version.toString, + dataType.typeName, schema.treeString) + ) + } + def icebergCompatMissingRequiredTableFeatureException( version: Int, tf: TableFeature): Throwable = { new DeltaUnsupportedOperationException( @@ -3436,11 +3449,11 @@ trait DeltaErrorsBase } def errorFindingColumnPosition( - columnPath: Seq[String], schema: StructType, extraErrMsg: String): Throwable = { + columnPath: Seq[String], schema: DataType, extraErrMsg: String): Throwable = { new DeltaAnalysisException( errorClass = "_LEGACY_ERROR_TEMP_DELTA_0008", messageParameters = Array( - UnresolvedAttribute(columnPath).name, schema.treeString, extraErrMsg)) + UnresolvedAttribute(columnPath).name, dataTypeToString(schema), extraErrMsg)) } def alterTableClusterByOnPartitionedTableException(): Throwable = { @@ -3472,6 +3485,11 @@ trait DeltaErrorsBase errorClass = "DELTA_UNSUPPORTED_WRITES_WITHOUT_COORDINATOR", messageParameters = Array(coordinatorName)) } + + private def 
dataTypeToString(dt: DataType): String = dt match { + case s: StructType => s.treeString + case other => other.simpleString + } } object DeltaErrors extends DeltaErrorsBase diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaLog.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaLog.scala index 0aff32f3577..7d23ec134d5 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaLog.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaLog.scala @@ -221,9 +221,9 @@ class DeltaLog private( catalogTableOpt: Option[CatalogTable], snapshotOpt: Option[Snapshot] = None)( thunk: OptimisticTransaction => T): T = { + val txn = startTransaction(catalogTableOpt, snapshotOpt) + OptimisticTransaction.setActive(txn) try { - val txn = startTransaction(catalogTableOpt, snapshotOpt) - OptimisticTransaction.setActive(txn) thunk(txn) } finally { OptimisticTransaction.clearActive() @@ -233,9 +233,9 @@ class DeltaLog private( /** Legacy/compat overload that does not require catalog table information. Avoid prod use. */ @deprecated("Please use the CatalogTable overload instead", "3.0") def withNewTransaction[T](thunk: OptimisticTransaction => T): T = { + val txn = startTransaction() + OptimisticTransaction.setActive(txn) try { - val txn = startTransaction() - OptimisticTransaction.setActive(txn) thunk(txn) } finally { OptimisticTransaction.clearActive() @@ -425,7 +425,7 @@ class DeltaLog private( def assertTableFeaturesMatchMetadata( targetProtocol: Protocol, targetMetadata: Metadata): Unit = { - if (!targetProtocol.supportsReaderFeatures && !targetProtocol.supportsWriterFeatures) return + if (!targetProtocol.supportsTableFeatures) return val protocolEnabledFeatures = targetProtocol.writerFeatureNames .flatMap(TableFeature.featureNameToFeature) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaOperations.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaOperations.scala index 30680baacc6..1ad915555eb 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaOperations.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaOperations.scala @@ -72,6 +72,40 @@ object DeltaOperations { transformer.transformToString(metric, allMetrics) } } + + /** + * A transaction that commits AddFile actions with deletionVector should have column stats that + * are not tight bounds. An exception to this is ComputeStats operation, which recomputes stats + * on these files, and the new stats are tight bounds. Some other operations that merely take an + * existing AddFile action and commit a copy of it, not changing the deletionVector or stats, + * can then also recommit AddFile with deletionVector and tight bound stats that were recomputed + * before. + * + * An operation for which this can happen, and there is no way that it could be committing + * new deletion vectors, should set this to false to bypass this check. + * All other operations should set this to true, so that this is validated during commit. + * + * This is abstract to force the implementers of all operations to think about this setting. + * All operations should add a comment justifying this setting. + * Any operation that sets this to false should add a test in TightBoundsSuite. + */ + def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean + + /** + * Whether the transaction is updating metadata of existing files. 
+ * + * The Delta protocol allows committing AddFile actions for files that already exist on the + * latest version of the table, without committing corresponding RemoveFile actions. This is + * used to update the metadata of existing files, e.g. to recompute statistics or add tags. + * + * Such operations need special handling during conflict checking, especially against + * no-data-change transactions, because the read/delete conflict can be resolved with + * read-file-remapping and because there is no RemoveFile action to trigger a delete/delete + * conflict. In case you are adding such operation, make sure to include a test for conflicts + * with business *and* no-data-change transactions, e.g. optimize. + */ + def isInPlaceFileMetadataUpdate: Option[Boolean] + } abstract class OperationWithPredicates(name: String, val predicates: Seq[Expression]) @@ -133,6 +167,12 @@ object DeltaOperations { DeltaOperationMetrics.WRITE_REPLACE_WHERE } override def changesData: Boolean = true + + // This operation shouldn't be introducing AddFile actions with DVs and tight bounds stats. + // DVs can be introduced by the replaceWhere operation. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } case class RemoveColumnMapping( @@ -140,6 +180,11 @@ object DeltaOperations { override def parameters: Map[String, Any] = Map() override val operationMetrics: Set[String] = DeltaOperationMetrics.REMOVE_COLUMN_MAPPING + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded during streaming inserts. */ @@ -154,6 +199,11 @@ object DeltaOperations { ) override val operationMetrics: Set[String] = DeltaOperationMetrics.STREAMING_UPDATE override def changesData: Boolean = true + + // This operation shouldn't be introducing AddFile actions with DVs and tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded while deleting certain partitions. */ case class Delete(predicate: Seq[Expression]) @@ -175,12 +225,22 @@ object DeltaOperations { strMetrics ++ dvMetrics } override def changesData: Boolean = true + + // This operation shouldn't be introducing AddFile actions with DVs and tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when truncating the table. */ case class Truncate() extends Operation("TRUNCATE") { override val parameters: Map[String, Any] = Map.empty override val operationMetrics: Set[String] = DeltaOperationMetrics.TRUNCATE override def changesData: Boolean = true + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when converting a table into a Delta table. 
*/ @@ -198,6 +258,11 @@ object DeltaOperations { sourceFormat.map("sourceFormat" -> _) override val operationMetrics: Set[String] = DeltaOperationMetrics.CONVERT override def changesData: Boolean = true + + // This operation shouldn't be introducing AddFile actions with DVs and non-tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Represents the predicates and action type (insert, update, delete) for a Merge clause */ @@ -265,6 +330,11 @@ object DeltaOperations { } override def changesData: Boolean = true + + // This operation shouldn't be introducing AddFile actions with DVs and non-tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } object Merge { @@ -296,6 +366,11 @@ object DeltaOperations { val dvMetrics = transformDeletionVectorMetrics(metrics) super.transformMetrics(metrics) ++ dvMetrics } + + // This operation shouldn't be introducing AddFile actions with DVs and non-tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when the table is created. */ case class CreateTable( @@ -317,6 +392,11 @@ object DeltaOperations { DeltaOperationMetrics.WRITE } override def changesData: Boolean = asSelect + + // This operation shouldn't be introducing AddFile actions with DVs and non-tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when the table is replaced. */ case class ReplaceTable( @@ -341,12 +421,24 @@ object DeltaOperations { DeltaOperationMetrics.WRITE } override def changesData: Boolean = true + + // This operation shouldn't be introducing AddFile actions with DVs and non-tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when the table properties are set. */ val OP_SET_TBLPROPERTIES = "SET TBLPROPERTIES" case class SetTableProperties( properties: Map[String, String]) extends Operation(OP_SET_TBLPROPERTIES) { override val parameters: Map[String, Any] = Map("properties" -> JsonUtils.toJson(properties)) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + // Note: This operation may trigger additional actions and additional commits. For example + // RowTrackingBackfill. These are separate transactions, and this check is performed separately. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when the table properties are unset. */ case class UnsetTableProperties( @@ -355,6 +447,11 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "properties" -> JsonUtils.toJson(propKeys), "ifExists" -> ifExists) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when dropping a table feature. 
*/ case class DropTableFeature( @@ -363,6 +460,13 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "featureName" -> featureName, "truncateHistory" -> truncateHistory) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + // Note: this operation may trigger additional actions and additional commits. These would be + // separate transactions, and this check is performed separately. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when columns are added. */ case class AddColumns( @@ -375,6 +479,11 @@ object DeltaOperations { "column" -> structFieldToMap(columnPath, column) ) ++ colPosition.map("position" -> _.toString) })) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when columns are dropped. */ @@ -384,6 +493,11 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "columns" -> JsonUtils.toJson(colsToDrop.map(UnresolvedAttribute(_).name))) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when column is renamed */ @@ -394,6 +508,11 @@ object DeltaOperations { "oldColumnPath" -> UnresolvedAttribute(oldColumnPath).name, "newColumnPath" -> UnresolvedAttribute(newColumnPath).name ) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when columns are changed. */ @@ -406,6 +525,11 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "column" -> JsonUtils.toJson(structFieldToMap(columnPath, newColumn)) ) ++ colPosition.map("position" -> _) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when columns are replaced. */ case class ReplaceColumns( @@ -413,6 +537,11 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "columns" -> JsonUtils.toJson(columns.map(structFieldToMap(Seq.empty, _)))) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } case class UpgradeProtocol(newProtocol: Protocol) extends Operation("UPGRADE PROTOCOL") { @@ -422,15 +551,32 @@ object DeltaOperations { "readerFeatures" -> newProtocol.readerFeatures, "writerFeatures" -> newProtocol.writerFeatures ))) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. 
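As a hedged illustration of how the two flags documented above combine (not part of this diff), a metadata-only operation that re-commits existing files would follow the same pattern as ComputeStats and RowTrackingBackfill below; RecomputeTags is a hypothetical name used purely for illustration.

case class RecomputeTags() extends Operation("RECOMPUTE TAGS") {
  override val parameters: Map[String, Any] = Map.empty

  // Existing files keep their copied-over stats, which may already be tight bounds even when
  // DVs are present (e.g. after a prior ComputeStats), so the commit check is disabled.
  override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = false

  // Only re-commits AddFile actions for files that already exist in the latest table version.
  override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(true)
}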
+ override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } object ManualUpdate extends Operation("Manual Update") { override val parameters: Map[String, Any] = Map.empty + + // Unsafe manual update disables checks. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = false + + // Manual update operations can commit arbitrary actions. In case this field is needed consider + // adding a new Delta operation. For test-only code use TestOperation. + override val isInPlaceFileMetadataUpdate: Option[Boolean] = None } /** A commit without any actions. Could be used to force creation of new checkpoints. */ object EmptyCommit extends Operation("Empty Commit") { override val parameters: Map[String, Any] = Map.empty + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } case class UpdateColumnMetadata( @@ -442,6 +588,11 @@ object DeltaOperations { case (path, field) => structFieldToMap(path, field) })) } + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } case class UpdateSchema(oldSchema: StructType, newSchema: StructType) @@ -449,11 +600,21 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "oldSchema" -> JsonUtils.toJson(oldSchema), "newSchema" -> JsonUtils.toJson(newSchema)) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } case class AddConstraint( constraintName: String, expr: String) extends Operation("ADD CONSTRAINT") { override val parameters: Map[String, Any] = Map("name" -> constraintName, "expr" -> expr) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } case class DropConstraint( @@ -465,11 +626,24 @@ object DeltaOperations { Map("name" -> constraintName, "existed" -> "false") } } + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when recomputing stats on the table. */ case class ComputeStats(predicate: Seq[Expression]) - extends OperationWithPredicates("COMPUTE STATS", predicate) + extends OperationWithPredicates("COMPUTE STATS", predicate) { + + // ComputeStats operation commits AddFiles with recomputed stats which are always tight bounds, + // even when DVs are present. This check should be disabled. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = false + + // ComputeStats operation only updates statistics of existing files. 
+ override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(true) + } /** Recorded when restoring a Delta table to an older version. */ val OP_RESTORE = "RESTORE" @@ -482,6 +656,15 @@ object DeltaOperations { override def changesData: Boolean = true override val operationMetrics: Set[String] = DeltaOperationMetrics.RESTORE + + // Restore operation commits AddFiles with files, DVs and stats from the version it restores to. + // It can happen that tight bound stats were recomputed before by ComputeStats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = false + + // The restore operation could perform in-place file metadata updates. However, the difference + // between the current and the restored state is computed using only the (path, DV) pairs as + // identifiers, meaning that metadata differences are ignored. + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } sealed abstract class OptimizeOrReorg(override val name: String, predicates: Seq[Expression]) @@ -517,6 +700,11 @@ object DeltaOperations { ) override val operationMetrics: Set[String] = DeltaOperationMetrics.OPTIMIZE + + // This operation shouldn't be introducing AddFile actions with DVs and tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when cloning a Delta table into a new location. */ @@ -531,6 +719,12 @@ object DeltaOperations { ) override def changesData: Boolean = true override val operationMetrics: Set[String] = DeltaOperationMetrics.CLONE + + // Clone operation commits AddFiles with files, DVs and stats copied over from the source table. + // It can happen that tight bound stats were recomputed before by ComputeStats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = false + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** @@ -548,6 +742,11 @@ object DeltaOperations { ) ++ specifiedRetentionMillis.map("specifiedRetentionMillis" -> _) override val operationMetrics: Set[String] = DeltaOperationMetrics.VACUUM_START + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** @@ -559,6 +758,11 @@ object DeltaOperations { ) override val operationMetrics: Set[String] = DeltaOperationMetrics.VACUUM_END + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when running REORG on the table. */ @@ -570,6 +774,11 @@ object DeltaOperations { ) override val operationMetrics: Set[String] = DeltaOperationMetrics.OPTIMIZE + + // This operation shouldn't be introducing AddFile actions with DVs and tight bounds stats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when clustering columns are changed on clustered tables. 
*/ @@ -579,6 +788,11 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "oldClusteringColumns" -> oldClusteringColumns, "newClusteringColumns" -> newClusteringColumns) + + // This operation shouldn't be introducing AddFile actions at all. This check should be trivial. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } /** Recorded when we backfill a Delta table's existing AddFiles with row tracking data. */ @@ -587,6 +801,13 @@ object DeltaOperations { override val parameters: Map[String, Any] = Map( "batchId" -> JsonUtils.toJson(batchId) ) + + // RowTrackingBackfill operation commits AddFiles with files, DVs and stats copied over. + // It can happen that tight bound stats were recomputed before by ComputeStats. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = false + + // RowTrackingBackfill only updates tags of existing files. + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(true) } private def structFieldToMap(colPath: Seq[String], field: StructField): Map[String, Any] = { @@ -608,8 +829,14 @@ object DeltaOperations { colPosition: Option[String]) /** Dummy operation only for testing with arbitrary operation names */ - case class TestOperation(operationName: String = "TEST") extends Operation(operationName) { + case class TestOperation( + operationName: String = "TEST", + override val isInPlaceFileMetadataUpdate: Option[Boolean] = None + ) extends Operation(operationName) { override val parameters: Map[String, Any] = Map.empty + + // Perform the check for testing. + override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true } /** @@ -630,6 +857,11 @@ object DeltaOperations { case class UpgradeUniformProperties(properties: Map[String, String]) extends Operation( OP_UPGRADE_UNIFORM_BY_REORG) { override val parameters: Map[String, Any] = Map("properties" -> JsonUtils.toJson(properties)) + + // This operation shouldn't be introducing AddFile actions with DVs and tight bounds stats. 
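For tests, the extended TestOperation above lets a suite simulate either behavior without defining a new operation; a minimal, hypothetical usage (surrounding test scaffolding assumed):

val inPlaceOp = DeltaOperations.TestOperation(
  operationName = "TEST",
  isInPlaceFileMetadataUpdate = Some(true))
// txn.commit(rewrittenAddFiles, inPlaceOp)  // hypothetical commit inside a conflict-checking test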
+ override def checkAddFileWithDeletionVectorStatsAreNotTightBounds: Boolean = true + + override val isInPlaceFileMetadataUpdate: Option[Boolean] = Some(false) } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaParquetWriteSupport.scala b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaParquetWriteSupport.scala index 41038246982..71a4ae79d11 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/DeltaParquetWriteSupport.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/DeltaParquetWriteSupport.scala @@ -105,15 +105,17 @@ class DeltaParquetWriteSupport extends ParquetWriteSupport { val id = getNestedFieldId(parentField, relElemFieldPath) val elementField = field.asGroupType().getFields.get(0).asGroupType().getFields.get(0).withId(id) - Types + val builder = Types .buildGroup(field.getRepetition).as(LogicalTypeAnnotation.listType()) .addField( Types.repeatedGroup() .addField(convert(elementField, parentField, sparkSchema, absolutePath :+ PARQUET_LIST_ELEMENT_FIELD_NAME, relElemFieldPath)) .named("list")) - .id(field.getId.intValue()) - .named(field.getName) + if (field.getId != null) { + builder.id(field.getId.intValue()) + } + builder.named(field.getName) case _: MapLogicalTypeAnnotation => val relKeyFieldPath = relativePath :+ PARQUET_MAP_KEY_FIELD_NAME val relValFieldPath = relativePath :+ PARQUET_MAP_VALUE_FIELD_NAME @@ -123,7 +125,7 @@ class DeltaParquetWriteSupport extends ParquetWriteSupport { field.asGroupType().getFields.get(0).asGroupType().getFields.get(0).withId(keyId) val valueField = field.asGroupType().getFields.get(0).asGroupType().getFields.get(1).withId(valId) - Types + val builder = Types .buildGroup(field.getRepetition).as(LogicalTypeAnnotation.mapType()) .addField( Types @@ -133,8 +135,10 @@ class DeltaParquetWriteSupport extends ParquetWriteSupport { .addField(convert(valueField, parentField, sparkSchema, absolutePath :+ PARQUET_MAP_VALUE_FIELD_NAME, relValFieldPath)) .named("key_value")) - .id(field.getId.intValue()) - .named(field.getName) + if (field.getId != null) { + builder.id(field.getId.intValue()) + } + builder.named(field.getName) case _ if field.isPrimitive => field case _ => val builder = Types.buildGroup(field.getRepetition) @@ -143,7 +147,10 @@ class DeltaParquetWriteSupport extends ParquetWriteSupport { val parentField = findFieldInSparkSchema(sparkSchema, absPath) builder.addField(convert(field, parentField, sparkSchema, absPath, Seq(field.getName))) } - builder.id(field.getId.intValue()).named(field.getName) + if (field.getId != null) { + builder.id(field.getId.intValue()) + } + builder.named(field.getName) } } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/IcebergCompat.scala b/spark/src/main/scala/org/apache/spark/sql/delta/IcebergCompat.scala index b84441cfc7b..014fb3d4aea 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/IcebergCompat.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/IcebergCompat.scala @@ -60,6 +60,7 @@ object IcebergCompatV2 extends IcebergCompat( CheckOnlySingleVersionEnabled, CheckAddFileHasStats, CheckTypeInV2AllowList, + CheckPartitionDataTypeInV2AllowList, CheckNoPartitionEvolution, CheckDeletionVectorDisabled ) @@ -398,6 +399,26 @@ object CheckTypeInV2AllowList extends IcebergCompatCheck { } } +object CheckPartitionDataTypeInV2AllowList extends IcebergCompatCheck { + private val allowedTypes = Set[Class[_]] ( + ByteType.getClass, ShortType.getClass, IntegerType.getClass, LongType.getClass, + FloatType.getClass, DoubleType.getClass, 
DecimalType.getClass, + StringType.getClass, BinaryType.getClass, + BooleanType.getClass, + TimestampType.getClass, TimestampNTZType.getClass, DateType.getClass + ) + override def apply(context: IcebergCompatContext): Unit = { + val partitionSchema = context.newestMetadata.partitionSchema + partitionSchema.fields.find(field => !allowedTypes.contains(field.dataType.getClass)) + match { + case Some(field) => + throw DeltaErrors.icebergCompatUnsupportedPartitionDataTypeException( + context.version, field.dataType, partitionSchema) + case _ => + } + } +} + /** * Check if the deletion vector has been disabled by previous snapshot * or newest metadata and protocol depending on whether the operation diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala b/spark/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala index 37aee7a3c82..dcafb62bd57 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.delta // scalastyle:off import.ordering.noEmptyLine import java.nio.file.FileAlreadyExistsException -import java.util.{ConcurrentModificationException, UUID} +import java.util.{ConcurrentModificationException, Optional, UUID} import java.util.concurrent.TimeUnit.NANOSECONDS import scala.collection.JavaConverters._ @@ -216,10 +216,14 @@ object OptimisticTransaction { * `OptimisticTransaction.withNewTransaction`. Use that to create and set active txns. */ private[delta] def setActive(txn: OptimisticTransaction): Unit = { - if (active.get != null) { - throw DeltaErrors.activeTransactionAlreadySet() + getActive() match { + case Some(activeTxn) => + if (!(activeTxn eq txn)) { + throw DeltaErrors.activeTransactionAlreadySet() + } + case _ => + active.set(txn) } - active.set(txn) } /** @@ -375,18 +379,6 @@ trait OptimisticTransactionImpl extends TransactionalWrite protected var checkUnsupportedDataType: Boolean = spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_SCHEMA_TYPE_CHECK) - // Some operations (e.g. stats collection) may set files with DVs back to tight bounds. - // In that case they need to skip this check. - protected var checkDeletionVectorFilesHaveWideBounds: Boolean = true - /** - * Disable the check that ensures that all files with DVs added have tightBounds set to false. - * - * This is necessary when recomputing the stats on a table with DVs. - */ - def disableDeletionVectorFilesHaveWideBoundsCheck(): Unit = { - checkDeletionVectorFilesHaveWideBounds = false - } - // An array of tuples where each tuple represents a pair (colName, newHighWatermark). // This is collected after a write into Delta table with IDENTITY columns. If it's not // empty, we will update the high water marks during transaction commit. Note that the same @@ -559,6 +551,7 @@ trait OptimisticTransactionImpl extends TransactionalWrite // The `.schema` cannot be generated correctly unless the column mapping metadata is correctly // filled for all the fields. Therefore, the column mapping changes need to happen first. newMetadataTmp = DeltaColumnMapping.verifyAndUpdateMetadataChange( + spark, deltaLog, protocolBeforeUpdate, snapshot.metadata, @@ -708,9 +701,8 @@ trait OptimisticTransactionImpl extends TransactionalWrite } // We are done with protocol versions and features, time to remove related table properties. 
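The CheckPartitionDataTypeInV2AllowList check added in IcebergCompat.scala above reduces to a class-based allow-list lookup per partition field; a simplified sketch of that membership test, outside the real IcebergCompatContext plumbing:

import org.apache.spark.sql.types._

val allowedPartitionTypes: Set[Class[_]] =
  Set(IntegerType.getClass, LongType.getClass, StringType.getClass, DateType.getClass /* ... */)

val dateField  = StructField("event_date", DateType)
val arrayField = StructField("tags", ArrayType(StringType))

allowedPartitionTypes.contains(dateField.dataType.getClass)   // true  -> passes the check
allowedPartitionTypes.contains(arrayField.dataType.getClass)  // false -> the real check throws
// DeltaErrors.icebergCompatUnsupportedPartitionDataTypeException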
- val configsWithoutProtocolProps = newMetadataTmp.configuration.filterNot { - case (k, _) => TableFeatureProtocolUtils.isTableProtocolProperty(k) - } + val configsWithoutProtocolProps = + Protocol.filterProtocolPropsFromTableProps(newMetadataTmp.configuration) // Table features Part 3: add automatically-enabled features by looking at the new table // metadata. // @@ -752,6 +744,44 @@ trait OptimisticTransactionImpl extends TransactionalWrite updateMetadata(metadata) } + /** + * Updates the metadata of the target table in an effective REPLACE command. Note that replacing + * a table is similar to dropping a table and then recreating it. However, the backing catalog + * object does not change. For now, for Coordinated Commit tables, this function retains the + * coordinator details (and other associated Coordinated Commits properties) from the original + * table during a REPLACE. And if the table had a coordinator, existing ICT properties are also + * retained; otherwise, default ICT properties are included. + * TODO (YumingxuanGuo): Remove this once the exact semantic on default Coordinated Commits + * configurations is finalized. + */ + def updateMetadataForNewTableInReplace(metadata: Metadata): Unit = { + assert(CoordinatedCommitsUtils.extractCoordinatedCommitsConfigurations( + metadata.configuration).isEmpty, + "Command-specified Coordinated Commits configurations should have been blocked earlier.") + // Extract the existing Coordinated Commits configurations and ICT dependency configurations + // from the existing table metadata. + val existingCCConfs = CoordinatedCommitsUtils.extractCoordinatedCommitsConfigurations( + snapshot.metadata.configuration) + val existingICTConfs = CoordinatedCommitsUtils.extractICTConfigurations( + snapshot.metadata.configuration) + // Update the metadata. + updateMetadataForNewTable(metadata) + // Now the `txn.metadata` contains all the command-specified properties and all the default + // properties. The latter might still contain Coordinated Commits configurations, so we need + // to remove them and retain the Coordinated Commits configurations from the existing table. + val newConfsWithoutCC = newMetadata.get.configuration -- + CoordinatedCommitsUtils.TABLE_PROPERTY_KEYS + var newConfs: Map[String, String] = newConfsWithoutCC ++ existingCCConfs + // We also need to retain the existing ICT dependency configurations, but only when the + // existing table does have Coordinated Commits configurations. Otherwise, we treat the ICT + // configurations the same as any other configurations, by merging them from the default. 
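A hedged sketch of the configuration precedence this REPLACE path applies, using placeholder property keys (the real code relies on CoordinatedCommitsUtils.TABLE_PROPERTY_KEYS and ICT_TABLE_PROPERTY_KEYS):

// Placeholder keys and values, for illustration only.
val ccKeys  = Set("cc.coordinator")
val ictKeys = Set("ict.enabled")
val existingCC  = Map("cc.coordinator" -> "my-coordinator")
val existingICT = Map("ict.enabled" -> "true")
val afterUpdate = Map("ict.enabled" -> "false", "comment" -> "replaced")  // command + defaults

var merged = (afterUpdate -- ccKeys) ++ existingCC   // coordinator config always survives REPLACE
if (existingCC.nonEmpty) {
  merged = (merged -- ictKeys) ++ existingICT        // ICT follows the old table only if it had a coordinator
}
// merged == Map("comment" -> "replaced", "cc.coordinator" -> "my-coordinator", "ict.enabled" -> "true")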
+ if (existingCCConfs.nonEmpty) { + val newConfsWithoutICT = newConfs -- CoordinatedCommitsUtils.ICT_TABLE_PROPERTY_KEYS + newConfs = newConfsWithoutICT ++ existingICTConfs + } + newMetadata = Some(newMetadata.get.copy(configuration = newConfs)) + } + /** * Records an update to the metadata that should be committed with this transaction and when * this transaction is attempt to overwrite the data and schema using .mode('overwrite') and @@ -847,35 +877,44 @@ trait OptimisticTransactionImpl extends TransactionalWrite protected def getAssertDeletionVectorWellFormedFunc( spark: SparkSession, op: DeltaOperations.Operation): (Action => Unit) = { - val deletionVectorCreationAllowed = - DeletionVectorUtils.deletionVectorsWritable(snapshot, newProtocol, newMetadata) - val isComputeStatsOperation = op.isInstanceOf[DeltaOperations.ComputeStats] val commitCheckEnabled = spark.conf.get(DeltaSQLConf.DELETION_VECTORS_COMMIT_CHECK_ENABLED) + if (!commitCheckEnabled) { + return _ => {} + } + + // Whether DVs are supported, i.e. the table is allowed to contain any DVs. + val deletionVectorsSupported = + DeletionVectorUtils.deletionVectorsReadable(snapshot, newProtocol, newMetadata) + // Whether DVs are enabled, i.e. operations are allowed to create new DVs. + val deletionVectorsEnabled = + DeletionVectorUtils.deletionVectorsWritable(snapshot, newProtocol, newMetadata) - val deletionVectorDisallowedForAddFiles = - commitCheckEnabled && !isComputeStatsOperation && !deletionVectorCreationAllowed + // If the operation does not define whether it performs in-place metadata updates, we are + // conservative and assume that it is not, which makes the check stricter. + val isInPlaceFileMetadataUpdate = op.isInPlaceFileMetadataUpdate.getOrElse(false) + val deletionVectorAllowedForAddFiles = + deletionVectorsSupported && (deletionVectorsEnabled || isInPlaceFileMetadataUpdate) - val addFileMustHaveWideBounds = deletionVectorCreationAllowed && - checkDeletionVectorFilesHaveWideBounds + val addFileMustHaveWideBounds = op.checkAddFileWithDeletionVectorStatsAreNotTightBounds action => action match { - case a: AddFile => - if (deletionVectorDisallowedForAddFiles && a.deletionVector != null) { + case a: AddFile if a.deletionVector != null => + if (!deletionVectorAllowedForAddFiles) { throw DeltaErrors.addingDeletionVectorsDisallowedException() } + // Protocol requirement checks: // 1. All files with DVs must have `stats` with `numRecords`. - if (a.deletionVector != null && (a.stats == null || a.numPhysicalRecords.isEmpty)) { + if (a.stats == null || a.numPhysicalRecords.isEmpty) { throw DeltaErrors.addFileWithDVsMissingNumRecordsException } // 2. All operations that add new DVs should always turn bounds to wide. // Operations that only update files with existing DVs may opt-out from this rule - // via `disableDeletionVectorFilesHaveWideBoundsCheck()`. - // (e.g. stats collection, metadata-only updates.) + // via `checkAddFileWithDeletionVectorStatsAreNotTightBounds`. + // See that field comment in DeltaOperation for more details. // Note, the absence of the tightBounds column when DVs exist is also an illegal state. if (addFileMustHaveWideBounds && - a.deletionVector != null && // Extra inversion to also catch absent `tightBounds`. 
!a.tightBounds.contains(false)) { throw DeltaErrors.addFileWithDVsAndTightBoundsException() @@ -1556,7 +1595,8 @@ trait OptimisticTransactionImpl extends TransactionalWrite val updatedActions = new UpdatedActions( commitInfo, metadata, protocol, snapshot.metadata, snapshot.protocol) val commitResponse = TransactionExecutionObserver.withObserver(executionObserver) { - effectiveTableCommitCoordinatorClient.commit(attemptVersion, jsonActions, updatedActions) + effectiveTableCommitCoordinatorClient.commit( + attemptVersion, jsonActions, updatedActions, catalogTable.map(_.identifier)) } // TODO(coordinated-commits): Use the right timestamp method on top of CommitInfo once ICT is // merged. @@ -1567,6 +1607,7 @@ trait OptimisticTransactionImpl extends TransactionalWrite Some(attemptVersion)) commitEndNano = System.nanoTime() committed = true + executionObserver.beginPostCommit() // NOTE: commitLarge cannot run postCommitHooks (such as the CheckpointHook). // Instead, manually run any necessary actions in updateAndCheckpoint. val postCommitSnapshot = updateAndCheckpoint( @@ -1706,8 +1747,14 @@ trait OptimisticTransactionImpl extends TransactionalWrite log"file-system based table to coordinated-commits table: " + log"[commit-coordinator: ${MDC(DeltaLogKeys.COORDINATOR_NAME, commitCoordinatorName)}" + log", conf: ${MDC(DeltaLogKeys.COORDINATOR_CONF, commitCoordinatorConf)}]") + val tableIdentifierOpt = + CoordinatedCommitsUtils.toCCTableIdentifier(catalogTable.map(_.identifier)) newCoordinatedCommitsTableConf = Some(newCommitCoordinatorClient.registerTable( - deltaLog.logPath, readVersion, finalMetadata, protocol).asScala.toMap) + deltaLog.logPath, + tableIdentifierOpt, + readVersion, + finalMetadata, + protocol).asScala.toMap) case (None, Some(readCommitCoordinatorClient)) => // CC -> FS conversion val (newOwnerName, newOwnerConf) = @@ -2185,6 +2232,7 @@ trait OptimisticTransactionImpl extends TransactionalWrite commitEndNano = System.nanoTime() + executionObserver.beginPostCommit() val postCommitSnapshot = deltaLog.updateAfterCommit( attemptVersion, commit, @@ -2277,11 +2325,11 @@ trait OptimisticTransactionImpl extends TransactionalWrite override def commit( logStore: io.delta.storage.LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { + val logPath = tableDesc.getLogPath // Get thread local observer for Fuzz testing purpose. 
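The rewritten deletion-vector commit check above boils down to a single predicate; a hedged restatement for clarity, with parameter names mirroring the locals in getAssertDeletionVectorWellFormedFunc:

def dvAllowedForAddFiles(
    deletionVectorsSupported: Boolean,      // table may contain DVs (readable)
    deletionVectorsEnabled: Boolean,        // operations may create new DVs (writable)
    isInPlaceFileMetadataUpdate: Boolean    // op only rewrites metadata of existing files
): Boolean =
  deletionVectorsSupported && (deletionVectorsEnabled || isInPlaceFileMetadataUpdate)

// e.g. a ComputeStats-style commit on a table where DV creation is disabled but DVs remain:
dvAllowedForAddFiles(
  deletionVectorsSupported = true,
  deletionVectorsEnabled = false,
  isInPlaceFileMetadataUpdate = true)       // true: the AddFile may keep its DV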
val executionObserver = TransactionExecutionObserver.getObserver val commitFile = util.FileNames.unsafeDeltaFile(logPath, commitVersion) @@ -2315,8 +2363,7 @@ trait OptimisticTransactionImpl extends TransactionalWrite } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, startVersion: java.lang.Long, endVersion: java.lang.Long): GetCommitsResponse = new GetCommitsResponse(Seq.empty.asJava, -1) @@ -2324,8 +2371,7 @@ trait OptimisticTransactionImpl extends TransactionalWrite override def backfillToVersion( logStore: io.delta.storage.LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersion: java.lang.Long): Unit = {} @@ -2344,6 +2390,7 @@ trait OptimisticTransactionImpl extends TransactionalWrite override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): java.util.Map[String, String] = @@ -2380,7 +2427,8 @@ trait OptimisticTransactionImpl extends TransactionalWrite val updatedActions = currentTransactionInfo.getUpdatedActions(snapshot.metadata, snapshot.protocol) val commitResponse = TransactionExecutionObserver.withObserver(executionObserver) { - tableCommitCoordinatorClient.commit(attemptVersion, jsonActions, updatedActions) + tableCommitCoordinatorClient.commit( + attemptVersion, jsonActions, updatedActions, catalogTable.map(_.identifier)) } if (attemptVersion == 0L) { val expectedPathForCommitZero = unsafeDeltaFile(deltaLog.logPath, version = 0L).toUri diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/PreDowngradeTableFeatureCommand.scala b/spark/src/main/scala/org/apache/spark/sql/delta/PreDowngradeTableFeatureCommand.scala index bc1f0242096..5c0e5063965 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/PreDowngradeTableFeatureCommand.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/PreDowngradeTableFeatureCommand.scala @@ -61,7 +61,8 @@ case class TestWriterFeaturePreDowngradeCommand(table: DeltaTableV2) } val properties = Seq(TestRemovableWriterFeature.TABLE_PROP_KEY) - AlterTableUnsetPropertiesDeltaCommand(table, properties, ifExists = true).run(table.spark) + AlterTableUnsetPropertiesDeltaCommand( + table, properties, ifExists = true, fromDropFeatureCommand = true).run(table.spark) true } } @@ -77,7 +78,8 @@ case class TestWriterWithHistoryValidationFeaturePreDowngradeCommand(table: Delt } val properties = Seq(TestRemovableWriterWithHistoryTruncationFeature.TABLE_PROP_KEY) - AlterTableUnsetPropertiesDeltaCommand(table, properties, ifExists = true).run(table.spark) + AlterTableUnsetPropertiesDeltaCommand( + table, properties, ifExists = true, fromDropFeatureCommand = true).run(table.spark) true } } @@ -95,7 +97,8 @@ case class TestReaderWriterFeaturePreDowngradeCommand(table: DeltaTableV2) } val properties = Seq(TestRemovableReaderWriterFeature.TABLE_PROP_KEY) - AlterTableUnsetPropertiesDeltaCommand(table, properties, ifExists = true).run(table.spark) + AlterTableUnsetPropertiesDeltaCommand( + table, properties, ifExists = true, fromDropFeatureCommand = true).run(table.spark) true } } @@ -107,7 +110,8 @@ case class TestLegacyWriterFeaturePreDowngradeCommand(table: DeltaTableV2) if (TestRemovableLegacyWriterFeature.validateRemoval(table.initialSnapshot)) return false val properties = 
Seq(TestRemovableLegacyWriterFeature.TABLE_PROP_KEY) - AlterTableUnsetPropertiesDeltaCommand(table, properties, ifExists = true).run(table.spark) + AlterTableUnsetPropertiesDeltaCommand( + table, properties, ifExists = true, fromDropFeatureCommand = true).run(table.spark) true } } @@ -119,7 +123,8 @@ case class TestLegacyReaderWriterFeaturePreDowngradeCommand(table: DeltaTableV2) if (TestRemovableLegacyReaderWriterFeature.validateRemoval(table.initialSnapshot)) return false val properties = Seq(TestRemovableLegacyReaderWriterFeature.TABLE_PROP_KEY) - AlterTableUnsetPropertiesDeltaCommand(table, properties, ifExists = true).run(table.spark) + AlterTableUnsetPropertiesDeltaCommand( + table, properties, ifExists = true, fromDropFeatureCommand = true).run(table.spark) true } } @@ -251,7 +256,11 @@ case class CoordinatedCommitsPreDowngradeCommand(table: DeltaTableV2) traceRemovalNeeded = true try { AlterTableUnsetPropertiesDeltaCommand( - table, CoordinatedCommitsUtils.TABLE_PROPERTY_KEYS, ifExists = true).run(table.spark) + table, + CoordinatedCommitsUtils.TABLE_PROPERTY_KEYS, + ifExists = true, + fromDropFeatureCommand = true + ).run(table.spark) } catch { case NonFatal(e) => exceptionOpt = Some(e) @@ -304,7 +313,8 @@ case class TypeWideningPreDowngradeCommand(table: DeltaTableV2) val startTimeNs = System.nanoTime() val properties = Seq(DeltaConfigs.ENABLE_TYPE_WIDENING.key) - AlterTableUnsetPropertiesDeltaCommand(table, properties, ifExists = true).run(table.spark) + AlterTableUnsetPropertiesDeltaCommand( + table, properties, ifExists = true, fromDropFeatureCommand = true).run(table.spark) val numFilesRewritten = rewriteFilesIfNeeded() val metadataRemoved = removeMetadataIfNeeded() diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableMerge.scala b/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableMerge.scala index 9a72ec99098..4a18b20633e 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableMerge.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableMerge.scala @@ -42,6 +42,8 @@ import org.apache.spark.sql.types.{DataType, DateType, StringType, StructField, case class PreprocessTableMerge(override val conf: SQLConf) extends Rule[LogicalPlan] with UpdateExpressionsSupport { + override protected val supportMergeAndUpdateLegacyCastBehavior: Boolean = true + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { case m: DeltaMergeInto if m.resolved => apply(m, true) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableUpdate.scala b/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableUpdate.scala index 61db471d937..fcce8aa2837 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableUpdate.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/PreprocessTableUpdate.scala @@ -37,6 +37,8 @@ case class PreprocessTableUpdate(sqlConf: SQLConf) override def conf: SQLConf = sqlConf + override protected val supportMergeAndUpdateLegacyCastBehavior: Boolean = true + override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators { case u: DeltaUpdateTable if u.resolved => u.condition.foreach { cond => diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/Snapshot.scala b/spark/src/main/scala/org/apache/spark/sql/delta/Snapshot.scala index 66847985ff3..7ed68404544 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/Snapshot.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/Snapshot.scala @@ -38,6 +38,7 @@ import 
org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.internal.{MDC, MessageWithContext} import org.apache.spark.sql._ +import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType import org.apache.spark.util.Utils @@ -541,7 +542,7 @@ class Snapshot( } base.put(Protocol.MIN_READER_VERSION_PROP, protocol.minReaderVersion.toString) base.put(Protocol.MIN_WRITER_VERSION_PROP, protocol.minWriterVersion.toString) - if (protocol.supportsReaderFeatures || protocol.supportsWriterFeatures) { + if (protocol.supportsTableFeatures) { val features = protocol.readerAndWriterFeatureNames.map(name => s"${TableFeatureProtocolUtils.FEATURE_PROP_PREFIX}$name" -> TableFeatureProtocolUtils.FEATURE_PROP_SUPPORTED) @@ -572,7 +573,7 @@ class Snapshot( * @throws IllegalStateException * if the delta file for the current version is not found after backfilling. */ - def ensureCommitFilesBackfilled(): Unit = { + def ensureCommitFilesBackfilled(tableIdentifierOpt: Option[TableIdentifier]): Unit = { val tableCommitCoordinatorClient = getTableCommitCoordinatorForWrites.getOrElse { return } @@ -580,6 +581,7 @@ class Snapshot( if (minUnbackfilledVersion <= version) { val hadoopConf = deltaLog.newDeltaHadoopConf() tableCommitCoordinatorClient.backfillToVersion( + tableIdentifierOpt, version, lastKnownBackfilledVersion = Some(minUnbackfilledVersion - 1)) val fs = deltaLog.logPath.getFileSystem(hadoopConf) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/SnapshotManagement.scala b/spark/src/main/scala/org/apache/spark/sql/delta/SnapshotManagement.scala index 54ea53e483c..cfb7bb6351f 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/SnapshotManagement.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/SnapshotManagement.scala @@ -174,9 +174,13 @@ trait SnapshotManagement { self: DeltaLog => // Submit a potential async call to get commits from commit coordinator if available val threadPool = SnapshotManagement.commitCoordinatorGetCommitsThreadPool + // TODO(table-identifier-plumbing): Plumb the right tableIdentifier from the deltaLog.update and + // Cold deltaLog initialization codepath. 
+ val tableIdentifierOpt = None def getCommitsTask(isAsyncRequest: Boolean): GetCommitsResponse = { CoordinatedCommitsUtils.getCommitsFromCommitCoordinatorWithUsageLogs( - this, tableCommitCoordinatorClient, startVersion, versionToLoad, isAsyncRequest) + this, tableCommitCoordinatorClient, tableIdentifierOpt, + startVersion, versionToLoad, isAsyncRequest) } val unbackfilledCommitsResponseFuture = if (threadPool.getActiveCount < threadPool.getMaximumPoolSize) { diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/TableFeature.scala b/spark/src/main/scala/org/apache/spark/sql/delta/TableFeature.scala index 1edf8d8c024..1c15086b768 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/TableFeature.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/TableFeature.scala @@ -540,10 +540,7 @@ object ColumnMappingTableFeature override def validateRemoval(snapshot: Snapshot): Boolean = { val schemaHasNoColumnMappingMetadata = - SchemaMergingUtils.explode(snapshot.schema).forall { case (_, col) => - !DeltaColumnMapping.hasPhysicalName(col) && - !DeltaColumnMapping.hasColumnId(col) - } + !DeltaColumnMapping.schemaHasColumnMappingMetadata(snapshot.schema) val metadataHasNoMappingMode = snapshot.metadata.columnMappingMode match { case NoMapping => true case _ => false diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/TransactionExecutionObserver.scala b/spark/src/main/scala/org/apache/spark/sql/delta/TransactionExecutionObserver.scala index 68268b10ced..1a989ef5775 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/TransactionExecutionObserver.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/TransactionExecutionObserver.scala @@ -77,6 +77,9 @@ trait TransactionExecutionObserver /** Called after publishing the commit file but before the `backfill` attempt. */ def beginBackfill(): Unit + /** Called after backfill but before the `postCommit` attempt. */ + def beginPostCommit(): Unit + /** Called once a commit succeeded. */ def transactionCommitted(): Unit @@ -111,6 +114,8 @@ object NoOpTransactionExecutionObserver extends TransactionExecutionObserver { override def beginBackfill(): Unit = () + override def beginPostCommit(): Unit = () + override def transactionCommitted(): Unit = () override def transactionAborted(): Unit = () diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/UpdateExpressionsSupport.scala b/spark/src/main/scala/org/apache/spark/sql/delta/UpdateExpressionsSupport.scala index 9a78f3ac4cf..79143153372 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/UpdateExpressionsSupport.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/UpdateExpressionsSupport.scala @@ -36,6 +36,15 @@ import org.apache.spark.sql.types._ * nested fields. */ trait UpdateExpressionsSupport extends SQLConfHelper with AnalysisHelper with DeltaLogging { + + /** + * Whether casting behavior can revert to following 'spark.sql.ansi.enabled' instead of + * 'spark.sql.storeAssignmentPolicy' to preserve legacy behavior for UPDATE and MERGE. + * Legacy behavior is applied only if + * 'spark.databricks.delta.updateAndMergeCastingFollowsAnsiEnabledFlag' is set to true. + */ + protected val supportMergeAndUpdateLegacyCastBehavior: Boolean = false + /** * Specifies an operation that updates a target column with the given expression. 
* The target column may or may not be a nested field and it is specified as a full quoted name @@ -440,12 +449,14 @@ trait UpdateExpressionsSupport extends SQLConfHelper with AnalysisHelper with De } /** - * Replaces 'CastSupport.cast'. Selects a cast based on 'spark.sql.storeAssignmentPolicy' if - * 'spark.databricks.delta.updateAndMergeCastingFollowsAnsiEnabledFlag. is false, and based on - * 'spark.sql.ansi.enabled' otherwise. + * Replaces 'CastSupport.cast'. Selects a cast based on 'spark.sql.storeAssignmentPolicy'. + * Legacy behavior for UPDATE and MERGE followed 'spark.sql.ansi.enabled' instead, this legacy + * behavior can be re-enabled by setting + * 'spark.databricks.delta.updateAndMergeCastingFollowsAnsiEnabledFlag' to true. */ private def cast(child: Expression, dataType: DataType, columnName: String): Expression = { - if (conf.getConf(DeltaSQLConf.UPDATE_AND_MERGE_CASTING_FOLLOWS_ANSI_ENABLED_FLAG)) { + if (supportMergeAndUpdateLegacyCastBehavior && + conf.getConf(DeltaSQLConf.UPDATE_AND_MERGE_CASTING_FOLLOWS_ANSI_ENABLED_FLAG)) { return Cast(child, dataType, Option(conf.sessionLocalTimeZone)) } @@ -455,7 +466,12 @@ trait UpdateExpressionsSupport extends SQLConfHelper with AnalysisHelper with De case SQLConf.StoreAssignmentPolicy.ANSI => val cast = Cast(child, dataType, Some(conf.sessionLocalTimeZone), ansiEnabled = true) if (canCauseCastOverflow(cast)) { - CheckOverflowInTableWrite(cast, columnName) + if (supportMergeAndUpdateLegacyCastBehavior) { + CheckOverflowInTableWrite(cast, columnName) + } else { + cast.setTagValue(Cast.BY_TABLE_INSERTION, ()) + CheckOverflowInTableInsert(cast, columnName) + } } else { cast } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/actions/TableFeatureSupport.scala b/spark/src/main/scala/org/apache/spark/sql/delta/actions/TableFeatureSupport.scala index b9065499084..2d41fae0f79 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/actions/TableFeatureSupport.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/actions/TableFeatureSupport.scala @@ -39,14 +39,32 @@ import com.fasterxml.jackson.annotation.JsonIgnore */ trait TableFeatureSupport { this: Protocol => - /** Check if this protocol is capable of adding features into its `readerFeatures` field. */ + /** + * Check if this protocol can support arbitrary reader features. If this returns false, + * then the table may still be able to support the "columnMapping" feature. + * See [[canSupportColumnMappingFeature]] below. + */ def supportsReaderFeatures: Boolean = TableFeatureProtocolUtils.supportsReaderFeatures(minReaderVersion) + /** + * Check if this protocol is in table feature representation and can support column mapping. + * Column mapping is the only legacy reader feature and requires special handling in some + * cases. + */ + def canSupportColumnMappingFeature: Boolean = + TableFeatureProtocolUtils.canSupportColumnMappingFeature(minReaderVersion, minWriterVersion) + /** Check if this protocol is capable of adding features into its `writerFeatures` field. */ def supportsWriterFeatures: Boolean = TableFeatureProtocolUtils.supportsWriterFeatures(minWriterVersion) + /** + * As soon as a protocol supports writer features it is considered a table features protocol. + * It is not possible to support reader features without supporting writer features. + */ + def supportsTableFeatures: Boolean = supportsWriterFeatures + /** * Get a new Protocol object that has `feature` supported. 
Writer-only features will be added to * `writerFeatures` field, and reader-writer features will be added to `readerFeatures` and @@ -60,7 +78,7 @@ trait TableFeatureSupport { this: Protocol => */ def withFeature(feature: TableFeature): Protocol = { def shouldAddRead: Boolean = { - if (supportsReaderFeatures) return true + if (feature == ColumnMappingTableFeature && canSupportColumnMappingFeature) return true if (feature.minReaderVersion <= minReaderVersion) return false throw DeltaErrors.tableFeatureRequiresHigherReaderProtocolVersion( @@ -111,25 +129,13 @@ trait TableFeatureSupport { this: Protocol => * `writerFeatures` field. * * The method does not require the feature to be recognized by the client, therefore will not - * try keeping the protocol's `readerFeatures` and `writerFeatures` in sync. Use with caution. + * try keeping the protocol's `readerFeatures` and `writerFeatures` in sync. + * Should never be used directly. Always use withFeature(feature: TableFeature): Protocol. */ private[actions] def withFeature( name: String, addToReaderFeatures: Boolean, addToWriterFeatures: Boolean): Protocol = { - if (addToReaderFeatures && !supportsReaderFeatures) { - throw DeltaErrors.tableFeatureRequiresHigherReaderProtocolVersion( - name, - currentVersion = minReaderVersion, - requiredVersion = TableFeatureProtocolUtils.TABLE_FEATURES_MIN_READER_VERSION) - } - if (addToWriterFeatures && !supportsWriterFeatures) { - throw DeltaErrors.tableFeatureRequiresHigherWriterProtocolVersion( - name, - currentVersion = minWriterVersion, - requiredVersion = TableFeatureProtocolUtils.TABLE_FEATURES_MIN_WRITER_VERSION) - } - val addedReaderFeatureOpt = if (addToReaderFeatures) Some(name) else None val addedWriterFeatureOpt = if (addToWriterFeatures) Some(name) else None @@ -143,11 +149,11 @@ trait TableFeatureSupport { this: Protocol => * `readerFeatures` field. * * The method does not require the features to be recognized by the client, therefore will not - * try keeping the protocol's `readerFeatures` and `writerFeatures` in sync. Use with caution. + * try keeping the protocol's `readerFeatures` and `writerFeatures` in sync. + * Intended only for testing. Use with caution. */ private[delta] def withReaderFeatures(names: Iterable[String]): Protocol = { - names.foldLeft(this)( - _.withFeature(_, addToReaderFeatures = true, addToWriterFeatures = false)) + names.foldLeft(this)(_.withFeature(_, addToReaderFeatures = true, addToWriterFeatures = false)) } /** @@ -155,11 +161,11 @@ trait TableFeatureSupport { this: Protocol => * `writerFeatures` field. * * The method does not require the features to be recognized by the client, therefore will not - * try keeping the protocol's `readerFeatures` and `writerFeatures` in sync. Use with caution. + * try keeping the protocol's `readerFeatures` and `writerFeatures` in sync. + * Intended only for testing. Use with caution. 
*/ private[delta] def withWriterFeatures(names: Iterable[String]): Protocol = { - names.foldLeft(this)( - _.withFeature(_, addToReaderFeatures = false, addToWriterFeatures = true)) + names.foldLeft(this)(_.withFeature(_, addToReaderFeatures = false, addToWriterFeatures = true)) } /** @@ -203,14 +209,16 @@ trait TableFeatureSupport { this: Protocol => */ @JsonIgnore lazy val implicitlySupportedFeatures: Set[TableFeature] = { - if (supportsReaderFeatures && supportsWriterFeatures) { - // this protocol uses both reader and writer features, no feature can be implicitly supported + if (supportsTableFeatures) { + // As soon as a protocol supports writer features, all features need to be explicitly defined. + // This includes legacy reader features (the only one is Column Mapping), even if the + // reader protocol is legacy and explicitly supports Column Mapping. Set() } else { TableFeature.allSupportedFeaturesMap.values .filter(_.isLegacyFeature) - .filterNot(supportsReaderFeatures || this.minReaderVersion < _.minReaderVersion) - .filterNot(supportsWriterFeatures || this.minWriterVersion < _.minWriterVersion) + .filter(_.minReaderVersion <= this.minReaderVersion) + .filter(_.minWriterVersion <= this.minWriterVersion) .toSet } } @@ -271,14 +279,11 @@ trait TableFeatureSupport { this: Protocol => val protocols = this +: others val mergedReaderVersion = protocols.map(_.minReaderVersion).max val mergedWriterVersion = protocols.map(_.minWriterVersion).max - val mergedReaderFeatures = protocols.flatMap(_.readerFeatureNames) - val mergedWriterFeatures = protocols.flatMap(_.writerFeatureNames) + val mergedFeatures = protocols.flatMap(_.readerAndWriterFeatures) val mergedImplicitFeatures = protocols.flatMap(_.implicitlySupportedFeatures) val mergedProtocol = Protocol(mergedReaderVersion, mergedWriterVersion) - .withReaderFeatures(mergedReaderFeatures) - .withWriterFeatures(mergedWriterFeatures) - .withFeatures(mergedImplicitFeatures) + .withFeatures(mergedFeatures ++ mergedImplicitFeatures) // The merged protocol is always normalized in order to represent the protocol // with the weakest possible form. This enables backward compatibility. @@ -348,7 +353,7 @@ trait TableFeatureSupport { this: Protocol => */ def normalized: Protocol = { // Normalization can only be applied to table feature protocols. - if (!supportsWriterFeatures) return this + if (!supportsTableFeatures) return this val (minReaderVersion, minWriterVersion) = TableFeatureProtocolUtils.minimumRequiredVersions(readerAndWriterFeatures) @@ -371,7 +376,7 @@ trait TableFeatureSupport { this: Protocol => */ def denormalized: Protocol = { // Denormalization can only be applied to legacy protocols. - if (supportsWriterFeatures) return this + if (supportsTableFeatures) return this val (minReaderVersion, _) = TableFeatureProtocolUtils.minimumRequiredVersions(implicitlySupportedFeatures.toSeq) @@ -419,7 +424,7 @@ object TableFeatureProtocolUtils { /** The string constant "supported" for uses in table properties. */ val FEATURE_PROP_SUPPORTED = "supported" - /** Min reader version that supports reader features. */ + /** Min reader version that supports native reader features. */ val TABLE_FEATURES_MIN_READER_VERSION = 3 /** Min reader version that supports writer features. */ @@ -440,8 +445,20 @@ object TableFeatureProtocolUtils { s"$DEFAULT_FEATURE_PROP_PREFIX$featureName" /** - * Determine whether a [[Protocol]] with the given reader protocol version is capable of adding - * features into its `readerFeatures` field. 
+ * Determine whether a [[Protocol]] with the given reader protocol version can support column + * mapping. All table feature protocols that can support column mapping are capable of adding + * the feature to the `readerFeatures` field. This includes legacy reader protocol version + * (2, 7). + */ + def canSupportColumnMappingFeature(readerVersion: Int, writerVersion: Int): Boolean = { + readerVersion >= ColumnMappingTableFeature.minReaderVersion && + supportsWriterFeatures(writerVersion) + } + + /** + * Determine whether a [[Protocol]] with the given reader protocol version supports + * native features. All protocols that can support native reader features are capable + * of adding the feature to the `readerFeatures` field. */ def supportsReaderFeatures(readerVersion: Int): Boolean = { readerVersion >= TABLE_FEATURES_MIN_READER_VERSION diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/actions/actions.scala b/spark/src/main/scala/org/apache/spark/sql/delta/actions/actions.scala index ada9854f4b5..390c8a021c2 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/actions/actions.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/actions/actions.scala @@ -142,13 +142,13 @@ case class Protocol private ( // Correctness check // Reader and writer versions must match the status of reader and writer features require( - supportsReaderFeatures == readerFeatures.isDefined, + (supportsReaderFeatures || canSupportColumnMappingFeature) == readerFeatures.isDefined, "Mismatched minReaderVersion and readerFeatures.") require( supportsWriterFeatures == writerFeatures.isDefined, "Mismatched minWriterVersion and writerFeatures.") - // When reader is on table features, writer must be on table features too + // When reader is on table features, writer must be on table features too. if (supportsReaderFeatures && !supportsWriterFeatures) { throw DeltaErrors.tableFeatureReadRequiresWriteException( TableFeatureProtocolUtils.TABLE_FEATURES_MIN_WRITER_VERSION) @@ -165,7 +165,7 @@ case class Protocol private ( */ @JsonIgnore lazy val simpleString: String = { - if (!supportsReaderFeatures && !supportsWriterFeatures) { + if (!supportsTableFeatures) { s"$minReaderVersion,$minWriterVersion" } else { val readerFeaturesStr = readerFeatures @@ -202,10 +202,12 @@ object Protocol { def apply( minReaderVersion: Int = Action.readerVersion, minWriterVersion: Int = Action.writerVersion): Protocol = { + val shouldAddReaderFeatures = supportsReaderFeatures(minReaderVersion) || + canSupportColumnMappingFeature(minReaderVersion, minWriterVersion) new Protocol( minReaderVersion = minReaderVersion, minWriterVersion = minWriterVersion, - readerFeatures = if (supportsReaderFeatures(minReaderVersion)) Some(Set()) else None, + readerFeatures = if (shouldAddReaderFeatures) Some(Set()) else None, writerFeatures = if (supportsWriterFeatures(minWriterVersion)) Some(Set()) else None) } @@ -213,7 +215,7 @@ object Protocol { def forTableFeature(tf: TableFeature): Protocol = { // Every table feature is a writer feature. 
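A hedged illustration of the new Protocol predicates; the values follow from the definitions above, where (2, 7) pairs the legacy column-mapping reader version with the table-features writer version:

val legacy = Protocol(2, 7)
legacy.supportsReaderFeatures           // false: cannot list arbitrary (native) reader features
legacy.canSupportColumnMappingFeature   // true:  may still list columnMapping in readerFeatures
legacy.supportsTableFeatures            // true:  writer version 7 makes it a table-features protocol

val full = Protocol(3, 7)
full.supportsReaderFeatures             // true:  native reader features are allowed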
val writerFeatures = tf.requiredFeatures + tf - val readerFeatures = writerFeatures.filter(f => f.isReaderWriterFeature && !f.isLegacyFeature) + val readerFeatures = writerFeatures.filter(_.isReaderWriterFeature) val writerFeaturesNames = writerFeatures.map(_.name) val readerFeaturesNames = readerFeatures.map(_.name) @@ -322,12 +324,12 @@ object Protocol { * * This function returns the protocol versions and features individually instead of a * [[Protocol]], so the caller can identify the features that caused the protocol version. For - * example, if the return values are (2, 5, columnMapping), the caller can safely ignore all - * other features required by the protocol with a reader and writer version of 2 and 5. + * example, if the return values are (2, 5, columnMapping + preceding features), the caller + * can safely ignore all other features required by the protocol with a reader and writer + * version of 2 and 5. * - * Note that this method does not consider protocol versions and features configured in session - * defaults. To make them effective, copy them to `metadata` using - * [[DeltaConfigs.mergeGlobalConfigs]]. + * Note that this method does not consider features configured in session defaults. + * To make them effective, copy them to `metadata` using [[DeltaConfigs.mergeGlobalConfigs]]. */ def minProtocolComponentsFromMetadata( spark: SparkSession, @@ -343,46 +345,11 @@ object Protocol { spark, metadata, Protocol().withFeatures(tablePropEnabledFeatures)) val allEnabledFeatures = tablePropEnabledFeatures ++ metaEnabledFeatures - // Determine the min reader and writer version required by features in table properties or - // metadata. - // If any table property is specified: - // we start from (3, 7) or (0, 7) depending on the existence of any writer-only feature. - // If there's no table property: - // if no feature is enabled or all features are legacy, we start from (0, 0); - // if any feature is native and is reader-writer, we start from (3, 7); - // otherwise we start from (0, 7) because there must exist a native writer-only feature. - var (readerVersionFromFeatures, writerVersionFromFeatures) = { - if (tablePropEnabledFeatures.exists(_.isReaderWriterFeature)) { - (TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION) - } else if (tablePropEnabledFeatures.nonEmpty) { - (0, TABLE_FEATURES_MIN_WRITER_VERSION) - } else if (metaEnabledFeatures.forall(_.isLegacyFeature)) { // also true for empty set - (0, 0) - } else if (metaEnabledFeatures.exists(f => !f.isLegacyFeature && f.isReaderWriterFeature)) { - (TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION) - } else { - (0, TABLE_FEATURES_MIN_WRITER_VERSION) - } - } - allEnabledFeatures.foreach { feature => - readerVersionFromFeatures = math.max(readerVersionFromFeatures, feature.minReaderVersion) - writerVersionFromFeatures = math.max(writerVersionFromFeatures, feature.minWriterVersion) - } - // Protocol version provided in table properties can upgrade the protocol, but only when they // are higher than which required by the enabled features. val (readerVersionFromTableConfOpt, writerVersionFromTableConfOpt) = getProtocolVersionsFromTableConf(tableConf) - // Decide the final protocol version: - // a. 1, aka the lowest version possible - // b. version required by manually enabled features and metadata features - // c. 
version defined as table properties - val finalReaderVersion = - Seq(1, readerVersionFromFeatures, readerVersionFromTableConfOpt.getOrElse(0)).max - val finalWriterVersion = - Seq(1, writerVersionFromFeatures, writerVersionFromTableConfOpt.getOrElse(0)).max - // If the user explicitly sets the table versions, we need to take into account the // relevant implicit features. val implicitFeaturesFromTableConf = @@ -399,7 +366,14 @@ object Protocol { case _ => Set.empty } - (finalReaderVersion, finalWriterVersion, allEnabledFeatures ++ implicitFeaturesFromTableConf) + // Construct the minimum required protocol for the enabled features. + val minProtocol = Protocol(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION) + .withFeatures(allEnabledFeatures ++ implicitFeaturesFromTableConf) + .normalized + + // Return the minimum protocol components. + (minProtocol.minReaderVersion, minProtocol.minWriterVersion, + minProtocol.implicitlyAndExplicitlySupportedFeatures) } /** @@ -450,6 +424,11 @@ object Protocol { (getReaderVersionFromTableConf(conf), getWriterVersionFromTableConf(conf)) } + def filterProtocolPropsFromTableProps(properties: Map[String, String]): Map[String, String] = + properties.filterNot { + case (k, _) => TableFeatureProtocolUtils.isTableProtocolProperty(k) + } + /** Assert a table metadata contains no protocol-related table properties. */ def assertMetadataContainsNoProtocolProps(metadata: Metadata): Unit = { assert( @@ -488,32 +467,12 @@ object Protocol { spark: SparkSession, metadata: Metadata, current: Protocol): Option[Protocol] = { - val (readerVersion, writerVersion, minRequiredFeatures) = - minProtocolComponentsFromAutomaticallyEnabledFeatures(spark, metadata, current) - - // If the user sets the protocol versions we need to take it account. In general, - // enabling legacy features on legacy protocols results to pumping up the protocol - // versions. However, setting table feature protocol versions while enabling - // legacy features results to only enabling the requested features. For example: - // 1) Create table with (1, 2), then ALTER TABLE with DeltaConfigs.CHANGE_DATA_FEED.key = true - // results to (1, 4). - // 2) Alternatively, Create table with (1, 2), then - // ALTER TABLE set versions (1, 7) and DeltaConfigs.CHANGE_DATA_FEED.key = true results - // to (1, 7, AppendOnly, Invariants, CDF). - val readerVersionFromConf = - Protocol.getReaderVersionFromTableConf(metadata.configuration).getOrElse(readerVersion) - val writerVersionFromConf = - Protocol.getWriterVersionFromTableConf(metadata.configuration).getOrElse(writerVersion) - - val finalReaderVersion = - Seq(readerVersion, readerVersionFromConf, current.minReaderVersion).max - val finalWriterVersion = - Seq(writerVersion, writerVersionFromConf, current.minWriterVersion).max - - // Increment the reader and writer version to accurately add enabled legacy table features - // either to the implicitly enabled table features or the table feature lists. + val required = - Protocol(finalReaderVersion, finalWriterVersion).withFeatures(minRequiredFeatures) + Protocol(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION) + .withFeatures(extractAutomaticallyEnabledFeatures(spark, metadata, current)) + .normalized + if (!required.canUpgradeTo(current)) { // When the current protocol does not satisfy metadata requirement, some additional features // must be supported by the protocol. 
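A hedged sketch of the simplification above: the minimum protocol components are now derived by building a (3, 7) protocol with every required feature and normalizing it to its weakest equivalent form, instead of computing versions by hand. The feature names are examples:

val minProtocol = Protocol(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION)
  .withFeatures(Set(AppendOnlyTableFeature, InvariantsTableFeature))
  .normalized
// Both features are legacy and implied by protocol (1, 2), so normalization is expected to
// collapse the result back to minReaderVersion = 1, minWriterVersion = 2.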
We assert those features can actually perform the diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala b/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala index 0e7e3ae6b28..8be72d04ed4 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaCatalog.scala @@ -152,8 +152,19 @@ class DeltaCatalog extends DelegatingCatalogExtension .getOrElse(spark.sessionState.catalog.defaultTablePath(id)) val storage = DataSource.buildStorageFormatFromOptions(writeOptions) .copy(locationUri = Option(loc)) - val tableType = - if (location.isDefined) CatalogTableType.EXTERNAL else CatalogTableType.MANAGED + // PROP_IS_MANAGED_LOCATION indicates that the table location is not user-specified but + // system-generated. The table should be created as managed table in this case. + val isManagedLocation = Option(allTableProperties.get(TableCatalog.PROP_IS_MANAGED_LOCATION)) + .exists(_.equalsIgnoreCase("true")) + // Note: Spark generates the table location for managed tables in + // `DeltaCatalog#delegate#createTable`, so `isManagedLocation` should never be true if + // Unity Catalog is not involved. For safety we also check `isUnityCatalog` here. + val respectManagedLoc = isUnityCatalog || org.apache.spark.util.Utils.isTesting + val tableType = if (location.isEmpty || (isManagedLocation && respectManagedLoc)) { + CatalogTableType.MANAGED + } else { + CatalogTableType.EXTERNAL + } val commentOpt = Option(allTableProperties.get("comment")) @@ -336,12 +347,24 @@ class DeltaCatalog extends DelegatingCatalogExtension properties: util.Map[String, String]) : Table = recordFrameProfile("DeltaCatalog", "createTable") { if (DeltaSourceUtils.isDeltaDataSourceName(getProvider(properties))) { + // TODO: we should extract write options from table properties for all the cases. We + // can remove the UC check when we have confidence. + val respectOptions = isUnityCatalog || properties.containsKey("test.simulateUC") + val (props, writeOptions) = if (respectOptions) { + val (props, writeOptions) = getTablePropsAndWriteOptions(properties) + expandTableProps(props, writeOptions, spark.sessionState.conf) + props.remove("test.simulateUC") + (props, writeOptions) + } else { + (properties, Map.empty[String, String]) + } + createDeltaTable( ident, schema, partitions, - properties, - Map.empty, + props, + writeOptions, sourceQuery = None, TableCreationModes.Create ) @@ -512,6 +535,44 @@ class DeltaCatalog extends DelegatingCatalogExtension } } + private def getTablePropsAndWriteOptions(properties: util.Map[String, String]) + : (util.Map[String, String], Map[String, String]) = { + val props = new util.HashMap[String, String]() + // Options passed in through the SQL API will show up both with an "option." 
prefix and + // without in Spark 3.1, so we need to remove those from the properties + val optionsThroughProperties = properties.asScala.collect { + case (k, _) if k.startsWith(TableCatalog.OPTION_PREFIX) => + k.stripPrefix(TableCatalog.OPTION_PREFIX) + }.toSet + val writeOptions = new util.HashMap[String, String]() + properties.asScala.foreach { case (k, v) => + if (!k.startsWith(TableCatalog.OPTION_PREFIX) && !optionsThroughProperties.contains(k)) { + // Add to properties + props.put(k, v) + } else if (optionsThroughProperties.contains(k)) { + writeOptions.put(k, v) + } + } + (props, writeOptions.asScala.toMap) + } + + private def expandTableProps( + props: util.Map[String, String], + options: Map[String, String], + conf: SQLConf): Unit = { + if (conf.getConf(DeltaSQLConf.DELTA_LEGACY_STORE_WRITER_OPTIONS_AS_PROPS)) { + // Legacy behavior + options.foreach { case (k, v) => props.put(k, v) } + } else { + options.foreach { case (k, v) => + // Continue putting in Delta prefixed options to avoid breaking workloads + if (k.toLowerCase(Locale.ROOT).startsWith("delta.")) { + props.put(k, v) + } + } + } + } + /** * A staged delta table, which creates a HiveMetaStore entry and appends data if this was a * CTAS/RTAS command. We have a ugly way of using this API right now, but it's the best way to @@ -533,35 +594,11 @@ class DeltaCatalog extends DelegatingCatalogExtension override def commitStagedChanges(): Unit = recordFrameProfile( "DeltaCatalog", "commitStagedChanges") { val conf = spark.sessionState.conf - val props = new util.HashMap[String, String]() - // Options passed in through the SQL API will show up both with an "option." prefix and - // without in Spark 3.1, so we need to remove those from the properties - val optionsThroughProperties = properties.asScala.collect { - case (k, _) if k.startsWith("option.") => k.stripPrefix("option.") - }.toSet - val sqlWriteOptions = new util.HashMap[String, String]() - properties.asScala.foreach { case (k, v) => - if (!k.startsWith("option.") && !optionsThroughProperties.contains(k)) { - // Do not add to properties - props.put(k, v) - } else if (optionsThroughProperties.contains(k)) { - sqlWriteOptions.put(k, v) - } - } - if (writeOptions.isEmpty && !sqlWriteOptions.isEmpty) { - writeOptions = sqlWriteOptions.asScala.toMap - } - if (conf.getConf(DeltaSQLConf.DELTA_LEGACY_STORE_WRITER_OPTIONS_AS_PROPS)) { - // Legacy behavior - writeOptions.foreach { case (k, v) => props.put(k, v) } - } else { - writeOptions.foreach { case (k, v) => - // Continue putting in Delta prefixed options to avoid breaking workloads - if (k.toLowerCase(Locale.ROOT).startsWith("delta.")) { - props.put(k, v) - } - } + val (props, sqlWriteOptions) = getTablePropsAndWriteOptions(properties) + if (writeOptions.isEmpty && sqlWriteOptions.nonEmpty) { + writeOptions = sqlWriteOptions } + expandTableProps(props, writeOptions, conf) createDeltaTable( ident, schema, diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaTableV2.scala b/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaTableV2.scala index 82821b4cb46..15c6fd375c9 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaTableV2.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/catalog/DeltaTableV2.scala @@ -96,7 +96,7 @@ case class DeltaTableV2( // as Unity Catalog may add more table storage properties on the fly. We should respect it // and merge the table storage properties and Delta options. 
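To make the new DeltaCatalog option handling above easier to follow, here is a self-contained sketch of the same splitting logic. `splitPropsAndOptions` is a hypothetical name used only for illustration; `TableCatalog.OPTION_PREFIX` is the Spark constant referenced in the hunk.

```scala
import java.util.{HashMap => JHashMap, Map => JMap}
import scala.collection.JavaConverters._
import org.apache.spark.sql.connector.catalog.TableCatalog

// Keys that also appear with the "option." prefix are write options: the unprefixed copy goes
// into the options map, the prefixed duplicate is dropped, and everything else stays a table
// property. This mirrors getTablePropsAndWriteOptions above.
def splitPropsAndOptions(
    properties: JMap[String, String]): (JMap[String, String], Map[String, String]) = {
  val optionKeys = properties.asScala.keySet
    .filter(_.startsWith(TableCatalog.OPTION_PREFIX))
    .map(_.stripPrefix(TableCatalog.OPTION_PREFIX))
  val props = new JHashMap[String, String]()
  val writeOptions = new JHashMap[String, String]()
  properties.asScala.foreach { case (k, v) =>
    if (optionKeys.contains(k)) writeOptions.put(k, v)
    else if (!k.startsWith(TableCatalog.OPTION_PREFIX)) props.put(k, v)
  }
  (props, writeOptions.asScala.toMap)
}

// Example: {"option.mergeSchema" -> "true", "mergeSchema" -> "true", "comment" -> "demo"}
// yields props = {comment -> demo} and writeOptions = {mergeSchema -> true}.
```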
val dataSourceOptions = if (catalogTable.isDefined) { - // To be safe, here we only extra file system options from table storage properties and + // To be safe, here we only extract file system options from table storage properties and // the original `options` has higher priority than the table storage properties. val fileSystemOptions = catalogTable.get.storage.properties.filter { case (k, _) => DeltaTableUtils.validDeltaTableHadoopPrefixes.exists(k.startsWith) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/ConvertToDeltaCommand.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/ConvertToDeltaCommand.scala index 3dacf2712ea..213651f6bb4 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/ConvertToDeltaCommand.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/ConvertToDeltaCommand.scala @@ -376,13 +376,7 @@ abstract class ConvertToDeltaCommandBase( createdTime = Some(System.currentTimeMillis())) txn.updateMetadataForNewTable(metadata) - // TODO: we have not decided on how to implement CONVERT TO DELTA under column mapping modes - // for some convert targets so we block this feature for them here - checkColumnMapping(txn.metadata, targetTable) - RowTracking.checkStatsCollectedIfRowTrackingSupported( - txn.protocol, - collectStats, - statsEnabled) + checkConversionIsAllowed(txn, targetTable) val numFiles = targetTable.numFiles val addFilesIter = createDeltaActions(spark, manifest, partitionFields, txn, fs) @@ -442,6 +436,18 @@ abstract class ConvertToDeltaCommandBase( } } + /** Check if the conversion is allowed. */ + private def checkConversionIsAllowed( + txn: OptimisticTransaction, + targetTable: ConvertTargetTable): Unit = { + // TODO: we have not decided on how to implement CONVERT TO DELTA under column mapping modes + // for some convert targets so we block this feature for them here + checkColumnMapping(txn.metadata, targetTable) + RowTracking.checkStatsCollectedIfRowTrackingSupported( + txn.protocol, + collectStats, + statsEnabled) + } } case class ConvertToDeltaCommand( diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/CreateDeltaTableCommand.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/CreateDeltaTableCommand.scala index dfe3d896284..644edbe1350 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/CreateDeltaTableCommand.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/CreateDeltaTableCommand.scala @@ -22,7 +22,7 @@ import java.util.concurrent.TimeUnit import org.apache.spark.sql.delta.skipping.clustering.ClusteredTableUtils import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.DeltaColumnMapping.{dropColumnMappingMetadata, filterColumnMappingProperties} -import org.apache.spark.sql.delta.actions.{Action, Metadata, Protocol} +import org.apache.spark.sql.delta.actions.{Action, Metadata, Protocol, TableFeatureProtocolUtils} import org.apache.spark.sql.delta.actions.DomainMetadata import org.apache.spark.sql.delta.commands.DMLUtils.TaggedCommitData import org.apache.spark.sql.delta.coordinatedcommits.CoordinatedCommitsUtils @@ -134,9 +134,14 @@ case class CreateDeltaTableCommand( } val tableLocation = getDeltaTablePath(tableWithLocation) - val deltaLog = DeltaLog.forTable(sparkSession, tableLocation) - CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurations( - sparkSession, deltaLog, query, tableWithLocation.properties) + // To be safe, here we only extract file system options from table storage properties, to 
create + // the DeltaLog. + val fileSystemOptions = table.storage.properties.filter { case (k, _) => + DeltaTableUtils.validDeltaTableHadoopPrefixes.exists(k.startsWith) + } + val deltaLog = DeltaLog.forTable(sparkSession, tableLocation, fileSystemOptions) + CoordinatedCommitsUtils.validateConfigurationsForCreateDeltaTableCommand( + sparkSession, deltaLog.tableExists, query, tableWithLocation.properties) recordDeltaOperation(deltaLog, "delta.ddl.createTable") { val result = handleCommit(sparkSession, deltaLog, tableWithLocation) @@ -541,6 +546,10 @@ case class CreateDeltaTableCommand( // internal column mapping properties for the sake of comparison. var filteredTableProperties = filterColumnMappingProperties( tableDesc.properties) + // We also need to remove any protocol-related properties as we're filtering these + // from the metadata so they won't be present in the table properties. + filteredTableProperties = + Protocol.filterProtocolPropsFromTableProps(filteredTableProperties) var filteredExistingProperties = filterColumnMappingProperties( existingMetadata.configuration) // Clustered table has internal table properties in Metadata configurations and they are @@ -723,8 +732,12 @@ case class CreateDeltaTableCommand( if (txn.readVersion > -1L && isReplace && !dontOverwriteSchema) { // When a table already exists, and we're using the DataFrameWriterV2 API to replace // or createOrReplace a table, we blindly overwrite the metadata. - val newMetadata = getProvidedMetadata(table, schema.json) - txn.updateMetadataForNewTable(newMetadata) + var newMetadata = getProvidedMetadata(table, schema.json) + val updatedConfig = UniversalFormat.enforceDependenciesInConfiguration( + newMetadata.configuration, + txn.snapshot) + newMetadata = newMetadata.copy(configuration = updatedConfig) + txn.updateMetadataForNewTableInReplace(newMetadata) } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/MergeIntoCommandBase.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/MergeIntoCommandBase.scala index 6ec9adbad0f..7b035f30979 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/MergeIntoCommandBase.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/MergeIntoCommandBase.scala @@ -65,6 +65,8 @@ trait MergeIntoCommandBase extends LeafRunnableCommand DeletionVectorUtils.deletionVectorsWritable(txn.snapshot) } + override protected val supportMergeAndUpdateLegacyCastBehavior: Boolean = true + override val (canMergeSchema, canOverwriteSchema) = { // Delta options can't be passed to MERGE INTO currently, so they'll always be empty. 
// The methods in options check if user has instructed to turn on schema evolution for this diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/VacuumCommand.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/VacuumCommand.scala index ab1242248aa..24113d85765 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/VacuumCommand.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/VacuumCommand.scala @@ -178,6 +178,11 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { None } } + .map { f => + // Below logic will make paths url-encoded + SerializableFileStatus(pathStringtoUrlEncodedString(f.path), f.length, f.isDir, + f.modificationTime) + } } /** @@ -273,6 +278,11 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { ), fileListingParallelism = Option(parallelism) ) + .map { f => + // Below logic will make paths url-encoded + SerializableFileStatus(pathStringtoUrlEncodedString(f.path), f.length, f.isDir, + f.modificationTime) + } } val allFilesAndDirs = allFilesAndDirsWithDuplicates.groupByKey(_.path) .mapGroups { (k, v) => @@ -299,6 +309,7 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { // 5. We subtract all the valid files and tombstones in our state // 6. We filter all paths with a count of 1, which will correspond to files not in the // state, and empty directories. We can safely delete all of these + val canonicalizedBasePath = SparkPath.fromPathString(basePath).urlEncoded val diff = allFilesAndDirs .where(col("modificationTime") < deleteBeforeTimestamp || col("isDir")) .mapPartitions { fileStatusIterator => @@ -307,16 +318,18 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { fileStatusIterator.flatMap { fileStatus => if (fileStatus.isDir) { Iterator.single(FileNameAndSize( - relativize(fileStatus.getHadoopPath, fs, reservoirBase, isDir = true), 0L)) + relativize(urlEncodedStringToPath(fileStatus.path), fs, + reservoirBase, isDir = true), 0L)) } else { - val dirs = getAllSubdirs(basePath, fileStatus.path, fs) + val dirs = getAllSubdirs(canonicalizedBasePath, fileStatus.path, fs) val dirsWithSlash = dirs.map { p => - val relativizedPath = relativize(new Path(p), fs, reservoirBase, isDir = true) + val relativizedPath = relativize(urlEncodedStringToPath(p), fs, + reservoirBase, isDir = true) FileNameAndSize(relativizedPath, 0L) } dirsWithSlash ++ Iterator( FileNameAndSize(relativize( - fileStatus.getHadoopPath, fs, reservoirBase, isDir = false), + urlEncodedStringToPath(fileStatus.path), fs, reservoirBase, isDir = false), fileStatus.length)) } } @@ -337,9 +350,9 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { .select(col("path")) .as[String] .map { relativePath => - assert(!stringToPath(relativePath).isAbsolute, + assert(!urlEncodedStringToPath(relativePath).isAbsolute, "Shouldn't have any absolute paths for deletion here.") - pathToString(DeltaFileOperations.absolutePath(basePath, relativePath)) + pathToUrlEncodedString(DeltaFileOperations.absolutePath(basePath, relativePath)) } val timeTakenToIdentifyEligibleFiles = System.currentTimeMillis() - startTimeToIdentifyEligibleFiles @@ -369,7 +382,7 @@ object VacuumCommand extends VacuumCommandImpl with Serializable { log"a total of ${MDC(DeltaLogKeys.NUM_DIRS, dirCounts)} directories " + log"that are safe to delete. 
Vacuum stats: ${MDC(DeltaLogKeys.STATS, stats)}") - return diffFiles.map(f => stringToPath(f).toString).toDF("path") + return diffFiles.map(f => urlEncodedStringToPath(f).toString).toDF("path") } logVacuumStart( spark, @@ -574,7 +587,7 @@ trait VacuumCommandImpl extends DeltaCommand { fs: FileSystem, reservoirBase: Path, isDir: Boolean): String = { - pathToString(DeltaFileOperations.tryRelativizePath(fs, reservoirBase, path)) + pathToUrlEncodedString(DeltaFileOperations.tryRelativizePath(fs, reservoirBase, path)) } /** @@ -601,21 +614,22 @@ trait VacuumCommandImpl extends DeltaCommand { diff.repartition(parallelPartitions).mapPartitions { files => val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) val filesDeletedPerPartition = - files.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) + files.map(p => urlEncodedStringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) Iterator(filesDeletedPerPartition) }.collect().sum } else { val fs = new Path(basePath).getFileSystem(hadoopConf.value.value) val fileResultSet = diff.toLocalIterator().asScala - fileResultSet.map(p => stringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) + fileResultSet.map(p => urlEncodedStringToPath(p)).count(f => tryDeleteNonRecursive(fs, f)) } } - // scalastyle:off pathfromuri - protected def stringToPath(path: String): Path = new Path(new URI(path)) - // scalastyle:on pathfromuri + protected def urlEncodedStringToPath(path: String): Path = SparkPath.fromUrlString(path).toPath + + protected def pathToUrlEncodedString(path: Path): String = SparkPath.fromPath(path).toString - protected def pathToString(path: Path): String = path.toUri.toString + protected def pathStringtoUrlEncodedString(path: String) = + SparkPath.fromPathString(path).toString /** Returns the relative path of a file action or None if the file lives outside of the table. */ protected def getActionRelativePath( @@ -631,7 +645,7 @@ trait VacuumCommandImpl extends DeltaCommand { fs: FileSystem, basePath: Path, relativizeIgnoreError: Boolean): Option[String] = { - val filePath = stringToPath(path) + val filePath = urlEncodedStringToPath(path) if (filePath.isAbsolute) { val maybeRelative = DeltaFileOperations.tryRelativizePath(fs, basePath, filePath, relativizeIgnoreError) @@ -639,10 +653,10 @@ trait VacuumCommandImpl extends DeltaCommand { // This file lives outside the directory of the table. None } else { - Some(pathToString(maybeRelative)) + Some(pathToUrlEncodedString(maybeRelative)) } } else { - Some(pathToString(filePath)) + Some(pathToUrlEncodedString(filePath)) } } @@ -686,7 +700,7 @@ trait VacuumCommandImpl extends DeltaCommand { case Some(dv) if dv.isOnDisk => if (dv.isRelative) { // We actually want a relative path here. - Some((pathToString(dv.absolutePath(new Path("."))), dv.sizeInBytes)) + Some((pathToUrlEncodedString(dv.absolutePath(new Path("."))), dv.sizeInBytes)) } else { assert(dv.isAbsolute) // This is never going to be a path relative to `basePath` for DVs. diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDelta.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDelta.scala index 44476ec68a9..6ef903499c3 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDelta.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDelta.scala @@ -153,7 +153,8 @@ case class WriteIntoDelta( // If READ_SIDE_CHAR_PADDING is not enabled, CHAR type is the same as VARCHAR. 
The change // below makes DESC TABLE to show VARCHAR instead of CHAR. CharVarcharUtils.replaceCharVarcharWithStringInSchema( - replaceCharWithVarchar(CharVarcharUtils.getRawSchema(data.schema)).asInstanceOf[StructType]) + CharVarcharUtils.replaceCharWithVarchar(CharVarcharUtils.getRawSchema(data.schema)) + .asInstanceOf[StructType]) } val finalSchema = schemaInCatalog.getOrElse(dataSchema) if (txn.metadata.schemaString != null) { diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDeltaLike.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDeltaLike.scala index 06350e1b283..3ca82ca2464 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDeltaLike.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/WriteIntoDeltaLike.scala @@ -153,20 +153,6 @@ trait WriteIntoDeltaLike { txn.registerSQLMetrics(spark, sqlMetrics) } - import org.apache.spark.sql.types.{ArrayType, CharType, DataType, MapType, VarcharType} - protected def replaceCharWithVarchar(dt: DataType): DataType = dt match { - case ArrayType(et, nullable) => - ArrayType(replaceCharWithVarchar(et), nullable) - case MapType(kt, vt, nullable) => - MapType(replaceCharWithVarchar(kt), replaceCharWithVarchar(vt), nullable) - case StructType(fields) => - StructType(fields.map { field => - field.copy(dataType = replaceCharWithVarchar(field.dataType)) - }) - case CharType(length) => VarcharType(length) - case _ => dt - } - protected def extractConstraints( sparkSession: SparkSession, expr: Seq[Expression]): Seq[Constraint] = { diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/alterDeltaTableCommands.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/alterDeltaTableCommands.scala index fa36f177382..ee671af7b0e 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/alterDeltaTableCommands.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/alterDeltaTableCommands.scala @@ -31,6 +31,7 @@ import org.apache.spark.sql.delta.catalog.DeltaTableV2 import org.apache.spark.sql.delta.commands.backfill.RowTrackingBackfillCommand import org.apache.spark.sql.delta.commands.columnmapping.RemoveColumnMappingCommand import org.apache.spark.sql.delta.constraints.{CharVarcharConstraint, Constraints} +import org.apache.spark.sql.delta.coordinatedcommits.CoordinatedCommitsUtils import org.apache.spark.sql.delta.logging.DeltaLogKeys import org.apache.spark.sql.delta.schema.{SchemaMergingUtils, SchemaUtils} import org.apache.spark.sql.delta.schema.SchemaUtils.transformSchema @@ -161,6 +162,8 @@ case class AlterTableSetPropertiesDeltaCommand( true }.toMap + CoordinatedCommitsUtils.validateConfigurationsForAlterTableSetPropertiesDeltaCommand( + metadata.configuration, filteredConfs) val newMetadata = metadata.copy( description = configuration.getOrElse(TableCatalog.PROP_COMMENT, metadata.description), configuration = metadata.configuration ++ filteredConfs) @@ -193,7 +196,8 @@ case class AlterTableSetPropertiesDeltaCommand( case class AlterTableUnsetPropertiesDeltaCommand( table: DeltaTableV2, propKeys: Seq[String], - ifExists: Boolean) + ifExists: Boolean, + fromDropFeatureCommand: Boolean = false) extends LeafRunnableCommand with AlterDeltaTableCommand with IgnoreCachedData { override def run(sparkSession: SparkSession): Seq[Row] = { @@ -223,6 +227,10 @@ case class AlterTableUnsetPropertiesDeltaCommand( } } + if (!fromDropFeatureCommand) { + 
CoordinatedCommitsUtils.validateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + metadata.configuration, normalizedKeys) + } val newConfiguration = metadata.configuration.filterNot { case (key, _) => normalizedKeys.contains(key) } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/backfill/BackfillCommand.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/backfill/BackfillCommand.scala index b0258add0d1..8254aaa7109 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/backfill/BackfillCommand.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/backfill/BackfillCommand.scala @@ -60,6 +60,7 @@ trait BackfillCommand extends LeafRunnableCommand with DeltaCommand { txn.executionObserver.preparingCommit() txn.executionObserver.beginDoCommit() txn.executionObserver.beginBackfill() + txn.executionObserver.beginPostCommit() val maxNumFilesPerCommit = spark.conf.get(DeltaSQLConf.DELTA_BACKFILL_MAX_NUM_FILES_PER_COMMIT) val metricsOpType = "delta.backfill.materialization.trackerMetrics" diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/commands/cdc/CDCReader.scala b/spark/src/main/scala/org/apache/spark/sql/delta/commands/cdc/CDCReader.scala index 9f3f85eab32..ae2d9a61f53 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/commands/cdc/CDCReader.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/commands/cdc/CDCReader.scala @@ -36,13 +36,13 @@ import org.apache.spark.internal.MDC import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Dataset, Row, SparkSession, SQLContext} import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Literal} +import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, Expression, Literal} import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes import org.apache.spark.sql.execution.LogicalRDD import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan} +import org.apache.spark.sql.sources.{BaseRelation, CatalystScan, Filter} import org.apache.spark.sql.types.{LongType, StringType, StructType, TimestampType} import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -113,7 +113,7 @@ object CDCReader extends CDCReaderImpl snapshotWithSchemaMode: SnapshotWithSchemaMode, sqlContext: SQLContext, startingVersion: Option[Long], - endingVersion: Option[Long]) extends BaseRelation with PrunedFilteredScan { + endingVersion: Option[Long]) extends BaseRelation with CatalystScan { private val deltaLog = snapshotWithSchemaMode.snapshot.deltaLog @@ -152,7 +152,7 @@ object CDCReader extends CDCReaderImpl override def unhandledFilters(filters: Array[Filter]): Array[Filter] = Array.empty - override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { + override def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row] = { val df = changesToBatchDF( deltaLog, startingVersion.get, @@ -163,8 +163,18 @@ object CDCReader extends CDCReaderImpl sqlContext.sparkSession, readSchemaSnapshot = Some(snapshotForBatchSchema)) - val filter = Column(DeltaSourceUtils.translateFilters(filters)) - val projections = requiredColumns.map(SchemaUtils.fieldNameToColumn) + // Rewrite the attributes in the 
required columns and pushed down filters to match the output + // of the internal DataFrame. + val outputMap = df.queryExecution.analyzed.output.map(a => a.name -> a).toMap + val projections = + requiredColumns.map(a => Column(a.withExprId(outputMap(a.name).exprId))) + val filter = Column( + filters + .map(_.transform { case a: Attribute => a.withExprId(outputMap(a.name).exprId) }) + .reduceOption(And) + .getOrElse(Literal.TrueLiteral) + ) + df.filter(filter).select(projections: _*).rdd } } @@ -406,7 +416,7 @@ trait CDCReaderImpl extends DeltaLogging { spark.sqlContext, startingVersion = None, endingVersion = None) { - override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = + override def buildScan(requiredColumns: Seq[Attribute], filters: Seq[Expression]): RDD[Row] = sqlContext.sparkSession.sparkContext.emptyRDD[Row] } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/AbstractBatchBackfillingCommitCoordinatorClient.scala b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/AbstractBatchBackfillingCommitCoordinatorClient.scala index 3737a27a57b..a0b83781553 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/AbstractBatchBackfillingCommitCoordinatorClient.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/AbstractBatchBackfillingCommitCoordinatorClient.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.delta.coordinatedcommits import java.nio.file.FileAlreadyExistsException -import java.util.UUID +import java.util.{Optional, UUID} import scala.collection.JavaConverters._ @@ -28,7 +28,7 @@ import org.apache.spark.sql.delta.actions.Metadata import org.apache.spark.sql.delta.logging.DeltaLogKeys import org.apache.spark.sql.delta.util.FileNames import io.delta.storage.LogStore -import io.delta.storage.commit.{CommitCoordinatorClient, CommitFailedException => JCommitFailedException, CommitResponse, UpdatedActions} +import io.delta.storage.commit.{CommitCoordinatorClient, CommitFailedException => JCommitFailedException, CommitResponse, TableDescriptor, TableIdentifier, UpdatedActions} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} @@ -64,11 +64,11 @@ trait AbstractBatchBackfillingCommitCoordinatorClient override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { + val logPath = tableDesc.getLogPath val executionObserver = TransactionExecutionObserver.getObserver val tablePath = CoordinatedCommitsUtils.getTablePath(logPath) if (commitVersion == 0) { @@ -86,8 +86,7 @@ trait AbstractBatchBackfillingCommitCoordinatorClient backfillToVersion( logStore, hadoopConf, - logPath, - coordinatedCommitsTableConf, + tableDesc, commitVersion - 1, null) } @@ -103,7 +102,7 @@ trait AbstractBatchBackfillingCommitCoordinatorClient logStore, hadoopConf, logPath, - coordinatedCommitsTableConf.asScala.toMap, + tableDesc.getTableConf.asScala.toMap, commitVersion, fileStatus, commitTimestamp) @@ -122,13 +121,7 @@ trait AbstractBatchBackfillingCommitCoordinatorClient logInfo(log"Making sure commits are backfilled till " + log"${MDC(DeltaLogKeys.VERSION, commitVersion)} " + log"version for table ${MDC(DeltaLogKeys.PATH, tablePath.toString)}") - backfillToVersion( - logStore, - hadoopConf, - logPath, - 
coordinatedCommitsTableConf, - commitVersion, - null) + backfillToVersion(logStore, hadoopConf, tableDesc, commitVersion, null) } logInfo(log"Commit ${MDC(DeltaLogKeys.VERSION, commitVersion)} done successfully on table " + log"${MDC(DeltaLogKeys.PATH, tablePath)}") @@ -150,10 +143,10 @@ trait AbstractBatchBackfillingCommitCoordinatorClient override def backfillToVersion( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersionOpt: java.lang.Long): Unit = { + val logPath = tableDesc.getLogPath // Confirm the last backfilled version by checking the backfilled delta file's existence. val validLastKnownBackfilledVersionOpt = Option(lastKnownBackfilledVersionOpt) .filter { version => @@ -161,7 +154,7 @@ trait AbstractBatchBackfillingCommitCoordinatorClient fs.exists(FileNames.unsafeDeltaFile(logPath, version)) } val startVersionOpt: Long = validLastKnownBackfilledVersionOpt.map(_ + 1).map(Long.box).orNull - getCommits(logPath, coordinatedCommitsTableConf, startVersionOpt, version) + getCommits(tableDesc, startVersionOpt, version) .getCommits.asScala .foreach { commit => backfill(logStore, hadoopConf, logPath, commit.getVersion, commit.getFileStatus) diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsUtils.scala b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsUtils.scala index 5a396ad7d36..435d5306396 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsUtils.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsUtils.scala @@ -16,6 +16,8 @@ package org.apache.spark.sql.delta.coordinatedcommits +import java.util.Optional + import scala.collection.JavaConverters._ import scala.util.control.NonFatal @@ -27,13 +29,14 @@ import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.util.{FileNames, JsonUtils} import org.apache.spark.sql.delta.util.FileNames.{BackfilledDeltaFile, CompactedDeltaFile, DeltaFile, UnbackfilledDeltaFile} import io.delta.storage.LogStore -import io.delta.storage.commit.{CommitCoordinatorClient, GetCommitsResponse => JGetCommitsResponse} +import io.delta.storage.commit.{CommitCoordinatorClient, GetCommitsResponse => JGetCommitsResponse, TableIdentifier} import io.delta.storage.commit.actions.AbstractMetadata import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.spark.internal.MDC import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{TableIdentifier => CatalystTableIdentifier} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.util.Utils @@ -46,6 +49,7 @@ object CoordinatedCommitsUtils extends DeltaLogging { def getCommitsFromCommitCoordinatorWithUsageLogs( deltaLog: DeltaLog, tableCommitCoordinatorClient: TableCommitCoordinatorClient, + tableIdentifierOpt: Option[CatalystTableIdentifier], startVersion: Long, versionToLoad: Option[Long], isAsyncRequest: Boolean): JGetCommitsResponse = { @@ -66,7 +70,8 @@ object CoordinatedCommitsUtils extends DeltaLogging { try { val response = - tableCommitCoordinatorClient.getCommits(Some(startVersion), endVersion = versionToLoad) + tableCommitCoordinatorClient.getCommits( + tableIdentifierOpt, Some(startVersion), endVersion = versionToLoad) val additionalEventData = Map( 
"responseCommitsSize" -> response.getCommits.size, "responseLatestTableVersion" -> response.getLatestTableVersion) @@ -274,11 +279,21 @@ object CoordinatedCommitsUtils extends DeltaLogging { DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_CONF, DeltaConfigs.COORDINATED_COMMITS_TABLE_CONF) + val ICT_TABLE_PROPERTY_CONFS = Seq( + DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED, + DeltaConfigs.IN_COMMIT_TIMESTAMP_ENABLEMENT_VERSION, + DeltaConfigs.IN_COMMIT_TIMESTAMP_ENABLEMENT_TIMESTAMP) + /** * The main table properties used to instantiate a TableCommitCoordinatorClient. */ val TABLE_PROPERTY_KEYS: Seq[String] = TABLE_PROPERTY_CONFS.map(_.key) + /** + * The main ICT table properties used as dependencies for Coordinated Commits. + */ + val ICT_TABLE_PROPERTY_KEYS: Seq[String] = ICT_TABLE_PROPERTY_CONFS.map(_.key) + /** * Returns true if any CoordinatedCommits-related table properties is present in the metadata. */ @@ -369,9 +384,14 @@ object CoordinatedCommitsUtils extends DeltaLogging { */ def extractCoordinatedCommitsConfigurations( properties: Map[String, String]): Map[String, String] = { - properties.filter { case (k, _) => - CoordinatedCommitsUtils.TABLE_PROPERTY_KEYS.contains(k) - } + properties.filter { case (k, _) => TABLE_PROPERTY_KEYS.contains(k) } + } + + /** + * Extracts the ICT configurations from the provided properties. + */ + def extractICTConfigurations(properties: Map[String, String]): Map[String, String] = { + properties.filter { case (k, _) => ICT_TABLE_PROPERTY_KEYS.contains(k) } } /** @@ -440,16 +460,88 @@ object CoordinatedCommitsUtils extends DeltaLogging { } } + /** + * Verifies that the property keys do not contain any ICT dependencies for Coordinated Commits. + */ + private def verifyNotContainsICTConfigurations( + propKeys: Seq[String], command: String, errorClass: String): Unit = { + ICT_TABLE_PROPERTY_KEYS.foreach { key => + if (propKeys.contains(key)) { + throw new DeltaIllegalArgumentException( + errorClass, + messageParameters = Array(command)) + } + } + } + + /** + * Validates the Coordinated Commits configurations in explicit command overrides for + * `AlterTableSetPropertiesDeltaCommand`. + * + * If the table already has Coordinated Commits configurations present, then we do not allow + * users to override them via `ALTER TABLE t SET TBLPROPERTIES ...`. Users must downgrade the + * table and then upgrade it with the new Coordinated Commits configurations. + * If the table is a Coordinated Commits table or will be one via this ALTER command, then we + * do not allow users to disable any ICT properties that Coordinated Commits depends on. 
+ */ + def validateConfigurationsForAlterTableSetPropertiesDeltaCommand( + existingConfs: Map[String, String], + propertyOverrides: Map[String, String]): Unit = { + val existingCoordinatedCommitsConfs = extractCoordinatedCommitsConfigurations(existingConfs) + val coordinatedCommitsOverrides = extractCoordinatedCommitsConfigurations(propertyOverrides) + if (coordinatedCommitsOverrides.nonEmpty) { + if (existingCoordinatedCommitsConfs.nonEmpty) { + throw new DeltaIllegalArgumentException( + "DELTA_CANNOT_OVERRIDE_COORDINATED_COMMITS_CONFS", + Array("ALTER")) + } + verifyNotContainsICTConfigurations(propertyOverrides.keys.toSeq, command = "ALTER", + errorClass = "DELTA_CANNOT_SET_COORDINATED_COMMITS_DEPENDENCIES") + verifyContainsOnlyCoordinatorNameAndConf( + coordinatedCommitsOverrides, command = "ALTER", fromDefault = false) + } + if (existingCoordinatedCommitsConfs.nonEmpty) { + verifyNotContainsICTConfigurations(propertyOverrides.keys.toSeq, command = "ALTER", + errorClass = "DELTA_CANNOT_MODIFY_COORDINATED_COMMITS_DEPENDENCIES") + } + } + + /** + * Validates the configurations to unset for `AlterTableUnsetPropertiesDeltaCommand`. + * + * If the table already has Coordinated Commits configurations present, then we do not allow users + * to unset them via `ALTER TABLE t UNSET TBLPROPERTIES ...`. Users could only downgrade the table + * via `ALTER TABLE t DROP FEATURE ...`. We also do not allow users to unset any ICT properties + * that Coordinated Commits depends on. + */ + def validateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + existingConfs: Map[String, String], + propKeysToUnset: Seq[String]): Unit = { + // If the table does not have any Coordinated Commits configurations, then we do not check the + // properties to unset. This is because unsetting non-existent entries would either be caught + // earlier (without `IF EXISTS`) or simply be a no-op (with `IF EXISTS`). Thus, we ignore them + // instead of throwing an exception. + if (extractCoordinatedCommitsConfigurations(existingConfs).nonEmpty) { + if (propKeysToUnset.exists(TABLE_PROPERTY_KEYS.contains)) { + throw new DeltaIllegalArgumentException( + "DELTA_CANNOT_UNSET_COORDINATED_COMMITS_CONFS", + Array.empty) + } + verifyNotContainsICTConfigurations(propKeysToUnset, command = "ALTER", + errorClass = "DELTA_CANNOT_MODIFY_COORDINATED_COMMITS_DEPENDENCIES") + } + } + /** * Validates the Coordinated Commits configurations in explicit command overrides and default - * SparkSession properties. See `validateCoordinatedCommitsConfigurationsImpl` for details. + * SparkSession properties for `CreateDeltaTableCommand`. + * See `validateConfigurationsForCreateDeltaTableCommandImpl` for details. 
*/ - def validateCoordinatedCommitsConfigurations( + def validateConfigurationsForCreateDeltaTableCommand( spark: SparkSession, - deltaLog: DeltaLog, + tableExists: Boolean, query: Option[LogicalPlan], catalogTableProperties: Map[String, String]): Unit = { - val tableExists = deltaLog.tableExists val (command, propertyOverrides) = query match { // For CLONE, we cannot use the properties from the catalog table, because they are already // the result of merging the source table properties with the overrides, but we do not @@ -459,7 +551,7 @@ object CoordinatedCommitsUtils extends DeltaLogging { cmd.tablePropertyOverrides) case _ => (if (tableExists) "REPLACE" else "CREATE", catalogTableProperties) } - validateCoordinatedCommitsConfigurationsImpl( + validateConfigurationsForCreateDeltaTableCommandImpl( spark, propertyOverrides, tableExists, command) } @@ -471,7 +563,7 @@ object CoordinatedCommitsUtils extends DeltaLogging { * the Coordinator Name and Coordinator Conf, and no Table Conf. Default configurations are * checked similarly if non of the three properties is present in explicit overrides. */ - private[delta] def validateCoordinatedCommitsConfigurationsImpl( + private[delta] def validateConfigurationsForCreateDeltaTableCommandImpl( spark: SparkSession, propertyOverrides: Map[String, String], tableExists: Boolean, @@ -497,4 +589,17 @@ object CoordinatedCommitsUtils extends DeltaLogging { } } } + + /** + * Converts a given Spark [[CatalystTableIdentifier]] to Coordinated Commits [[TableIdentifier]] + */ + def toCCTableIdentifier( + catalystTableIdentifierOpt: Option[CatalystTableIdentifier]): Optional[TableIdentifier] = { + catalystTableIdentifierOpt.map { catalystTableIdentifier => + val namespace = + catalystTableIdentifier.catalog.toSeq ++ + catalystTableIdentifier.database.toSeq + new TableIdentifier(namespace.toArray, catalystTableIdentifier.table) + }.map(Optional.of[TableIdentifier]).getOrElse(Optional.empty[TableIdentifier]) + } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinator.scala b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinator.scala index e722cdd5fe7..991150a5874 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinator.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinator.scala @@ -16,6 +16,7 @@ package org.apache.spark.sql.delta.coordinatedcommits +import java.util.{Map => JMap, Optional} import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.locks.ReentrantReadWriteLock @@ -29,7 +30,9 @@ import io.delta.storage.commit.{ CommitCoordinatorClient, CommitFailedException => JCommitFailedException, CommitResponse, - GetCommitsResponse => JGetCommitsResponse + GetCommitsResponse => JGetCommitsResponse, + TableDescriptor, + TableIdentifier } import io.delta.storage.commit.actions.{AbstractMetadata, AbstractProtocol} import org.apache.hadoop.conf.Configuration @@ -150,14 +153,13 @@ class InMemoryCommitCoordinator(val batchSize: Long) } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, startVersion: java.lang.Long, endVersion: java.lang.Long): JGetCommitsResponse = { - withReadLock[JGetCommitsResponse](logPath) { + withReadLock[JGetCommitsResponse](tableDesc.getLogPath) { val startVersionOpt: Option[Long] = Option(startVersion).map(_.toLong) val endVersionOpt: Option[Long] = 
Option(endVersion).map(_.toLong) - val tableData = perTableMap.get(logPath) + val tableData = perTableMap.get(tableDesc.getLogPath) val effectiveStartVersion = startVersionOpt.getOrElse(0L) // Calculate the end version for the range, or use the last key if endVersion is not provided val effectiveEndVersion = endVersionOpt.getOrElse( @@ -187,9 +189,10 @@ class InMemoryCommitCoordinator(val batchSize: Long) override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, - currentProtocol: AbstractProtocol): java.util.Map[String, String] = { + currentProtocol: AbstractProtocol): JMap[String, String] = { val newPerTableData = new PerTableData(currentVersion + 1) perTableMap.compute(logPath, (_, existingData) => { if (existingData != null) { diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/TableCommitCoordinatorClient.scala b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/TableCommitCoordinatorClient.scala index bf89a92a1e7..80a734b59da 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/TableCommitCoordinatorClient.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/coordinatedcommits/TableCommitCoordinatorClient.scala @@ -24,11 +24,14 @@ import io.delta.storage.commit.{ CommitCoordinatorClient => JCommitCoordinatorClient, CommitResponse, GetCommitsResponse => JGetCommitsResponse, + TableDescriptor, UpdatedActions } import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import org.apache.spark.sql.catalyst.{TableIdentifier => CatalystTableIdentifier} + /** * A wrapper around [[CommitCoordinatorClient]] that provides a more user-friendly API for * committing/ accessing commits to a specific table. 
This class takes care of passing the @@ -48,35 +51,44 @@ case class TableCommitCoordinatorClient( hadoopConf: Configuration, logStore: LogStore) { + private def makeTableDesc( + tableIdentifierOpt: Option[CatalystTableIdentifier]): TableDescriptor = { + val ccTableIdentifier = CoordinatedCommitsUtils.toCCTableIdentifier(tableIdentifierOpt) + new TableDescriptor(logPath, ccTableIdentifier, tableConf.asJava) + } + def commit( commitVersion: Long, actions: Iterator[String], - updatedActions: UpdatedActions): CommitResponse = { + updatedActions: UpdatedActions, + tableIdentifierOpt: Option[CatalystTableIdentifier]): CommitResponse = { commitCoordinatorClient.commit( LogStoreInverseAdaptor(logStore, hadoopConf), hadoopConf, - logPath, - tableConf.asJava, + makeTableDesc(tableIdentifierOpt), commitVersion, actions.asJava, updatedActions) } def getCommits( + tableIdentifierOpt: Option[CatalystTableIdentifier], startVersion: Option[Long] = None, endVersion: Option[Long] = None): JGetCommitsResponse = { commitCoordinatorClient.getCommits( - logPath, tableConf.asJava, startVersion.map(Long.box).orNull, endVersion.map(Long.box).orNull) + makeTableDesc(tableIdentifierOpt), + startVersion.map(Long.box).orNull, + endVersion.map(Long.box).orNull) } def backfillToVersion( + tableIdentifierOpt: Option[CatalystTableIdentifier], version: Long, lastKnownBackfilledVersion: Option[Long] = None): Unit = { commitCoordinatorClient.backfillToVersion( LogStoreInverseAdaptor(logStore, hadoopConf), hadoopConf, - logPath, - tableConf.asJava, + makeTableDesc(tableIdentifierOpt), version, lastKnownBackfilledVersion.map(Long.box).orNull) } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/OptimisticTransactionPhases.scala b/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/OptimisticTransactionPhases.scala index bb59574728e..4fc42e666ce 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/OptimisticTransactionPhases.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/OptimisticTransactionPhases.scala @@ -20,7 +20,8 @@ case class OptimisticTransactionPhases( initialPhase: ExecutionPhaseLock, preparePhase: ExecutionPhaseLock, commitPhase: ExecutionPhaseLock, - backfillPhase: ExecutionPhaseLock) + backfillPhase: ExecutionPhaseLock, + postCommitPhase: ExecutionPhaseLock) object OptimisticTransactionPhases { @@ -30,6 +31,7 @@ object OptimisticTransactionPhases { final val PREPARE_PHASE_LABEL = PREFIX + "PREPARE" final val COMMIT_PHASE_LABEL = PREFIX + "COMMIT" final val BACKFILL_PHASE_LABEL = PREFIX + "BACKFILL" + final val POST_COMMIT_PHASE_LABEL = PREFIX + "POST_COMMIT" def forName(txnName: String): OptimisticTransactionPhases = { @@ -40,6 +42,7 @@ object OptimisticTransactionPhases { initialPhase = ExecutionPhaseLock(toTxnPhaseLabel(INITIAL_PHASE_LABEL)), preparePhase = ExecutionPhaseLock(toTxnPhaseLabel(PREPARE_PHASE_LABEL)), commitPhase = ExecutionPhaseLock(toTxnPhaseLabel(COMMIT_PHASE_LABEL)), - backfillPhase = ExecutionPhaseLock(toTxnPhaseLabel(BACKFILL_PHASE_LABEL))) + backfillPhase = ExecutionPhaseLock(toTxnPhaseLabel(BACKFILL_PHASE_LABEL)), + postCommitPhase = ExecutionPhaseLock(toTxnPhaseLabel(POST_COMMIT_PHASE_LABEL))) } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/PhaseLockingTransactionExecutionObserver.scala b/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/PhaseLockingTransactionExecutionObserver.scala index 74fe566799c..62110fc0634 100644 --- 
a/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/PhaseLockingTransactionExecutionObserver.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/fuzzer/PhaseLockingTransactionExecutionObserver.scala @@ -27,7 +27,8 @@ private[delta] class PhaseLockingTransactionExecutionObserver( phases.initialPhase, phases.preparePhase, phases.commitPhase, - phases.backfillPhase) + phases.backfillPhase, + phases.postCommitPhase) override def createChild(): TransactionExecutionObserver = { // Just return the current thread observer. @@ -59,11 +60,16 @@ private[delta] class PhaseLockingTransactionExecutionObserver( phases.backfillPhase.waitToEnter() } + override def beginPostCommit(): Unit = { + phases.backfillPhase.leave() + phases.postCommitPhase.waitToEnter() + } + override def transactionCommitted(): Unit = { if (nextObserver.nonEmpty && autoAdvanceNextObserver) { - waitForCommitPhaseAndAdvanceToNextObserver() + waitForLastPhaseAndAdvanceToNextObserver() } else { - phases.backfillPhase.leave() + phases.postCommitPhase.leave() } } @@ -74,25 +80,31 @@ private[delta] class PhaseLockingTransactionExecutionObserver( } phases.commitPhase.leave() } - if (!phases.backfillPhase.hasEntered) { - phases.backfillPhase.waitToEnter() + if (!phases.backfillPhase.hasLeft) { + if (!phases.backfillPhase.hasEntered) { + phases.backfillPhase.waitToEnter() + } + phases.backfillPhase.leave() + } + if (!phases.postCommitPhase.hasEntered) { + phases.postCommitPhase.waitToEnter() } if (nextObserver.nonEmpty && autoAdvanceNextObserver) { - waitForCommitPhaseAndAdvanceToNextObserver() + waitForLastPhaseAndAdvanceToNextObserver() } else { - phases.backfillPhase.leave() + phases.postCommitPhase.leave() } } /* - * Wait for the backfill phase to pass but do not unblock it so that callers can write tests + * Wait for the last phase to pass but do not unblock it so that callers can write tests * that capture errors caused by code between the end of the last txn and the start of the * new txn. After the commit phase is passed, update the thread observer of the thread to * the next observer. */ - def waitForCommitPhaseAndAdvanceToNextObserver(): Unit = { + def waitForLastPhaseAndAdvanceToNextObserver(): Unit = { require(nextObserver.nonEmpty) - phases.backfillPhase.waitToLeave() + phases.postCommitPhase.waitToLeave() advanceToNextThreadObserver() } @@ -103,7 +115,7 @@ private[delta] class PhaseLockingTransactionExecutionObserver( * Note that when a next observer is set, the caller needs to manually unblock the exit barrier * of the commit phase. * - * For example, see [[waitForCommitPhaseAndAdvanceToNextObserver]]. + * For example, see [[waitForLastPhaseAndAdvanceToNextObserver]]. 
*/ def setNextObserver( nextTxnObserver: TransactionExecutionObserver, diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/hooks/CheckpointHook.scala b/spark/src/main/scala/org/apache/spark/sql/delta/hooks/CheckpointHook.scala index e6e99c5b01a..833df53199d 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/hooks/CheckpointHook.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/hooks/CheckpointHook.scala @@ -40,6 +40,6 @@ object CheckpointHook extends PostCommitHook { committedVersion, lastCheckpointHint = None, lastCheckpointProvider = Some(cp)) - txn.deltaLog.checkpoint(snapshotToCheckpoint) + txn.deltaLog.checkpoint(snapshotToCheckpoint, txn.catalogTable.map(_.identifier)) } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/schema/ImplicitMetadataOperation.scala b/spark/src/main/scala/org/apache/spark/sql/delta/schema/ImplicitMetadataOperation.scala index dbfc6034af4..015dcf5033c 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/schema/ImplicitMetadataOperation.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/schema/ImplicitMetadataOperation.scala @@ -29,7 +29,7 @@ import org.apache.spark.internal.MDC import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.FileSourceGeneratedMetadataStructField import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes -import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.types.{DataType, StructType} /** * A trait that writers into Delta can extend to update the schema and/or partitioning of the table. @@ -227,6 +227,104 @@ object ImplicitMetadataOperation { } } + /** + * Check whether there are dependant (CHECK) constraints for + * the provided `currentDt`; if so, throw an error indicating + * the constraint data type mismatch. + * + * @param spark the spark session used. + * @param path the full column path for the current field. + * @param metadata the metadata used for checking dependant (CHECK) constraints. + * @param currentDt the current data type. + * @param updateDt the updated data type. + */ + private def checkDependentConstraints( + spark: SparkSession, + path: Seq[String], + metadata: Metadata, + currentDt: DataType, + updateDt: DataType): Unit = { + val dependentConstraints = + Constraints.findDependentConstraints(spark, path, metadata) + if (dependentConstraints.nonEmpty) { + throw DeltaErrors.constraintDataTypeMismatch( + path, + currentDt, + updateDt, + dependentConstraints + ) + } + } + + /** + * Check whether there are dependant generated columns for + * the provided `currentDt`; if so, throw an error indicating + * the generated columns data type mismatch. + * + * @param spark the spark session used. + * @param path the full column path for the current field. + * @param protocol the protocol used. + * @param metadata the metadata used for checking dependant generated columns. + * @param currentDt the current data type. + * @param updateDt the updated data type. 
+ */ + private def checkDependentGeneratedColumns( + spark: SparkSession, + path: Seq[String], + protocol: Protocol, + metadata: Metadata, + currentDt: DataType, + updateDt: DataType): Unit = { + val dependentGeneratedColumns = SchemaUtils.findDependentGeneratedColumns( + spark, path, protocol, metadata.schema) + if (dependentGeneratedColumns.nonEmpty) { + throw DeltaErrors.generatedColumnsDataTypeMismatch( + path, + currentDt, + updateDt, + dependentGeneratedColumns + ) + } + } + + /** + * Check whether the provided field is currently being referenced + * by CHECK constraints or generated columns. + * Note that we explicitly ignore the check for `StructType` in this + * function by only inspecting its inner fields to relax the check; + * plus, any `StructType` will be traversed in [[checkDependentExpressions]]. + * + * @param spark the spark session used. + * @param path the full column path for the current field. + * @param protocol the protocol used. + * @param metadata the metadata used for checking constraints and generated columns. + * @param currentDt the current data type. + * @param updateDt the updated data type. + */ + private def checkConstraintsOrGeneratedColumnsOnStructField( + spark: SparkSession, + path: Seq[String], + protocol: Protocol, + metadata: Metadata, + currentDt: DataType, + updateDt: DataType): Unit = (currentDt, updateDt) match { + // we explicitly ignore the check for `StructType` here. + case (StructType(_), StructType(_)) => + + // FIXME: we intentionally incorporate the pattern match for `ArrayType` and `MapType` + // here mainly due to the field paths for maps/arrays in constraints/generated columns + // are *NOT* consistent with regular field paths, + // e.g., `hash(a.arr[0].x)` vs. `hash(a.element.x)`. + // this makes it hard to recurse into maps/arrays and check for the corresponding + // fields - thus we can not actually block the operation even if the updated field + // is being referenced by any CHECK constraints or generated columns. + case (from, to) => + if (currentDt != updateDt) { + checkDependentConstraints(spark, path, metadata, from, to) + checkDependentGeneratedColumns(spark, path, protocol, metadata, from, to) + } + } + /** * Finds all fields that change between the current schema and the new data schema and fail if any * of them are referenced by check constraints or generated columns. @@ -236,42 +334,23 @@ object ImplicitMetadataOperation { protocol: Protocol, metadata: actions.Metadata, dataSchema: StructType): Unit = - SchemaMergingUtils.transformColumns(metadata.schema, dataSchema) { - case (fieldPath, currentField, Some(updateField), _) - // This condition is actually too strict, structs may be identified as changing because one - // of their field is changing even though that field isn't referenced by any constraint or - // generated column. This is intentional to keep the check simple and robust, esp. since it - // aligns with the historical behavior of this check. 
- if !SchemaMergingUtils.equalsIgnoreCaseAndCompatibleNullability( - currentField.dataType, - updateField.dataType - ) => - val columnPath = fieldPath :+ currentField.name - // check if the field to change is referenced by check constraints - val dependentConstraints = - Constraints.findDependentConstraints(sparkSession, columnPath, metadata) - if (dependentConstraints.nonEmpty) { - throw DeltaErrors.constraintDataTypeMismatch( - columnPath, - currentField.dataType, - updateField.dataType, - dependentConstraints - ) - } - // check if the field to change is referenced by any generated columns - val dependentGenCols = SchemaUtils.findDependentGeneratedColumns( - sparkSession, columnPath, protocol, metadata.schema) - if (dependentGenCols.nonEmpty) { - throw DeltaErrors.generatedColumnsDataTypeMismatch( - columnPath, - currentField.dataType, - updateField.dataType, - dependentGenCols - ) - } - // We don't transform the schema but just perform checks, the returned field won't be used - // anyway. - updateField - case (_, field, _, _) => field - } + SchemaMergingUtils.transformColumns(metadata.schema, dataSchema) { + case (fieldPath, currentField, Some(updateField), _) + if !SchemaMergingUtils.equalsIgnoreCaseAndCompatibleNullability( + currentField.dataType, + updateField.dataType + ) => + checkConstraintsOrGeneratedColumnsOnStructField( + spark = sparkSession, + path = fieldPath :+ currentField.name, + protocol = protocol, + metadata = metadata, + currentDt = currentField.dataType, + updateDt = updateField.dataType + ) + // We don't transform the schema but just perform checks, + // the returned field won't be used anyway. + updateField + case (_, field, _, _) => field + } } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaMergingUtils.scala b/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaMergingUtils.scala index cfd84e9b0f0..fd7172603c0 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaMergingUtils.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaMergingUtils.scala @@ -296,9 +296,9 @@ object SchemaMergingUtils { * @param tf function to apply. * @return the transformed schema. */ - def transformColumns( - schema: StructType)( - tf: (Seq[String], StructField, Resolver) => StructField): StructType = { + def transformColumns[T <: DataType]( + schema: T)( + tf: (Seq[String], StructField, Resolver) => StructField): T = { def transform[E <: DataType](path: Seq[String], dt: E): E = { val newDt = dt match { case StructType(fields) => diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaUtils.scala b/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaUtils.scala index b346802caf7..0a018e3ca50 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaUtils.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/schema/SchemaUtils.scala @@ -59,7 +59,7 @@ object SchemaUtils extends DeltaLogging { * defines whether we should recurse into ArrayType and MapType. 
*/ def filterRecursively( - schema: StructType, + schema: DataType, checkComplexTypes: Boolean)(f: StructField => Boolean): Seq[(Seq[String], StructField)] = { def recurseIntoComplexTypes( complexType: DataType, @@ -699,7 +699,7 @@ def normalizeColumnNamesInDataType( */ def findColumnPosition( column: Seq[String], - schema: StructType, + schema: DataType, resolver: Resolver = DELTA_COL_RESOLVER): Seq[Int] = { def findRecursively( searchPath: Seq[String], @@ -803,7 +803,7 @@ def normalizeColumnNamesInDataType( * @param position A list of ordinals (0-based) representing the path to the nested field in * `parent`. */ - def getNestedTypeFromPosition(schema: StructType, position: Seq[Int]): DataType = + def getNestedTypeFromPosition(schema: DataType, position: Seq[Int]): DataType = getNestedFieldFromPosition(StructField("schema", schema), position).dataType /** @@ -814,7 +814,34 @@ def normalizeColumnNamesInDataType( } /** - * Add `column` to the specified `position` in `schema`. + * Add a column to its child. + * @param parent The parent data type. + * @param column The column to add. + * @param position The position to add the column. + */ + def addColumn[T <: DataType](parent: T, column: StructField, position: Seq[Int]): T = { + if (position.isEmpty) { + throw DeltaErrors.addColumnParentNotStructException(column, parent) + } + parent match { + case struct: StructType => + addColumnToStruct(struct, column, position).asInstanceOf[T] + case map: MapType if position.head == MAP_KEY_INDEX => + map.copy(keyType = addColumn(map.keyType, column, position.tail)).asInstanceOf[T] + case map: MapType if position.head == MAP_VALUE_INDEX => + map.copy(valueType = addColumn(map.valueType, column, position.tail)).asInstanceOf[T] + case array: ArrayType if position.head == ARRAY_ELEMENT_INDEX => + array.copy(elementType = + addColumn(array.elementType, column, position.tail)).asInstanceOf[T] + case _: ArrayType => + throw DeltaErrors.incorrectArrayAccess() + case other => + throw DeltaErrors.addColumnParentNotStructException(column, other) + } + } + + /** + * Add `column` to the specified `position` in a struct `schema`. * @param position A Seq of ordinals on where this column should go. It is a Seq to denote * positions in nested columns (0-based). For example: * @@ -824,26 +851,10 @@ def normalizeColumnNamesInDataType( * will return * result: , b,c:STRUCT> */ - def addColumn(schema: StructType, column: StructField, position: Seq[Int]): StructType = { - def addColumnInChild(parent: DataType, column: StructField, position: Seq[Int]): DataType = { - if (position.isEmpty) { - throw DeltaErrors.addColumnParentNotStructException(column, parent) - } - parent match { - case struct: StructType => - addColumn(struct, column, position) - case map: MapType if position.head == MAP_KEY_INDEX => - map.copy(keyType = addColumnInChild(map.keyType, column, position.tail)) - case map: MapType if position.head == MAP_VALUE_INDEX => - map.copy(valueType = addColumnInChild(map.valueType, column, position.tail)) - case array: ArrayType if position.head == ARRAY_ELEMENT_INDEX => - array.copy(elementType = addColumnInChild(array.elementType, column, position.tail)) - case _: ArrayType => - throw DeltaErrors.incorrectArrayAccess() - case other => - throw DeltaErrors.addColumnParentNotStructException(column, other) - } - } + private def addColumnToStruct( + schema: StructType, + column: StructField, + position: Seq[Int]): StructType = { // If the proposed new column includes a default value, return a specific "not supported" error. 
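
The generalized addColumn above (and dropColumn further below) navigate nested types through ordinal position paths. A self-contained sketch of how such paths resolve against a schema; the sentinel values assumed here (array element = 0, map key = 0, map value = 1) are for illustration only and are not taken from this diff:

import org.apache.spark.sql.types._

val schema = new StructType()
  .add("a", new StructType()
    .add("arr", ArrayType(new StructType().add("x", IntegerType))))
  .add("b", MapType(StringType, new StructType().add("y", LongType)))

// Walk a schema following ordinal positions; for arrays the ordinal is consumed as the
// element sentinel, for maps 0 selects the key type and 1 the value type.
def resolve(dt: DataType, position: Seq[Int]): DataType = position match {
  case Seq() => dt
  case head +: tail => dt match {
    case s: StructType => resolve(s(head).dataType, tail)
    case a: ArrayType  => resolve(a.elementType, tail)
    case m: MapType    => resolve(if (head == 0) m.keyType else m.valueType, tail)
    case other         => other
  }
}

// a.arr.element.x -> field "a" (0), field "arr" (0), array element (0), field "x" (0)
assert(resolve(schema, Seq(0, 0, 0, 0)) == IntegerType)
// b.value.y       -> field "b" (1), map value (1), field "y" (0)
assert(resolve(schema, Seq(1, 1, 0)) == LongType)
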
// The rationale is that such operations require the data source scan operator to implement // support for filling in the specified default value when the corresponding field is not @@ -877,13 +888,42 @@ def normalizeColumnNamesInDataType( if (!column.nullable && field.nullable) { throw DeltaErrors.nullableParentWithNotNullNestedField } - val mid = field.copy(dataType = addColumnInChild(field.dataType, column, position.tail)) + val mid = field.copy(dataType = addColumn(field.dataType, column, position.tail)) StructType(pre ++ Seq(mid) ++ post.tail) } else { StructType(pre ++ Seq(column) ++ post) } } + /** + * Drop a column from its child. + * @param parent The parent data type. + * @param position The position to drop the column. + */ + def dropColumn[T <: DataType](parent: T, position: Seq[Int]): (T, StructField) = { + if (position.isEmpty) { + throw DeltaErrors.dropNestedColumnsFromNonStructTypeException(parent) + } + parent match { + case struct: StructType => + val (t, s) = dropColumnInStruct(struct, position) + (t.asInstanceOf[T], s) + case map: MapType if position.head == MAP_KEY_INDEX => + val (newKeyType, droppedColumn) = dropColumn(map.keyType, position.tail) + map.copy(keyType = newKeyType).asInstanceOf[T] -> droppedColumn + case map: MapType if position.head == MAP_VALUE_INDEX => + val (newValueType, droppedColumn) = dropColumn(map.valueType, position.tail) + map.copy(valueType = newValueType).asInstanceOf[T] -> droppedColumn + case array: ArrayType if position.head == ARRAY_ELEMENT_INDEX => + val (newElementType, droppedColumn) = dropColumn(array.elementType, position.tail) + array.copy(elementType = newElementType).asInstanceOf[T] -> droppedColumn + case _: ArrayType => + throw DeltaErrors.incorrectArrayAccess() + case other => + throw DeltaErrors.dropNestedColumnsFromNonStructTypeException(other) + } + } + /** * Drop from the specified `position` in `schema` and return with the original column. * @param position A Seq of ordinals on where this column should go. 
It is a Seq to denote @@ -894,30 +934,9 @@ def normalizeColumnNamesInDataType( * will return * result: , b,c:STRUCT> */ - def dropColumn(schema: StructType, position: Seq[Int]): (StructType, StructField) = { - def dropColumnInChild(parent: DataType, position: Seq[Int]): (DataType, StructField) = { - if (position.isEmpty) { - throw DeltaErrors.dropNestedColumnsFromNonStructTypeException(parent) - } - parent match { - case struct: StructType => - dropColumn(struct, position) - case map: MapType if position.head == MAP_KEY_INDEX => - val (newKeyType, droppedColumn) = dropColumnInChild(map.keyType, position.tail) - map.copy(keyType = newKeyType) -> droppedColumn - case map: MapType if position.head == MAP_VALUE_INDEX => - val (newValueType, droppedColumn) = dropColumnInChild(map.valueType, position.tail) - map.copy(valueType = newValueType) -> droppedColumn - case array: ArrayType if position.head == ARRAY_ELEMENT_INDEX => - val (newElementType, droppedColumn) = dropColumnInChild(array.elementType, position.tail) - array.copy(elementType = newElementType) -> droppedColumn - case _: ArrayType => - throw DeltaErrors.incorrectArrayAccess() - case other => - throw DeltaErrors.dropNestedColumnsFromNonStructTypeException(other) - } - } - + private def dropColumnInStruct( + schema: StructType, + position: Seq[Int]): (StructType, StructField) = { require(position.nonEmpty, "Don't know where to drop the column") val slicePosition = position.head if (slicePosition < 0) { @@ -930,7 +949,7 @@ def normalizeColumnNamesInDataType( val (pre, post) = schema.splitAt(slicePosition) val field = post.head if (position.length > 1) { - val (newType, droppedColumn) = dropColumnInChild(field.dataType, position.tail) + val (newType, droppedColumn) = dropColumn(field.dataType, position.tail) val mid = field.copy(dataType = newType) StructType(pre ++ Seq(mid) ++ post.tail) -> droppedColumn diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala b/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala index f9fbb1167c8..41ba27ac219 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSQLConf.scala @@ -1551,6 +1551,20 @@ trait DeltaSQLConfBase { .booleanConf .createWithDefault(true) + val DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS = + buildConf("streaming.sink.allowImplicitCasts") + .internal() + .doc( + """Whether to accept writing data to a Delta streaming sink when the data type doesn't + |match the type in the underlying Delta table. When true, data is cast to the expected + |type before the write. When false, the write fails. + |The casting behavior is governed by 'spark.sql.storeAssignmentPolicy'. + |""".stripMargin) + .booleanConf + // This feature doesn't properly support structs with missing fields and is disabled until a + // fix is implemented. + .createWithDefault(false) + val DELTA_CDF_UNSAFE_BATCH_READ_ON_INCOMPATIBLE_SCHEMA_CHANGES = buildConf("changeDataFeed.unsafeBatchReadOnIncompatibleSchemaChanges.enabled") .doc( @@ -1594,6 +1608,21 @@ trait DeltaSQLConfBase { .booleanConf .createWithDefault(true) + val DELTA_COLUMN_MAPPING_STRIP_METADATA = + buildConf("columnMapping.stripMetadata") + .doc( + """ + |Transactions might try to update the schema of a table with columns that contain + |column mapping metadata, even when column mapping is not enabled. For example, this + |can happen when transactions copy the schema from another table. 
When this setting is + |enabled, we will strip the column mapping metadata from the schema before applying it. + |Note that this config applies only when the existing schema of the table does not + |contain any column mapping metadata. + |""".stripMargin) + .internal() + .booleanConf + .createWithDefault(true) + val DYNAMIC_PARTITION_OVERWRITE_ENABLED = buildConf("dynamicPartitionOverwrite.enabled") .doc("Whether to overwrite partitions dynamically when 'partitionOverwriteMode' is set to " + diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSink.scala b/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSink.scala index 849a399a997..df673cbe0db 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSink.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaSink.scala @@ -20,23 +20,25 @@ import java.util.concurrent.ConcurrentHashMap import org.apache.spark.sql.delta._ import org.apache.spark.sql.delta.DeltaOperations.StreamingUpdate -import org.apache.spark.sql.delta.actions.{FileAction, SetTransaction} +import org.apache.spark.sql.delta.actions.{FileAction, Metadata, Protocol, SetTransaction} import org.apache.spark.sql.delta.logging.DeltaLogKeys import org.apache.spark.sql.delta.metering.DeltaLogging -import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils} +import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaMergingUtils, SchemaUtils} import org.apache.hadoop.fs.Path // scalastyle:off import.ordering.noEmptyLine import org.apache.spark.internal.MDC import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog.CatalogTable -import org.apache.spark.sql.catalyst.expressions.Attribute -import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.catalyst.expressions.Alias +import org.apache.spark.sql.catalyst.types.DataTypeUtils +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap +import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric -import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution} +import org.apache.spark.sql.execution.streaming.{IncrementalExecution, Sink, StreamExecution} import org.apache.spark.sql.streaming.OutputMode -import org.apache.spark.sql.types.NullType +import org.apache.spark.sql.types.{DataType, NullType, StructType} import org.apache.spark.util.Utils /** @@ -51,6 +53,7 @@ case class DeltaSink( catalogTable: Option[CatalogTable] = None) extends Sink with ImplicitMetadataOperation + with UpdateExpressionsSupport with DeltaLogging { private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path) @@ -126,8 +129,9 @@ case class DeltaSink( txn.readWholeTable() } + val writeSchema = getWriteSchema(txn.protocol, txn.metadata, data.schema) // Streaming sinks can't blindly overwrite schema. 
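
A sketch of how the sink cast path defined above might be exercised end to end. It assumes a spark-shell style session, that the config key carries the usual spark.databricks.delta. prefix, and that the flag is enabled explicitly since it defaults to false; the checkpoint path and table name are illustrative:

import org.apache.spark.sql.functions.col

spark.conf.set("spark.databricks.delta.streaming.sink.allowImplicitCasts", "true")
spark.sql("CREATE TABLE sink_demo (value LONG) USING delta")

// The stream produces an INT column while the table stores LONG; with the flag on,
// the sink casts the data per spark.sql.storeAssignmentPolicy instead of failing the batch.
val query = spark.readStream
  .format("rate").option("rowsPerSecond", "5").load()
  .select(col("value").cast("int").as("value"))
  .writeStream
  .format("delta")
  .option("checkpointLocation", "/tmp/sink_demo/_checkpoint") // illustrative path
  .toTable("sink_demo")
// query.stop() when done.
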
See Schema Management design doc for details - updateMetadata(data.sparkSession, txn, data.schema, partitionColumns, Map.empty, + updateMetadata(data.sparkSession, txn, writeSchema, partitionColumns, Map.empty, outputMode == OutputMode.Complete(), rearrangeOnly = false) val currentVersion = txn.txnVersion(queryId) @@ -144,7 +148,7 @@ case class DeltaSink( case _ => Nil } val (newFiles, writeFilesTimeMs) = Utils.timeTakenMs{ - txn.writeFiles(data, Some(options)) + txn.writeFiles(castDataIfNeeded(data, writeSchema), Some(options)) } val totalSize = newFiles.map(_.getFileSize).sum val totalLogicalRecords = newFiles.map(_.numLogicalRecords.getOrElse(0L)).sum @@ -161,6 +165,56 @@ case class DeltaSink( return true } + /** + * Returns the schema to use to write data to this delta table. The write schema includes new + * columns to add with schema evolution and reconciles types to match the table types when + * possible or apply type widening if enabled. + */ + private def getWriteSchema( + protocol: Protocol, metadata: Metadata, dataSchema: StructType): StructType = { + if (!sqlConf.getConf(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS)) return dataSchema + + if (canOverwriteSchema) return dataSchema + + SchemaMergingUtils.mergeSchemas( + metadata.schema, + dataSchema, + allowImplicitConversions = true, + allowTypeWidening = canMergeSchema && TypeWidening.isEnabled(protocol, metadata) + ) + } + + /** Casts columns in the given dataframe to match the target schema. */ + private def castDataIfNeeded(data: DataFrame, targetSchema: StructType): DataFrame = { + if (!sqlConf.getConf(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS)) return data + + // We should respect 'spark.sql.caseSensitive' here but writing to a Delta sink is currently + // case insensitive so we align with that. + val targetTypes = + CaseInsensitiveMap[DataType](targetSchema.map(field => field.name -> field.dataType).toMap) + + val needCast = data.schema.exists { field => + !DataTypeUtils.equalsIgnoreCaseAndNullability(field.dataType, targetTypes(field.name)) + } + if (!needCast) return data + + val castColumns = data.columns.map { columnName => + val castExpr = castIfNeeded( + fromExpression = data.col(columnName).expr, + dataType = targetTypes(columnName), + allowStructEvolution = canMergeSchema, + columnName = columnName + ) + new Column(Alias(castExpr, columnName)()) + } + + data.queryExecution match { + case i: IncrementalExecution => + DeltaStreamUtils.selectFromStreamingDataFrame(i, data, castColumns: _*) + case _: QueryExecution => + data.select(castColumns: _*) + } + } override def toString(): String = s"DeltaSink[$path]" } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaStreamUtils.scala b/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaStreamUtils.scala new file mode 100644 index 00000000000..3770584b050 --- /dev/null +++ b/spark/src/main/scala/org/apache/spark/sql/delta/sources/DeltaStreamUtils.scala @@ -0,0 +1,55 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta.sources + +import scala.collection.mutable + +import org.apache.hadoop.fs.Path + +import org.apache.spark.sql.{Column, DataFrame, Dataset, Encoder} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.streaming.{IncrementalExecution, IncrementalExecutionShims, StreamExecution} + +object DeltaStreamUtils { + + /** + * Select `cols` from a micro batch DataFrame. Directly calling `select` won't work because it + * will create a `QueryExecution` rather than inheriting `IncrementalExecution` from + * the micro batch DataFrame. A streaming micro batch DataFrame to execute should use + * `IncrementalExecution`. + */ + def selectFromStreamingDataFrame( + incrementalExecution: IncrementalExecution, + df: DataFrame, + cols: Column*): DataFrame = { + val newMicroBatch = df.select(cols: _*) + val newIncrementalExecution = IncrementalExecutionShims.newInstance( + newMicroBatch.sparkSession, + newMicroBatch.queryExecution.logical, + incrementalExecution) + newIncrementalExecution.executedPlan // Force the lazy generation of execution plan + + + // Use reflection to call the private constructor. + val constructor = + classOf[Dataset[_]].getConstructor(classOf[QueryExecution], classOf[Encoder[_]]) + constructor.newInstance( + newIncrementalExecution, + ExpressionEncoder(newIncrementalExecution.analyzed.schema)).asInstanceOf[DataFrame] + } +} diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/storage/dv/DeletionVectorStore.scala b/spark/src/main/scala/org/apache/spark/sql/delta/storage/dv/DeletionVectorStore.scala index 537c95f81bb..cb29804a0a3 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/storage/dv/DeletionVectorStore.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/storage/dv/DeletionVectorStore.scala @@ -25,15 +25,15 @@ import java.util.zip.CRC32 import org.apache.spark.sql.delta.DeltaErrors import org.apache.spark.sql.delta.actions.DeletionVectorDescriptor import org.apache.spark.sql.delta.deletionvectors.{RoaringBitmapArray, StoredBitmap} +import org.apache.spark.sql.delta.metering.DeltaLogging import org.apache.spark.sql.delta.util.PathWithFileSystem import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataOutputStream, Path} -import org.apache.spark.internal.Logging import org.apache.spark.paths.SparkPath import org.apache.spark.util.Utils -trait DeletionVectorStore extends Logging { +trait DeletionVectorStore extends DeltaLogging { /** * Read a Deletion Vector and parse it as [[RoaringBitmapArray]]. */ @@ -205,22 +205,36 @@ class HadoopFileSystemDVStore(hadoopConf: Configuration) // Lazily create the writer for the deletion vectors, so that we don't write an empty file // in case all deletion vectors are empty. 
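
The writtenBytes bookkeeping introduced just below cross-checks the offsets reported by outputStream.size(). A small sketch of the arithmetic involved, assuming the V1 layout of one format-version byte followed, per deletion vector, by a 4-byte length, the bitmap bytes, and a 4-byte checksum (so getTotalSizeOfDVFieldsInFile(n) is taken here to be n + 8):

// Expected size of one serialized DV entry: 4-byte length + data + 4-byte checksum.
def totalSizeOfDvFields(dataLength: Int): Long = 4L + dataLength + 4L

val dvSizes = Seq(16, 64, 32)
// Starting offsets of each DV range; the file begins with a single version byte.
val offsets = dvSizes.scanLeft(1L)((offset, len) => offset + totalSizeOfDvFields(len)).init
println(offsets) // List(1, 25, 97)
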
private var outputStream: FSDataOutputStream = _ + private var writtenBytes = 0L override def write(data: Array[Byte]): DeletionVectorStore.DVRangeDescriptor = { if (outputStream == null) { val overwrite = false // `create` Java API does not support named parameters outputStream = path.fs.create(path.path, overwrite) outputStream.writeByte(DeletionVectorStore.DV_FILE_FORMAT_VERSION_ID_V1) + writtenBytes += 1 } val dvRange = DeletionVectorStore.DVRangeDescriptor( offset = outputStream.size(), length = data.length, - checksum = DeletionVectorStore.calculateChecksum(data) - ) + checksum = DeletionVectorStore.calculateChecksum(data)) + + if (writtenBytes != dvRange.offset) { + recordDeltaEvent( + deltaLog = null, + opType = "delta.deletionVector.write.offsetMismatch", + data = Map( + "path" -> path.path.toString, + "reportedOffset" -> dvRange.offset, + "calculatedOffset" -> writtenBytes)) + } + log.debug(s"Writing DV range to file: Path=${path.path}, Range=${dvRange}") outputStream.writeInt(data.length) outputStream.write(data) outputStream.writeInt(dvRange.checksum) + writtenBytes += DeletionVectorStore.getTotalSizeOfDVFieldsInFile(data.length) + dvRange } diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/util/DatasetRefCache.scala b/spark/src/main/scala/org/apache/spark/sql/delta/util/DatasetRefCache.scala index 3d9bdbdbebe..879a101bb31 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/util/DatasetRefCache.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/util/DatasetRefCache.scala @@ -42,10 +42,12 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} * * @param creator a function to create [[Dataset]]. */ -class DatasetRefCache[T](creator: () => Dataset[T]) { +class DatasetRefCache[T] private[util](creator: () => Dataset[T]) { private val holder = new AtomicReference[Dataset[T]] + private[delta] def invalidate() = holder.set(null) + def get: Dataset[T] = Option(holder.get()) .filter(_.sparkSession eq SparkSession.active) .getOrElse { @@ -54,4 +56,3 @@ class DatasetRefCache[T](creator: () => Dataset[T]) { df } } - diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/util/DeltaFileOperations.scala b/spark/src/main/scala/org/apache/spark/sql/delta/util/DeltaFileOperations.scala index bf1d4727fd7..8df11b3e18b 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/util/DeltaFileOperations.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/util/DeltaFileOperations.scala @@ -243,7 +243,10 @@ object DeltaFileOperations extends DeltaLogging { import org.apache.spark.sql.delta.implicits._ if (subDirs.isEmpty) return spark.emptyDataset[SerializableFileStatus] val listParallelism = fileListingParallelism.getOrElse(spark.sparkContext.defaultParallelism) - val dirsAndFiles = spark.sparkContext.parallelize(subDirs).mapPartitions { dirs => + val subDirsParallelism = subDirs.length.min(spark.sparkContext.defaultParallelism) + val dirsAndFiles = spark.sparkContext.parallelize( + subDirs, + subDirsParallelism).mapPartitions { dirs => val logStore = LogStore(SparkEnv.get.conf, hadoopConf.value.value) listUsingLogStore( logStore, diff --git a/spark/src/main/scala/org/apache/spark/sql/delta/util/StateCache.scala b/spark/src/main/scala/org/apache/spark/sql/delta/util/StateCache.scala index 391cf56710d..86784bfde3c 100644 --- a/spark/src/main/scala/org/apache/spark/sql/delta/util/StateCache.scala +++ b/spark/src/main/scala/org/apache/spark/sql/delta/util/StateCache.scala @@ -40,6 +40,7 @@ trait StateCache extends DeltaLogging { private var 
_isCached = true /** A list of RDDs that we need to uncache when we are done with this snapshot. */ private val cached = ArrayBuffer[RDD[_]]() + private val cached_refs = ArrayBuffer[DatasetRefCache[_]]() /** Method to expose the value of _isCached for testing. */ private[delta] def isCached: Boolean = _isCached @@ -47,7 +48,7 @@ trait StateCache extends DeltaLogging { private val storageLevel = StorageLevel.fromString( spark.sessionState.conf.getConf(DeltaSQLConf.DELTA_SNAPSHOT_CACHE_STORAGE_LEVEL)) - class CachedDS[A](ds: Dataset[A], name: String) { + class CachedDS[A] private[StateCache](ds: Dataset[A], name: String) { // While we cache RDD to avoid re-computation in different spark sessions, `Dataset` can only be // reused by the session that created it to avoid session pollution. So we use `DatasetRefCache` // to re-create a new `Dataset` when the active session is changed. This is an optimization for @@ -64,10 +65,10 @@ trait StateCache extends DeltaLogging { rdd.persist(storageLevel) } cached += rdd - val dsCache = new DatasetRefCache(() => { + val dsCache = datasetRefCache { () => val logicalRdd = LogicalRDD(qe.analyzed.output, rdd)(spark) Dataset.ofRows(spark, logicalRdd) - }) + } Some(dsCache) } else { None @@ -110,11 +111,18 @@ trait StateCache extends DeltaLogging { new CachedDS[A](ds, name) } + def datasetRefCache[A](creator: () => Dataset[A]): DatasetRefCache[A] = { + val dsCache = new DatasetRefCache(creator) + cached_refs += dsCache + dsCache + } + /** Drop any cached data for this [[Snapshot]]. */ def uncache(): Unit = cached.synchronized { if (isCached) { _isCached = false cached.foreach(_.unpersist(blocking = false)) + cached_refs.foreach(_.invalidate()) } } } diff --git a/spark/src/test/resources/log4j2.properties b/spark/src/test/resources/log4j2.properties index a35b0f42608..742d9e1fa8c 100644 --- a/spark/src/test/resources/log4j2.properties +++ b/spark/src/test/resources/log4j2.properties @@ -38,13 +38,6 @@ appender.file.append = true appender.file.layout.type = PatternLayout appender.file.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n -# Structured Logging Appender -appender.structured.type = File -appender.structured.name = structured -appender.structured.fileName = target/structured.log -appender.structured.layout.type = JsonTemplateLayout -appender.structured.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json - # Pattern Logging Appender appender.pattern.type = File appender.pattern.name = pattern @@ -52,12 +45,6 @@ appender.pattern.fileName = target/pattern.log appender.pattern.layout.type = PatternLayout appender.pattern.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex -# Custom logger for testing structured logging with Spark master -logger.structured_logging.name = org.apache.spark.sql.delta.logging.DeltaStructuredLoggingSuite -logger.structured_logging.level = trace -logger.structured_logging.appenderRefs = structured -logger.structured_logging.appenderRef.structured.ref = structured - # Custom logger for testing structured logging with Spark 3.5 shims logger.pattern_logging.name = org.apache.spark.sql.delta.logging.DeltaPatternLoggingSuite logger.pattern_logging.level = trace @@ -76,4 +63,3 @@ appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n # Ignore messages below warning level from Jetty, because it's a bit verbose logger.jetty.name = org.sparkproject.jetty logger.jetty.level = warn - diff --git a/spark/src/test/resources/log4j2_spark_master.properties 
b/spark/src/test/resources/log4j2_spark_master.properties new file mode 100644 index 00000000000..e76ca3383b7 --- /dev/null +++ b/spark/src/test/resources/log4j2_spark_master.properties @@ -0,0 +1,65 @@ +# +# Copyright (2021) The Delta Lake Project Authors. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the file target/unit-tests.log +rootLogger.level = info +rootLogger.appenderRef.file.ref = ${sys:test.appender:-File} + +appender.file.type = File +appender.file.name = File +appender.file.fileName = target/unit-tests.log +appender.file.append = true +appender.file.layout.type = PatternLayout +appender.file.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n + +# Structured Logging Appender +appender.structured.type = File +appender.structured.name = structured +appender.structured.fileName = target/structured.log +appender.structured.layout.type = JsonTemplateLayout +appender.structured.layout.eventTemplateUri = classpath:org/apache/spark/SparkLayout.json + +# Custom logger for testing structured logging with Spark master +logger.structured_logging.name = org.apache.spark.sql.delta.logging.DeltaStructuredLoggingSuite +logger.structured_logging.level = trace +logger.structured_logging.appenderRefs = structured +logger.structured_logging.appenderRef.structured.ref = structured + +# Tests that launch java subprocesses can set the "test.appender" system property to +# "console" to avoid having the child process's logs overwrite the unit test's +# log file. 
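
A hypothetical sketch of the mechanism the comment above describes: a test that forks a child JVM can point the child at the console appender defined just below so its output does not clobber target/unit-tests.log. The main class and the explicit config-file property are assumptions for illustration:

val cmd = Seq(
  "java",
  "-Dtest.appender=console", // resolved by ${sys:test.appender:-File} above
  "-Dlog4j.configurationFile=log4j2_spark_master.properties", // hypothetical explicit choice
  "-cp", sys.props("java.class.path"),
  "com.example.ChildProcess" // hypothetical main class
)
new ProcessBuilder(cmd: _*).inheritIO().start().waitFor()
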
+appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n + +# Ignore messages below warning level from Jetty, because it's a bit verbose +logger.jetty.name = org.sparkproject.jetty +logger.jetty.level = warn diff --git a/spark/src/test/scala-spark-master/org/apache/spark/sql/delta/DeltaVariantSuite.scala b/spark/src/test/scala-spark-master/org/apache/spark/sql/delta/DeltaVariantSuite.scala index 449cd55b9fb..daac96a28e4 100644 --- a/spark/src/test/scala-spark-master/org/apache/spark/sql/delta/DeltaVariantSuite.scala +++ b/spark/src/test/scala-spark-master/org/apache/spark/sql/delta/DeltaVariantSuite.scala @@ -100,7 +100,7 @@ class DeltaVariantSuite // check previously thrown error message checkError( e, - errorClass = "DELTA_FEATURES_REQUIRE_MANUAL_ENABLEMENT", + "DELTA_FEATURES_REQUIRE_MANUAL_ENABLEMENT", parameters = Map( "unsupportedFeatures" -> VariantTypeTableFeature.name, "supportedFeatures" -> currentFeatures @@ -123,13 +123,13 @@ class DeltaVariantSuite test("VariantType may not be used as a partition column") { withTable("delta_test") { checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql( """CREATE TABLE delta_test(s STRING, v VARIANT) |USING delta |PARTITIONED BY (v)""".stripMargin) }, - errorClass = "INVALID_PARTITION_COLUMN_DATA_TYPE", + "INVALID_PARTITION_COLUMN_DATA_TYPE", parameters = Map("type" -> "\"VARIANT\"") ) } @@ -516,7 +516,7 @@ class DeltaVariantSuite } checkError( insertException, - errorClass = "DELTA_NOT_NULL_CONSTRAINT_VIOLATED", + "DELTA_NOT_NULL_CONSTRAINT_VIOLATED", parameters = Map("columnName" -> "v") ) @@ -539,7 +539,7 @@ class DeltaVariantSuite } checkError( insertException, - errorClass = "DELTA_VIOLATE_CONSTRAINT_WITH_VALUES", + "DELTA_VIOLATE_CONSTRAINT_WITH_VALUES", parameters = Map( "constraintName" -> "variantgtezero", "expression" -> "(variant_get(v, '$', 'INT') >= 0)", "values" -> " - v : -1" diff --git a/spark/src/test/scala/io/delta/sql/parser/DeltaSqlParserSuite.scala b/spark/src/test/scala/io/delta/sql/parser/DeltaSqlParserSuite.scala index 1052c5528bb..4934a1d8849 100644 --- a/spark/src/test/scala/io/delta/sql/parser/DeltaSqlParserSuite.scala +++ b/spark/src/test/scala/io/delta/sql/parser/DeltaSqlParserSuite.scala @@ -463,9 +463,9 @@ class DeltaSqlParserSuite extends SparkFunSuite with SQLHelper { val parser = new DeltaSqlParser(new SparkSqlParser()) val sql = clusterByStatement(clause, asSelect, "a int, b string", "CLUSTER BY (a) CLUSTER BY (b)") - checkError(exception = intercept[ParseException] { + checkError(intercept[ParseException] { parser.parsePlan(sql) - }, errorClass = "DUPLICATE_CLAUSES", parameters = Map("clauseName" -> "CLUSTER BY")) + }, "DUPLICATE_CLAUSES", parameters = Map("clauseName" -> "CLUSTER BY")) } test("CLUSTER BY set clustering column property is ignored - " + @@ -492,9 +492,9 @@ class DeltaSqlParserSuite extends SparkFunSuite with SQLHelper { "CLUSTER BY (a) PARTITIONED BY (b)") val errorMsg = "Clustering and partitioning cannot both be specified. 
" + "Please remove PARTITIONED BY if you want to create a Delta table with clustering" - checkError(exception = intercept[ParseException] { + checkError(intercept[ParseException] { parser.parsePlan(sql) - }, errorClass = "_LEGACY_ERROR_TEMP_0035", parameters = Map("message" -> errorMsg)) + }, "_LEGACY_ERROR_TEMP_0035", parameters = Map("message" -> errorMsg)) } test(s"CLUSTER BY with bucketing - $clause TABLE asSelect = $asSelect") { @@ -508,9 +508,9 @@ class DeltaSqlParserSuite extends SparkFunSuite with SQLHelper { val errorMsg = "Clustering and bucketing cannot both be specified. " + "Please remove CLUSTERED BY INTO BUCKETS if you " + "want to create a Delta table with clustering" - checkError(exception = intercept[ParseException] { + checkError(intercept[ParseException] { parser.parsePlan(sql) - }, errorClass = "_LEGACY_ERROR_TEMP_0035", parameters = Map("message" -> errorMsg)) + }, "_LEGACY_ERROR_TEMP_0035", parameters = Map("message" -> errorMsg)) } } } diff --git a/spark/src/test/scala/io/delta/tables/DeltaTableBuilderSuite.scala b/spark/src/test/scala/io/delta/tables/DeltaTableBuilderSuite.scala index cbeb5b24bc7..4cf9f43f558 100644 --- a/spark/src/test/scala/io/delta/tables/DeltaTableBuilderSuite.scala +++ b/spark/src/test/scala/io/delta/tables/DeltaTableBuilderSuite.scala @@ -492,10 +492,7 @@ class DeltaTableBuilderSuite .execute() } - checkError( - exception = e, - errorClass = "DELTA_CLUSTER_BY_WITH_PARTITIONED_BY" - ) + checkError(e, "DELTA_CLUSTER_BY_WITH_PARTITIONED_BY") } } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/ActionSerializerSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/ActionSerializerSuite.scala index e9121cd5ba0..48aa427a539 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/ActionSerializerSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/ActionSerializerSuite.scala @@ -235,7 +235,7 @@ class ActionSerializerSuite extends QueryTest with SharedSparkSession with Delta expectedJson = s"""{"protocol":{"minReaderVersion":$TABLE_FEATURES_MIN_READER_VERSION,""" + s""""minWriterVersion":$TABLE_FEATURES_MIN_WRITER_VERSION,""" + - """"readerFeatures":["testLegacyReaderWriter"],""" + + """"readerFeatures":[],""" + """"writerFeatures":["testLegacyReaderWriter"]}}""") testActionSerDe( @@ -248,7 +248,7 @@ class ActionSerializerSuite extends QueryTest with SharedSparkSession with Delta expectedJson = s"""{"protocol":{"minReaderVersion":$TABLE_FEATURES_MIN_READER_VERSION,""" + s""""minWriterVersion":$TABLE_FEATURES_MIN_WRITER_VERSION,""" + - """"readerFeatures":["testLegacyReaderWriter","testReaderWriter"],""" + + """"readerFeatures":["testReaderWriter"],""" + """"writerFeatures":["testLegacyReaderWriter","testReaderWriter"]}}""") testActionSerDe( diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/ConvertToDeltaSuiteBase.scala b/spark/src/test/scala/org/apache/spark/sql/delta/ConvertToDeltaSuiteBase.scala index 0bf3a379331..70a3c05ccca 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/ConvertToDeltaSuiteBase.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/ConvertToDeltaSuiteBase.scala @@ -188,7 +188,7 @@ trait ConvertToDeltaSuiteBase extends ConvertToDeltaSuiteBaseCommons } test("filter non-parquet file for schema inference when not using catalog schema") { - withTempDir(prefix = "spark") { dir => + withTempDir { dir => val tempDir = dir.getCanonicalPath writeFiles(tempDir + "/part=1/", Seq(1).toDF("corrupted_id"), format = "orc") writeFiles(tempDir + "/part=2/", Seq(2).toDF("id")) diff 
--git a/spark/src/test/scala/org/apache/spark/sql/delta/CustomCatalogSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/CustomCatalogSuite.scala index 43ec30d0b85..79305a230d0 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/CustomCatalogSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/CustomCatalogSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.delta import scala.collection.JavaConverters._ // scalastyle:off import.ordering.noEmptyLine -import org.apache.spark.sql.delta.catalog.DeltaTableV2 +import org.apache.spark.sql.delta.catalog.{DeltaCatalog, DeltaTableV2} import org.apache.spark.sql.delta.commands._ import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.hadoop.fs.{FileSystem, Path} @@ -34,7 +34,6 @@ import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan, Set import org.apache.spark.sql.catalyst.trees.UnaryLike import org.apache.spark.sql.connector.catalog.{DelegatingCatalogExtension, Identifier, InMemoryTable, InMemoryTableCatalog, Table, TableCatalog, TableChange, V1Table} import org.apache.spark.sql.connector.expressions.Transform -import org.apache.spark.sql.delta.catalog.DeltaCatalog import org.apache.spark.sql.execution.command.LeafRunnableCommand import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, V2SessionCatalog} import org.apache.spark.sql.test.SharedSparkSession @@ -297,6 +296,22 @@ class CustomCatalogSuite extends QueryTest with SharedSparkSession } } } + + test("custom catalog that generates location for managed tables") { + // Reset catalog manager so that the new `spark_catalog` implementation can apply. + spark.sessionState.catalogManager.reset() + withSQLConf("spark.sql.catalog.spark_catalog" -> classOf[DummySessionCatalog].getName) { + withTable("t") { + withTempPath { path => + sql(s"CREATE TABLE t (id LONG) USING delta TBLPROPERTIES (fakeLoc='$path')") + val t = spark.sessionState.catalogManager.v2SessionCatalog.asInstanceOf[TableCatalog] + .loadTable(Identifier.of(Array("default"), "t")) + // It should be a managed table. + assert(!t.properties().containsKey(TableCatalog.PROP_EXTERNAL)) + } + } + } + } } class DummyCatalog extends TableCatalog { @@ -397,9 +412,10 @@ class DummySessionCatalogInner extends DelegatingCatalogExtension { } // A dummy catalog that adds a layer between DeltaCatalog and the Spark SessionCatalog, -// to attach additional table storage properties after the table is loaded. +// to attach additional table storage properties after the table is loaded, and generates location +// for managed tables. 
class DummySessionCatalog extends TableCatalog { - private var deltaCatalog: DelegatingCatalogExtension = null + private var deltaCatalog: DeltaCatalog = null override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = { val inner = new DummySessionCatalogInner() @@ -422,7 +438,16 @@ class DummySessionCatalog extends TableCatalog { schema: StructType, partitions: Array[Transform], properties: java.util.Map[String, String]): Table = { - deltaCatalog.createTable(ident, schema, partitions, properties) + if (!properties.containsKey(TableCatalog.PROP_EXTERNAL) && + !properties.containsKey(TableCatalog.PROP_LOCATION)) { + val newProps = new java.util.HashMap[String, String] + newProps.putAll(properties) + newProps.put(TableCatalog.PROP_LOCATION, properties.get("fakeLoc")) + newProps.put(TableCatalog.PROP_IS_MANAGED_LOCATION, "true") + deltaCatalog.createTable(ident, schema, partitions, newProps) + } else { + deltaCatalog.createTable(ident, schema, partitions, properties) + } } override def alterTable(ident: Identifier, changes: TableChange*): Table = { diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaAlterTableTests.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaAlterTableTests.scala index 32098b9c1e4..fa966b36223 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaAlterTableTests.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaAlterTableTests.scala @@ -924,17 +924,17 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("a", map('v1, 'v2)) withDeltaTable(df) { tableName => checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN a.key COMMENT 'a comment'") }, - errorClass = "DELTA_UNSUPPORTED_COMMENT_MAP_ARRAY", + "DELTA_UNSUPPORTED_COMMENT_MAP_ARRAY", parameters = Map("fieldPath" -> "a.key") ) checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN a.value COMMENT 'a comment'") }, - errorClass = "DELTA_UNSUPPORTED_COMMENT_MAP_ARRAY", + "DELTA_UNSUPPORTED_COMMENT_MAP_ARRAY", parameters = Map("fieldPath" -> "a.value") ) } @@ -945,10 +945,10 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("a", array('v1)) withDeltaTable(df) { tableName => checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN a.element COMMENT 'a comment'") }, - errorClass = "DELTA_UNSUPPORTED_COMMENT_MAP_ARRAY", + "DELTA_UNSUPPORTED_COMMENT_MAP_ARRAY", parameters = Map("fieldPath" -> "a.element") ) } @@ -959,20 +959,20 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("a", map('v1, 'v2)) withDeltaTable(df) { tableName => checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"ALTER TABLE $tableName RENAME COLUMN a.key TO key2") }, - errorClass = "INVALID_FIELD_NAME", + "INVALID_FIELD_NAME", parameters = Map( "fieldName" -> "`a`.`key2`", "path" -> "`a`" ) ) checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"ALTER TABLE $tableName RENAME COLUMN a.value TO value2") }, - errorClass = "INVALID_FIELD_NAME", + "INVALID_FIELD_NAME", parameters = Map( "fieldName" -> "`a`.`value2`", "path" -> "`a`" @@ -986,10 +986,10 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("a", array('v1)) withDeltaTable(df) { tableName => 
checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"ALTER TABLE $tableName RENAME COLUMN a.element TO element2") }, - errorClass = "INVALID_FIELD_NAME", + "INVALID_FIELD_NAME", parameters = Map( "fieldName" -> "`a`.`element2`", "path" -> "`a`" @@ -1008,10 +1008,10 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { ddlTest("CHANGE COLUMN - incompatible") { withDeltaTable(Seq((1, "a"), (2, "b")).toDF("v1", "v2")) { tableName => checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN v1 v1 long") }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "v1", "oldField" -> "INT", @@ -1026,10 +1026,10 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("struct", struct("v1", "v2")) withDeltaTable(df) { tableName => checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN struct.v1 v1 long") }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "struct.v1", "oldField" -> "INT", @@ -1044,10 +1044,10 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("a", map('v1, 'v2)) withDeltaTable(df) { tableName => checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN a.key key long") }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "a.key", "oldField" -> "INT NOT NULL", @@ -1062,10 +1062,10 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("a", map('v1, 'v2)) withDeltaTable(df) { tableName => checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN a.value value long") }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "a.value", "oldField" -> "INT", @@ -1080,10 +1080,10 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { .withColumn("a", array('v1)) withDeltaTable(df) { tableName => checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"ALTER TABLE $tableName CHANGE COLUMN a.element element long") }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "a.element", "oldField" -> "INT", @@ -1383,8 +1383,8 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { // Changing the nullability of map/array fields is not allowed. 
var statement = s"ALTER TABLE $tableName CHANGE COLUMN m.key DROP NOT NULL" checkError( - exception = intercept[AnalysisException] { sql(statement) }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + intercept[AnalysisException] { sql(statement) }, + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "m.key", "oldField" -> "INT NOT NULL", @@ -1394,8 +1394,8 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { statement = s"ALTER TABLE $tableName CHANGE COLUMN m.value SET NOT NULL" checkError( - exception = intercept[AnalysisException] { sql(statement) }, - errorClass = "_LEGACY_ERROR_TEMP_2330", + intercept[AnalysisException] { sql(statement) }, + "_LEGACY_ERROR_TEMP_2330", parameters = Map( "fieldName" -> "m.value" ), @@ -1404,8 +1404,8 @@ trait DeltaAlterTableTests extends DeltaAlterTableTestBase { statement = s"ALTER TABLE $tableName CHANGE COLUMN a.element SET NOT NULL" checkError( - exception = intercept[AnalysisException] { sql(statement) }, - errorClass = "_LEGACY_ERROR_TEMP_2330", + intercept[AnalysisException] { sql(statement) }, + "_LEGACY_ERROR_TEMP_2330", parameters = Map( "fieldName" -> "a.element" ), diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCColumnMappingSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCColumnMappingSuite.scala index 4eb5a8cba80..465a748ba65 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCColumnMappingSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCColumnMappingSuite.scala @@ -539,7 +539,8 @@ trait DeltaCDCColumnMappingSuiteBase extends DeltaCDCSuiteBase "add column batch cdc read not blocked", "data type and nullability change batch cdc read blocked", "drop column batch cdc read blocked", - "rename column batch cdc read blocked" + "rename column batch cdc read blocked", + "filters with special characters in name should be pushed down" ) protected def assertBlocked( @@ -618,7 +619,7 @@ trait DeltaCDCColumnMappingSuiteBase extends DeltaCDCSuiteBase } // upgrade to name mode val protocol = deltaLog.snapshot.protocol - val (r, w) = if (protocol.supportsReaderFeatures || protocol.supportsWriterFeatures) { + val (r, w) = if (protocol.supportsTableFeatures) { (TableFeatureProtocolUtils.TABLE_FEATURES_MIN_READER_VERSION, TableFeatureProtocolUtils.TABLE_FEATURES_MIN_WRITER_VERSION) } else { @@ -649,6 +650,28 @@ trait DeltaCDCColumnMappingSuiteBase extends DeltaCDCSuiteBase EndingVersion(deltaLog.update().version.toString)).dropCDCFields, (0 until 10).map(_.toString).toDF("id").withColumn("value", col("id"))) } + + test("filters with special characters in name should be pushed down") { + val tblName = "tbl" + withTable(tblName) { + spark.range(end = 10).withColumn("id with space", col("id")) + .write.format("delta").saveAsTable(tblName) + + val plans = DeltaTestUtils.withAllPlansCaptured(spark) { + val res = cdcRead(new TableName(tblName), StartingVersion("0"), EndingVersion("1")) + .select("id with space", "_change_type") + .where(col("id with space") < lit(5)) + + assert(res.columns === Seq("id with space", "_change_type")) + checkAnswer( + res, + spark.range(end = 5) + .withColumn("_change_type", lit("insert"))) + } + assert(plans.map(_.executedPlan).toString + .contains("PushedFilters: [*IsNotNull(id with space), *LessThan(id with space,5)]")) + } + } } trait DeltaCDCColumnMappingScalaSuiteBase extends DeltaCDCColumnMappingSuiteBase { diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCSQLSuite.scala 
b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCSQLSuite.scala index f8bde8d239f..fd1d6c820c4 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCSQLSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaCDCSQLSuite.scala @@ -289,10 +289,10 @@ class DeltaCDCSQLSuite extends DeltaCDCSuiteBase with DeltaColumnMappingTestUtil withTable(tbl) { spark.range(10).write.format("delta").saveAsTable(tbl) checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"SELECT * FROM table_changes('$tbl', 0, id)") }, - errorClass = "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", + "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", parameters = Map("objectName" -> "`id`"), queryContext = Array(ExpectedContext( fragment = "id", @@ -308,9 +308,16 @@ class DeltaCDCSQLSuite extends DeltaCDCSuiteBase with DeltaColumnMappingTestUtil // We set CDC to be enabled by default, so this should automatically bump the writer protocol // to the required version. if (columnMappingEnabled) { - assert(log.snapshot.protocol == Protocol(2, 5)) + assert(log.update().protocol == Protocol(2, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature, + ColumnMappingTableFeature))) } else { - assert(log.snapshot.protocol == Protocol(1, 4)) + assert(log.update().protocol == Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature))) } } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingSuite.scala index 4359e5c562f..54ba60ba655 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingSuite.scala @@ -24,6 +24,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.sql.delta.DeltaOperations.ManualUpdate +import org.apache.spark.sql.delta.DeltaTestUtils.BOOLEAN_DOMAIN import org.apache.spark.sql.delta.actions.{Action, AddCDCFile, AddFile, Metadata => MetadataAction, Protocol, SetTransaction} import org.apache.spark.sql.delta.catalog.DeltaTableV2 import org.apache.spark.sql.delta.schema.SchemaMergingUtils @@ -482,12 +483,13 @@ class DeltaColumnMappingSuite extends QueryTest expectedSchema: StructType, ignorePhysicalName: Boolean, mode: String, - createNewTable: Boolean = true)(fn: => Unit): Unit = { + createNewTable: Boolean = true, + tableFeaturesProtocolExpected: Boolean = true)(fn: => Unit): Unit = { withTable(tableName) { fn checkProperties(tableName, readerVersion = 2, - writerVersion = 5, + writerVersion = if (tableFeaturesProtocolExpected) 7 else 5, mode = Some(mode), curMaxId = DeltaColumnMapping.findMaxColumnId(expectedSchema) ) @@ -826,7 +828,7 @@ class DeltaColumnMappingSuite extends QueryTest checkSchema("t1", schemaWithId) checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = DeltaColumnMapping.findMaxColumnId(schemaWithId) ) @@ -849,7 +851,7 @@ class DeltaColumnMappingSuite extends QueryTest checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = DeltaColumnMapping.findMaxColumnId(schemaWithIdNested)) checkSchema( @@ -871,7 +873,7 @@ class DeltaColumnMappingSuite extends QueryTest checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = 
curMaxId) @@ -886,7 +888,7 @@ class DeltaColumnMappingSuite extends QueryTest ) checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = curMaxId2) checkSchema("t1", @@ -938,7 +940,7 @@ class DeltaColumnMappingSuite extends QueryTest checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = curMaxId) checkSchema("t1", @@ -960,7 +962,7 @@ class DeltaColumnMappingSuite extends QueryTest ) checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = curMaxId2) checkSchema("t1", @@ -998,7 +1000,7 @@ class DeltaColumnMappingSuite extends QueryTest checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = curMaxId) checkSchema("t1", schemaWithId) @@ -1013,7 +1015,7 @@ class DeltaColumnMappingSuite extends QueryTest checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = curMaxId) @@ -1037,7 +1039,7 @@ class DeltaColumnMappingSuite extends QueryTest val curMaxId2 = DeltaColumnMapping.findMaxColumnId(schemaWithId) + 1 checkProperties("t1", readerVersion = 2, - writerVersion = 5, + writerVersion = 7, mode = Some(mode), curMaxId = curMaxId2) checkSchema("t1", schemaWithId.add("c", StringType, true, withId(3))) @@ -1627,7 +1629,8 @@ class DeltaColumnMappingSuite extends QueryTest schemaWithDottedColumnNames, false, "name", - createNewTable = false + createNewTable = false, + tableFeaturesProtocolExpected = false ) { sql(s"CREATE TABLE t1 (${schemaWithDottedColumnNames.toDDL}) USING DELTA") alterTableWithProps("t1", props = Map( @@ -1942,12 +1945,12 @@ class DeltaColumnMappingSuite extends QueryTest |TBLPROPERTIES('${DeltaConfigs.COLUMN_MAPPING_MODE.key}'='none') |""".stripMargin) } - val errorClass = "DELTA_INVALID_CHARACTERS_IN_COLUMN_NAMES" + val condition = "DELTA_INVALID_CHARACTERS_IN_COLUMN_NAMES" checkError( - exception = e, - errorClass = errorClass, + e, + condition, parameters = DeltaThrowableHelper - .getParameterNames(errorClass, errorSubClass = null) + .getParameterNames(condition, errorSubClass = null) .zip(invalidColumns).toMap ) } @@ -2044,4 +2047,48 @@ class DeltaColumnMappingSuite extends QueryTest } } } + + for (txnIntroducesMetadata <- BOOLEAN_DOMAIN) { + test("column mapping metadata are stripped when feature is disabled - " + + s"txnIntroducesMetadata=$txnIntroducesMetadata") { + withTempDir { dir => + val tablePath = dir.getCanonicalPath + val deltaLog = DeltaLog.forTable(spark, tablePath) + // Create the original table. + val schemaV0 = if (txnIntroducesMetadata) { + new StructType().add("id", LongType, true) + } else { + new StructType().add("id", LongType, true, withIdAndPhysicalName(0, "col-0")) + } + withSQLConf(DeltaSQLConf.DELTA_COLUMN_MAPPING_STRIP_METADATA.key -> "false") { + deltaLog.withNewTransaction(catalogTableOpt = None) { txn => + val metadata = actions.Metadata( + name = "testTable", + schemaString = schemaV0.json, + configuration = Map(DeltaConfigs.COLUMN_MAPPING_MODE.key -> NoMapping.name) + ) + txn.updateMetadata(metadata) + txn.commit(Seq.empty, ManualUpdate) + } + } + val metadataV0 = deltaLog.update().metadata + assert(DeltaColumnMapping.schemaHasColumnMappingMetadata(metadataV0.schema) === + !txnIntroducesMetadata) + + // Update the schema of the existing table. 
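
A minimal sketch of the per-field metadata that the strip-metadata path removes, and that withIdAndPhysicalName in this suite attaches; the delta.columnMapping.* metadata keys should be treated as an assumption here:

import org.apache.spark.sql.types._

val mappingMeta = new MetadataBuilder()
  .putLong("delta.columnMapping.id", 1L)
  .putString("delta.columnMapping.physicalName", "col-0")
  .build()
val withMapping = StructField("id", LongType, nullable = true, metadata = mappingMeta)

// When the table is not actually using column mapping, the transaction is expected to
// commit the field with this metadata stripped, i.e. equivalent to:
val stripped = withMapping.copy(metadata = Metadata.empty)
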
+ withSQLConf(DeltaSQLConf.DELTA_COLUMN_MAPPING_STRIP_METADATA.key -> "true") { + deltaLog.withNewTransaction(catalogTableOpt = None) { txn => + val schemaV1 = + schemaV0.add("value", LongType, true, withIdAndPhysicalName(0, "col-0")) + val metadata = metadataV0.copy(schemaString = schemaV1.json) + txn.updateMetadata(metadata) + txn.commit(Seq.empty, ManualUpdate) + } + val metadataV1 = deltaLog.update().metadata + assert(DeltaColumnMapping.schemaHasColumnMappingMetadata(metadataV1.schema) === + !txnIntroducesMetadata) + } + } + } + } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingTestUtils.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingTestUtils.scala index d6ad49e2c03..aced5b0ae1f 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingTestUtils.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaColumnMappingTestUtils.scala @@ -264,7 +264,7 @@ trait DeltaColumnMappingTestUtilsBase extends SharedSparkSession { Protocol.forNewTable(spark, Some(metadata)).minReaderVersion.toString), (Protocol.MIN_WRITER_VERSION_PROP, Protocol.forNewTable(spark, Some(metadata)).minWriterVersion.toString)) - if (snapshot.protocol.supportsReaderFeatures || snapshot.protocol.supportsWriterFeatures) { + if (snapshot.protocol.supportsTableFeatures) { props ++= Protocol.minProtocolComponentsFromAutomaticallyEnabledFeatures( spark, metadata, snapshot.protocol) diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLSuite.scala index 2304da97c16..a4fd227dabf 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.delta.schema.InvariantViolationException import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.spark.sql.delta.test.DeltaSQLTestUtils -import org.apache.hadoop.fs.Path +import org.apache.hadoop.fs.{Path, UnsupportedFileSystemException} import org.apache.spark.SparkEnv import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} @@ -41,6 +41,35 @@ class DeltaDDLSuite extends DeltaDDLTestBase with SharedSparkSession exception.getMessage.contains("Cannot change nullable column to non-nullable") } + test("protocol-related properties are not considered during duplicate table creation") { + def createTable(tableName: String, location: String): Unit = { + sql(s""" + |CREATE TABLE $tableName (id INT, val STRING) + |USING DELTA + |LOCATION '$location' + |TBLPROPERTIES ( + | 'delta.columnMapping.mode' = 'name', + | 'delta.minReaderVersion' = '2', + | 'delta.minWriterVersion' = '5' + |)""".stripMargin + ) + } + withTempDir { dir => + val table1 = "t1" + val table2 = "t2" + withTable(table1, table2) { + withSQLConf(DeltaSQLConf.DELTA_UPDATE_CATALOG_ENABLED.key -> "true") { + createTable(table1, dir.getCanonicalPath) + createTable(table2, dir.getCanonicalPath) + val catalogTable1 = spark.sessionState.catalog.getTableMetadata(TableIdentifier(table1)) + val catalogTable2 = spark.sessionState.catalog.getTableMetadata(TableIdentifier(table2)) + assert(catalogTable1.properties("delta.columnMapping.mode") == "name") + assert(catalogTable2.properties("delta.columnMapping.mode") == "name") + } + } + } + } + test("table creation with ambiguous paths only allowed with legacy flag") { // ambiguous paths not allowed 
withTempDir { foo => @@ -84,6 +113,26 @@ class DeltaDDLSuite extends DeltaDDLTestBase with SharedSparkSession assert(spark.table("t").collect().isEmpty) } } + + test("CREATE TABLE with OPTIONS") { + withTempPath { path => + spark.range(10).write.format("delta").save(path.getCanonicalPath) + withTable("t") { + def createTableWithOptions(simulateUC: Boolean): Unit = { + sql( + s""" + |CREATE TABLE t USING delta LOCATION 'fake://${path.getCanonicalPath}' + |${if (simulateUC) "TBLPROPERTIES (test.simulateUC=true)" else ""} + |OPTIONS ( + | fs.fake.impl='${classOf[FakeFileSystem].getName}', + | fs.fake.impl.disable.cache=true) + |""".stripMargin) + } + intercept[UnsupportedFileSystemException](createTableWithOptions(false)) + createTableWithOptions(true) + } + } + } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLUsingPathSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLUsingPathSuite.scala index 6b28baaf392..8dba331eb03 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLUsingPathSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDDLUsingPathSuite.scala @@ -169,25 +169,25 @@ trait DeltaDDLUsingPathTests extends QueryTest "key" -> "value") } + val protocol = Protocol.forNewTable(spark, Some(metadata)) + val supportedFeatures = protocol + .readerAndWriterFeatureNames + .map(name => s"delta.feature.$name" -> "supported") + val expectedProperties = Seq( + "delta.logRetentionDuration" -> "2 weeks", + "delta.minReaderVersion" -> protocol.minReaderVersion.toString, + "delta.minWriterVersion" -> protocol.minWriterVersion.toString, + "key" -> "value") ++ supportedFeatures + checkDatasetUnorderly( dropColumnMappingConfigurations( sql(s"SHOW TBLPROPERTIES $table").as[(String, String)]), - "delta.logRetentionDuration" -> "2 weeks", - "delta.minReaderVersion" -> - Protocol.forNewTable(spark, Some(metadata)).minReaderVersion.toString, - "delta.minWriterVersion" -> - Protocol.forNewTable(spark, Some(metadata)).minWriterVersion.toString, - "key" -> "value") + expectedProperties: _*) checkDatasetUnorderly( dropColumnMappingConfigurations( sql(s"SHOW TBLPROPERTIES delta.`$path`").as[(String, String)]), - "delta.logRetentionDuration" -> "2 weeks", - "delta.minReaderVersion" -> - Protocol.forNewTable(spark, Some(metadata)).minReaderVersion.toString, - "delta.minWriterVersion" -> - Protocol.forNewTable(spark, Some(metadata)).minWriterVersion.toString, - "key" -> "value") + expectedProperties: _*) if (table == "`delta_test`") { val tableName = s"$catalogName.default.delta_test" diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDataFrameWriterV2Suite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDataFrameWriterV2Suite.scala index ac398604640..9c579f0d524 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDataFrameWriterV2Suite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDataFrameWriterV2Suite.scala @@ -677,8 +677,8 @@ class DeltaDataFrameWriterV2Suite def verifyNotImplicitCasting(f: => Unit): Unit = { val e = intercept[DeltaAnalysisException](f) checkError( - exception = e.getCause.asInstanceOf[DeltaAnalysisException], - errorClass = "DELTA_MERGE_INCOMPATIBLE_DATATYPE", + e.getCause.asInstanceOf[DeltaAnalysisException], + "DELTA_MERGE_INCOMPATIBLE_DATATYPE", parameters = Map("currentDataType" -> "LongType", "updateDataType" -> "IntegerType")) } verifyNotImplicitCasting { diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDropColumnSuite.scala 
b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDropColumnSuite.scala index 73356c348ae..ece065f4ab0 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDropColumnSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaDropColumnSuite.scala @@ -450,10 +450,10 @@ class DeltaDropColumnSuite extends QueryTest field <- Seq("m.key", "m.value", "a.element") } checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"ALTER TABLE delta_test DROP COLUMN $field") }, - errorClass = "DELTA_UNSUPPORTED_DROP_NESTED_COLUMN_FROM_NON_STRUCT_TYPE", + "DELTA_UNSUPPORTED_DROP_NESTED_COLUMN_FROM_NON_STRUCT_TYPE", parameters = Map( "struct" -> "IntegerType" ) diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala index b5e8b069f8f..c94b4a884d9 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaErrorsSuite.scala @@ -477,12 +477,12 @@ trait DeltaErrorsSuiteBase Some(s"Delta table $table doesn't exist.")) } checkError( - exception = intercept[DeltaIllegalStateException] { + intercept[DeltaIllegalStateException] { throw DeltaErrors.differentDeltaTableReadByStreamingSource( newTableId = "027fb01c-94aa-4cab-87cb-5aab6aec6d17", oldTableId = "2edf2c02-bb63-44e9-a84c-517fad0db296") }, - errorClass = "DIFFERENT_DELTA_TABLE_READ_BY_STREAMING_SOURCE", + "DIFFERENT_DELTA_TABLE_READ_BY_STREAMING_SOURCE", parameters = Map( "oldTableId" -> "2edf2c02-bb63-44e9-a84c-517fad0db296", "newTableId" -> "027fb01c-94aa-4cab-87cb-5aab6aec6d17") @@ -961,12 +961,12 @@ trait DeltaErrorsSuiteBase SchemaMergingUtils.mergeSchemas(s1, s2) } checkError( - exception = e, - errorClass = "DELTA_FAILED_TO_MERGE_FIELDS", + e, + "DELTA_FAILED_TO_MERGE_FIELDS", parameters = Map("currentField" -> "c0", "updateField" -> "c0")) checkError( - exception = e.getCause.asInstanceOf[DeltaAnalysisException], - errorClass = "DELTA_MERGE_INCOMPATIBLE_DATATYPE", + e.getCause.asInstanceOf[DeltaAnalysisException], + "DELTA_MERGE_INCOMPATIBLE_DATATYPE", parameters = Map("currentDataType" -> "IntegerType", "updateDataType" -> "StringType")) } { @@ -997,13 +997,13 @@ trait DeltaErrorsSuiteBase } { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { throw DeltaErrors.alterTableChangeColumnException( fieldPath = "a.b.c", oldField = StructField("c", IntegerType), newField = StructField("c", LongType)) }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "a.b.c", "oldField" -> "INT", @@ -1421,14 +1421,14 @@ trait DeltaErrorsSuiteBase } { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { throw DeltaErrors.constraintDataTypeMismatch( columnPath = Seq("a", "x"), columnType = ByteType, dataType = IntegerType, constraints = Map("ck1" -> "a > 0", "ck2" -> "hash(b) > 0")) }, - errorClass = "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", + "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", parameters = Map( "columnName" -> "a.x", "columnType" -> "TINYINT", @@ -1438,7 +1438,7 @@ trait DeltaErrorsSuiteBase } { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { throw DeltaErrors.generatedColumnsDataTypeMismatch( columnPath = Seq("a", "x"), columnType = ByteType, @@ -1448,7 +1448,7 @@ trait 
DeltaErrorsSuiteBase "gen2" -> "3 + a . x" )) }, - errorClass = "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", + "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", parameters = Map( "columnName" -> "a.x", "columnType" -> "TINYINT", @@ -1916,10 +1916,10 @@ trait DeltaErrorsSuiteBase } { checkError( - exception = intercept[DeltaIllegalStateException] { + intercept[DeltaIllegalStateException] { throw MaterializedRowId.missingMetadataException("table_name") }, - errorClass = "DELTA_MATERIALIZED_ROW_TRACKING_COLUMN_NAME_MISSING", + "DELTA_MATERIALIZED_ROW_TRACKING_COLUMN_NAME_MISSING", parameters = Map( "rowTrackingColumn" -> "Row ID", "tableName" -> "table_name" @@ -1928,10 +1928,10 @@ trait DeltaErrorsSuiteBase } { checkError( - exception = intercept[DeltaIllegalStateException] { + intercept[DeltaIllegalStateException] { throw MaterializedRowCommitVersion.missingMetadataException("table_name") }, - errorClass = "DELTA_MATERIALIZED_ROW_TRACKING_COLUMN_NAME_MISSING", + "DELTA_MATERIALIZED_ROW_TRACKING_COLUMN_NAME_MISSING", parameters = Map( "rowTrackingColumn" -> "Row Commit Version", "tableName" -> "table_name" diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoImplicitCastSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoImplicitCastSuite.scala new file mode 100644 index 00000000000..c014feb228f --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoImplicitCastSuite.scala @@ -0,0 +1,214 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta + +import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.SaveMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ + +/** + * Test suite covering implicit casting in INSERT operations when the type of the data to insert + * doesn't match the type in Delta table. + * + * The casting behavior is (unfortunately) dependent on the API used to run the INSERT, e.g. + * Dataframe V1 insertInto() vs V2 saveAsTable() or using SQL. + * This suite intends to exhaustively cover all the ways INSERT can be run on a Delta table. See + * [[DeltaInsertIntoTest]] for a list of these INSERT operations covered. 
+ */ +class DeltaInsertIntoImplicitCastSuite extends DeltaInsertIntoTest { + + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS.key, "true") + spark.conf.set(SQLConf.ANSI_ENABLED.key, "true") + } + + for (schemaEvolution <- BOOLEAN_DOMAIN) { + testInserts("insert with implicit up and down cast on top-level fields, " + + s"schemaEvolution=$schemaEvolution")( + initialSchemaDDL = "a long, b int", + initialJsonData = Seq("""{ "a": 1, "b": 2 }"""), + partitionBy = Seq("a"), + overwriteWhere = "a" -> 1, + insertSchemaDDL = "a int, b long", + insertJsonData = Seq("""{ "a": 1, "b": 4 }"""), + expectedResult = ExpectedResult.Success( + expectedSchema = new StructType() + .add("a", LongType) + .add("b", IntegerType)), + // The following insert operations don't implicitly cast the data but fail instead - see + // following test covering failure for these cases. We should change this to offer consistent + // behavior across all inserts. + excludeInserts = Seq( + DFv1SaveAsTable(SaveMode.Append), + DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), + DFv2Append, + DFv2Overwrite, + DFv2OverwritePartition + ), + confs = Seq(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> schemaEvolution.toString) + ) + + testInserts("insert with implicit up and down cast on top-level fields, " + + s"schemaEvolution=$schemaEvolution")( + initialSchemaDDL = "a long, b int", + initialJsonData = Seq("""{ "a": 1, "b": 2 }"""), + partitionBy = Seq("a"), + overwriteWhere = "a" -> 1, + insertSchemaDDL = "a int, b long", + insertJsonData = Seq("""{ "a": 1, "b": 4 }"""), + expectedResult = ExpectedResult.Failure(ex => { + checkError( + ex, + "DELTA_FAILED_TO_MERGE_FIELDS", + parameters = Map( + "currentField" -> "a", + "updateField" -> "a" + )) + }), + includeInserts = Seq( + DFv1SaveAsTable(SaveMode.Append), + DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), + DFv2Append, + DFv2Overwrite, + DFv2OverwritePartition + ), + confs = Seq(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> schemaEvolution.toString) + ) + + testInserts("insert with implicit up and down cast on fields nested in array, " + + s"schemaEvolution=$schemaEvolution")( + initialSchemaDDL = "key int, a array>", + initialJsonData = Seq("""{ "key": 1, "a": [ { "x": 1, "y": 2 } ] }"""), + partitionBy = Seq("key"), + overwriteWhere = "key" -> 1, + insertSchemaDDL = "key int, a array>", + insertJsonData = Seq("""{ "key": 1, "a": [ { "x": 3, "y": 4 } ] }"""), + expectedResult = ExpectedResult.Success( + expectedSchema = new StructType() + .add("key", IntegerType) + .add("a", ArrayType(new StructType() + .add("x", LongType) + .add("y", IntegerType, nullable = true)))), + // The following insert operations don't implicitly cast the data but fail instead - see + // following test covering failure for these cases. We should change this to offer consistent + // behavior across all inserts. 
+ excludeInserts = Seq( + DFv1SaveAsTable(SaveMode.Append), + DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), + DFv2Append, + DFv2Overwrite, + DFv2OverwritePartition + ), + confs = Seq(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> schemaEvolution.toString) + ) + + testInserts("insert with implicit up and down cast on fields nested in array, " + + s"schemaEvolution=$schemaEvolution")( + initialSchemaDDL = "key int, a array>", + initialJsonData = Seq("""{ "key": 1, "a": [ { "x": 1, "y": 2 } ] }"""), + partitionBy = Seq("key"), + overwriteWhere = "key" -> 1, + insertSchemaDDL = "key int, a array>", + insertJsonData = Seq("""{ "key": 1, "a": [ { "x": 3, "y": 4 } ] }"""), + expectedResult = ExpectedResult.Failure(ex => { + checkError( + ex, + "DELTA_FAILED_TO_MERGE_FIELDS", + parameters = Map( + "currentField" -> "a", + "updateField" -> "a" + )) + }), + includeInserts = Seq( + DFv1SaveAsTable(SaveMode.Append), + DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), + DFv2Append, + DFv2Overwrite, + DFv2OverwritePartition + ), + confs = Seq(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> schemaEvolution.toString) + ) + + testInserts("insert with implicit up and down cast on fields nested in map, " + + s"schemaEvolution=$schemaEvolution")( + initialSchemaDDL = "key int, m map>", + initialJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 1, "y": 2 } } }"""), + partitionBy = Seq("key"), + overwriteWhere = "key" -> 1, + insertSchemaDDL = "key int, m map>", + insertJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 3, "y": 4 } } }"""), + expectedResult = ExpectedResult.Success( + expectedSchema = new StructType() + .add("key", IntegerType) + .add("m", MapType(StringType, new StructType() + .add("x", LongType) + .add("y", IntegerType)))), + // The following insert operations don't implicitly cast the data but fail instead - see + // following test covering failure for these cases. We should change this to offer consistent + // behavior across all inserts. 
+ excludeInserts = Seq( + DFv1SaveAsTable(SaveMode.Append), + DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), + DFv2Append, + DFv2Overwrite, + DFv2OverwritePartition + ), + confs = Seq(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> schemaEvolution.toString) + ) + + testInserts("insert with implicit up and down cast on fields nested in map, " + + s"schemaEvolution=$schemaEvolution")( + initialSchemaDDL = "key int, m map>", + initialJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 1, "y": 2 } } }"""), + partitionBy = Seq("key"), + overwriteWhere = "key" -> 1, + insertSchemaDDL = "key int, m map>", + insertJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 3, "y": 4 } } }"""), + expectedResult = ExpectedResult.Failure(ex => { + checkError( + ex, + "DELTA_FAILED_TO_MERGE_FIELDS", + parameters = Map( + "currentField" -> "m", + "updateField" -> "m" + )) + }), + includeInserts = Seq( + DFv1SaveAsTable(SaveMode.Append), + DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), + DFv2Append, + DFv2Overwrite, + DFv2OverwritePartition + ), + confs = Seq(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> schemaEvolution.toString) + ) + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTableSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTableSuite.scala index b40fa7adf65..18acbc09e0f 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTableSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTableSuite.scala @@ -213,10 +213,10 @@ class DeltaInsertIntoSQLSuite withTable("t") { sql(s"CREATE TABLE t(i STRING, c string) USING $v2Format PARTITIONED BY (c)") checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql("INSERT OVERWRITE t PARTITION (c='1') (c) VALUES ('2')") }, - errorClass = "STATIC_PARTITION_COLUMN_IN_INSERT_COLUMN_LIST", + "STATIC_PARTITION_COLUMN_IN_INSERT_COLUMN_LIST", parameters = Map("staticName" -> "c")) } } @@ -596,22 +596,22 @@ class DeltaColumnDefaultsInsertSuite extends InsertIntoSQLOnlyTests with DeltaSQ // The table feature is not enabled via TBLPROPERTIES. 
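For contrast with the negative cases below, where the column-defaults feature is deliberately left off: when the feature is declared up front (the suite's `tblPropertiesAllowDefaults` constant does exactly that), the same kind of DDL is expected to succeed and omitted columns pick up their defaults. A small sketch reusing the suite's own `v2Format` and `tblPropertiesAllowDefaults` names; the table name and the follow-up insert are illustrative, not part of the patch:

```scala
// Hypothetical positive counterpart to the tests below.
withTable("createTableWithDefaultFeatureEnabled") {
  sql(s"create table createTableWithDefaultFeatureEnabled(" +
    s"i boolean, s bigint, q int default 42) using $v2Format " +
    s"partitioned by (i) $tblPropertiesAllowDefaults")
  // Column q is omitted from the column list, so its declared default is used.
  sql("insert into createTableWithDefaultFeatureEnabled(i, s) values (true, 41)")
  checkAnswer(
    sql("select q from createTableWithDefaultFeatureEnabled"),
    Row(42))
}
```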
withTable("createTableWithDefaultFeatureNotEnabled") { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(s"create table createTableWithDefaultFeatureNotEnabled(" + s"i boolean, s bigint, q int default 42) using $v2Format " + "partitioned by (i)") }, - errorClass = "WRONG_COLUMN_DEFAULTS_FOR_DELTA_FEATURE_NOT_ENABLED", + "WRONG_COLUMN_DEFAULTS_FOR_DELTA_FEATURE_NOT_ENABLED", parameters = Map("commandType" -> "CREATE TABLE") ) } withTable("alterTableSetDefaultFeatureNotEnabled") { sql(s"create table alterTableSetDefaultFeatureNotEnabled(a int) using $v2Format") checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("alter table alterTableSetDefaultFeatureNotEnabled alter column a set default 42") }, - errorClass = "WRONG_COLUMN_DEFAULTS_FOR_DELTA_FEATURE_NOT_ENABLED", + "WRONG_COLUMN_DEFAULTS_FOR_DELTA_FEATURE_NOT_ENABLED", parameters = Map("commandType" -> "ALTER TABLE") ) } @@ -620,19 +620,19 @@ class DeltaColumnDefaultsInsertSuite extends InsertIntoSQLOnlyTests with DeltaSQ sql(s"create table alterTableTest(i boolean, s bigint, q int default 42) using $v2Format " + s"partitioned by (i) $tblPropertiesAllowDefaults") checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("alter table alterTableTest add column z int default 42") }, - errorClass = "WRONG_COLUMN_DEFAULTS_FOR_DELTA_ALTER_TABLE_ADD_COLUMN_NOT_SUPPORTED" + "WRONG_COLUMN_DEFAULTS_FOR_DELTA_ALTER_TABLE_ADD_COLUMN_NOT_SUPPORTED" ) } // The default value fails to analyze. checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"create table t4 (s int default badvalue) using $v2Format " + s"$tblPropertiesAllowDefaults") }, - errorClass = INVALID_COLUMN_DEFAULT_VALUE_ERROR_MSG, + INVALID_COLUMN_DEFAULT_VALUE_ERROR_MSG, parameters = Map( "statement" -> "CREATE TABLE", "colName" -> "`s`", @@ -642,11 +642,11 @@ class DeltaColumnDefaultsInsertSuite extends InsertIntoSQLOnlyTests with DeltaSQ // The error message reports that we failed to execute the command because subquery // expressions are not allowed in DEFAULT values. checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"create table t4 (s int default (select min(x) from badtable)) using $v2Format " + tblPropertiesAllowDefaults) }, - errorClass = "INVALID_DEFAULT_VALUE.SUBQUERY_EXPRESSION", + "INVALID_DEFAULT_VALUE.SUBQUERY_EXPRESSION", parameters = Map( "statement" -> "CREATE TABLE", "colName" -> "`s`", @@ -656,22 +656,22 @@ class DeltaColumnDefaultsInsertSuite extends InsertIntoSQLOnlyTests with DeltaSQ // The error message reports that we failed to execute the command because subquery // expressions are not allowed in DEFAULT values. checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"create table t4 (s int default (select 42 as alias)) using $v2Format " + tblPropertiesAllowDefaults) }, - errorClass = "INVALID_DEFAULT_VALUE.SUBQUERY_EXPRESSION", + "INVALID_DEFAULT_VALUE.SUBQUERY_EXPRESSION", parameters = Map( "statement" -> "CREATE TABLE", "colName" -> "`s`", "defaultValue" -> "(select 42 as alias)")) // The default value parses but the type is not coercible. 
checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"create table t4 (s bigint default false) " + s"using $v2Format $tblPropertiesAllowDefaults") }, - errorClass = "INVALID_DEFAULT_VALUE.DATA_TYPE", + "INVALID_DEFAULT_VALUE.DATA_TYPE", parameters = Map( "statement" -> "CREATE TABLE", "colName" -> "`s`", @@ -702,11 +702,11 @@ class DeltaColumnDefaultsInsertSuite extends InsertIntoSQLOnlyTests with DeltaSQ // Column default values are disabled per configuration in general. withSQLConf(SQLConf.ENABLE_DEFAULT_COLUMNS.key -> "false") { checkError( - exception = intercept[ParseException] { + intercept[ParseException] { sql(s"create table t4 (s int default 41 + 1) using $v2Format " + tblPropertiesAllowDefaults) }, - errorClass = "UNSUPPORTED_DEFAULT_VALUE.WITH_SUGGESTION", + "UNSUPPORTED_DEFAULT_VALUE.WITH_SUGGESTION", parameters = Map.empty, context = ExpectedContext(fragment = "s int default 41 + 1", start = 17, stop = 36)) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTest.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTest.scala new file mode 100644 index 00000000000..3145ef6f8c7 --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaInsertIntoTest.scala @@ -0,0 +1,291 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta + +import org.apache.spark.sql.delta.test.DeltaSQLCommandTest + +import org.apache.spark.{DebugFilesystem, SparkThrowable} +import org.apache.spark.sql.{DataFrame, QueryTest, SaveMode} +import org.apache.spark.sql.catalyst.TableIdentifier +import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.streaming.Trigger +import org.apache.spark.sql.types.StructType + +/** + * There are **many** different ways to run an insert: + * - Using SQL, the dataframe v1 and v2 APIs or the streaming API. + * - Append vs. Overwrite / Partition overwrite. + * - Position-based vs. name-based resolution. + * + * Each take a unique path through analysis. The abstractions below captures these different + * inserts to allow more easily running tests with all or a subset of them. + */ +trait DeltaInsertIntoTest extends QueryTest with DeltaDMLTestUtils with DeltaSQLCommandTest { + + /** + * Represents one way of inserting data into a Delta table. + * @param mode Append or Overwrite. This dictates in particular what the expected result after the + * insert should be. + * @param name A human-readable name for the insert type displayed in the test names. + */ + trait Insert { + val mode: SaveMode + val name: String + + /** + * The method that tests will call to run the insert. Each type of insert must implement its + * specific way to run insert. + */ + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit + + /** SQL keyword for this type of insert. 
*/ + def intoOrOverwrite: String = if (mode == SaveMode.Append) "INTO" else "OVERWRITE" + + /** The expected content of the table after the insert. */ + def expectedResult(initialDF: DataFrame, insertedDF: DataFrame): DataFrame = + if (mode == SaveMode.Overwrite) insertedDF + else initialDF.unionByName(insertedDF, allowMissingColumns = true) + } + + /** INSERT INTO/OVERWRITE */ + case class SQLInsertByPosition(mode: SaveMode) extends Insert { + val name: String = s"INSERT $intoOrOverwrite" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = + sql(s"INSERT $intoOrOverwrite target SELECT * FROM source") + } + + /** INSERT INTO/OVERWRITE (a, b) */ + case class SQLInsertColList(mode: SaveMode) extends Insert { + val name: String = s"INSERT $intoOrOverwrite (columns) - $mode" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + val colList = columns.mkString(", ") + sql(s"INSERT $intoOrOverwrite target ($colList) SELECT $colList FROM source") + } + } + + /** INSERT INTO/OVERWRITE BY NAME */ + case class SQLInsertByName(mode: SaveMode) extends Insert { + val name: String = s"INSERT $intoOrOverwrite BY NAME - $mode" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = + sql(s"INSERT $intoOrOverwrite target SELECT ${columns.mkString(", ")} FROM source") + } + + /** INSERT INTO REPLACE WHERE */ + object SQLInsertOverwriteReplaceWhere extends Insert { + val mode: SaveMode = SaveMode.Overwrite + val name: String = s"INSERT INTO REPLACE WHERE" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = + sql(s"INSERT INTO target REPLACE WHERE $whereCol = $whereValue " + + s"SELECT ${columns.mkString(", ")} FROM source") + } + + /** INSERT OVERWRITE PARTITION (part = 1) */ + object SQLInsertOverwritePartitionByPosition extends Insert { + val mode: SaveMode = SaveMode.Overwrite + val name: String = s"INSERT OVERWRITE PARTITION (partition)" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + val assignments = columns.filterNot(_ == whereCol).mkString(", ") + sql(s"INSERT OVERWRITE target PARTITION ($whereCol = $whereValue) " + + s"SELECT $assignments FROM source") + } + } + + /** INSERT OVERWRITE PARTITION (part = 1) (a, b) */ + object SQLInsertOverwritePartitionColList extends Insert { + val mode: SaveMode = SaveMode.Overwrite + val name: String = s"INSERT OVERWRITE PARTITION (partition) (columns)" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + val assignments = columns.filterNot(_ == whereCol).mkString(", ") + sql(s"INSERT OVERWRITE target PARTITION ($whereCol = $whereValue) ($assignments) " + + s"SELECT $assignments FROM source") + } + } + + /** df.write.mode(mode).insertInto() */ + case class DFv1InsertInto(mode: SaveMode) extends Insert { + val name: String = s"DFv1 insertInto() - $mode" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = + spark.read.table("source").write.mode(mode).insertInto("target") + } + + /** df.write.mode(mode).saveAsTable() */ + case class DFv1SaveAsTable(mode: SaveMode) extends Insert { + val name: String = s"DFv1 saveAsTable() - $mode" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + spark.read.table("source").write.mode(mode).format("delta").saveAsTable("target") + } + } + + /** df.write.mode(mode).save() */ + case class DFv1Save(mode: SaveMode) extends Insert { + val name: String = s"DFv1 save() - $mode" + def runInsert(columns: 
Seq[String], whereCol: String, whereValue: Int): Unit = { + val deltaLog = DeltaLog.forTable(spark, TableIdentifier("target")) + spark.read.table("source").write.mode(mode).format("delta").save(deltaLog.dataPath.toString) + } + } + + /** df.writeTo.append() */ + object DFv2Append extends Insert { self: Insert => + val mode: SaveMode = SaveMode.Append + val name: String = "DFv2 append()" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + spark.read.table("source").writeTo("target").append() + } + } + + /** df.writeTo.overwrite() */ + object DFv2Overwrite extends Insert { self: Insert => + val mode: SaveMode = SaveMode.Overwrite + val name: String = s"DFv2 overwrite()" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + spark.read.table("source").writeTo("target").overwrite(col(whereCol) === lit(whereValue)) + } + } + + /** df.writeTo.overwritePartitions() */ + object DFv2OverwritePartition extends Insert { self: Insert => + override val mode: SaveMode = SaveMode.Overwrite + val name: String = s"DFv2 overwritePartitions()" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + spark.read.table("source").writeTo("target").overwritePartitions() + } + } + + /** df.writeStream.toTable() */ + object StreamingInsert extends Insert { self: Insert => + override val mode: SaveMode = SaveMode.Append + val name: String = s"Streaming toTable()" + def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { + val tablePath = DeltaLog.forTable(spark, TableIdentifier("target")).dataPath + val query = spark.readStream + .table("source") + .writeStream + .option("checkpointLocation", tablePath.toString) + .format("delta") + .trigger(Trigger.AvailableNow()) + .toTable("target") + query.processAllAvailable() + } + } + + /** Collects all the types of insert previously defined. */ + protected lazy val allInsertTypes: Seq[Insert] = Seq( + SQLInsertOverwriteReplaceWhere, + SQLInsertOverwritePartitionByPosition, + SQLInsertOverwritePartitionColList, + DFv2Append, + DFv2Overwrite, + DFv2OverwritePartition, + StreamingInsert + ) ++ (for { + mode: SaveMode <- Seq(SaveMode.Append, SaveMode.Overwrite) + insert: Insert <- Seq( + SQLInsertByPosition(mode), + SQLInsertColList(mode), + SQLInsertByName(mode), + DFv1InsertInto(mode), + DFv1SaveAsTable(mode), + DFv1Save(mode) + ) + } yield insert) + + /** + * Represents the expected result after running an insert operation in `testInserts()` below. + * Either: + * - Success: the table schema after the operation is checked against the expected schema. + * `testInserts()` also validates the data, though it's able to infer the expected data from the + * test inputs. + * - Failure: an exception is thrown and the caller passes a function to check that it matches an + * expected error. + */ + type ExpectedResult = Either[StructType, SparkThrowable => Unit] + object ExpectedResult { + def Success(expectedSchema: StructType): ExpectedResult = Left(expectedSchema) + def Failure(checkError: SparkThrowable => Unit): ExpectedResult = Right(checkError) + } + + /** + * Test runner to cover INSERT operations defined above. + * @param name Test name + * @param initialSchemaDDL Initial schema of the table to be inserted into (as a DDL string). + * @param initialJsonData Initial data present in the table to be inserted into (as a JSON + * string). + * @param partitionBy Partition columns for the initial table. 
+ * @param insertSchemaDDL Schema of the data to be inserted (as a DDL string). + * @param insertJsonData Data to be inserted (as a JSON string) + * @param overwriteWhere Where clause for overwrite PARTITION / REPLACE WHERE (as + * colName -> value) + * @param expectedResult Expected result, see [[ExpectedResult]] above. + * @param includeInserts List of insert types to run the test with. Defaults to all inserts. + * @param excludeInserts List of insert types to exclude when running the test. Defaults to no + * inserts excluded. + * @param confs Custom spark confs to set before running the insert operation. + */ + // scalastyle:off argcount + def testInserts(name: String)( + initialSchemaDDL: String, + initialJsonData: Seq[String], + partitionBy: Seq[String] = Seq.empty, + insertSchemaDDL: String, + insertJsonData: Seq[String], + overwriteWhere: (String, Int), + expectedResult: ExpectedResult, + includeInserts: Seq[Insert] = allInsertTypes, + excludeInserts: Seq[Insert] = Seq.empty, + confs: Seq[(String, String)] = Seq.empty): Unit = { + for (insert <- includeInserts.filterNot(excludeInserts.toSet)) { + test(s"${insert.name} - $name") { + withTable("source", "target") { + val initialDF = readFromJSON(initialJsonData, StructType.fromDDL(initialSchemaDDL)) + val writer = initialDF.write.format("delta") + if (partitionBy.nonEmpty) { + writer.partitionBy(partitionBy: _*) + } + writer.saveAsTable("target") + // Write the data to insert to a table so that we can use it in both SQL and dataframe + // writer inserts. + val insertDF = readFromJSON(insertJsonData, StructType.fromDDL(insertSchemaDDL)) + insertDF.write.format("delta").saveAsTable("source") + + def runInsert(): Unit = + insert.runInsert( + columns = insertDF.schema.map(_.name), + whereCol = overwriteWhere._1, + whereValue = overwriteWhere._2 + ) + + withSQLConf(confs: _*) { + expectedResult match { + case Left(expectedSchema) => + runInsert() + val target = spark.read.table("target") + assert(target.schema === expectedSchema) + checkAnswer(target, insert.expectedResult(initialDF, insertDF)) + case Right(checkError) => + val ex = intercept[SparkThrowable] { + runInsert() + } + checkError(ex) + } + } + } + } + } + } + // scalastyle:on argcount +} diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogMinorCompactionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogMinorCompactionSuite.scala index a2096edcc9c..af1bda7f03e 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogMinorCompactionSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogMinorCompactionSuite.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.delta.coordinatedcommits.CoordinatedCommitsBaseSuite import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.spark.sql.delta.test.DeltaSQLTestUtils +import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.{DeltaCommitFileProvider, FileNames} import org.apache.hadoop.fs.Path diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogSuite.scala index aa6e25c971f..93a7e275378 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaLogSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.delta import java.io.{BufferedReader, File, InputStreamReader, IOException} import 
java.nio.charset.StandardCharsets -import java.util.Locale +import java.util.{Locale, Optional} import scala.collection.JavaConverters._ import scala.language.postfixOps @@ -34,6 +34,7 @@ import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.{FileNames, JsonUtils} import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.module.scala.DefaultScalaModule +import io.delta.storage.commit.TableDescriptor import org.apache.hadoop.fs.Path import org.apache.hadoop.fs.permission.FsPermission @@ -506,8 +507,9 @@ class DeltaLogSuite extends QueryTest // file. val oc = CommitCoordinatorProvider.getCommitCoordinatorClient( "tracking-in-memory", Map.empty[String, String], spark) - val commitResponse = oc.getCommits( - deltaLog.logPath, Map.empty[String, String].asJava, 2, null) + val tableDesc = + new TableDescriptor(deltaLog.logPath, Optional.empty(), Map.empty[String, String].asJava) + val commitResponse = oc.getCommits(tableDesc, 2, null) if (!commitResponse.getCommits.isEmpty) { val path = commitResponse.getCommits.asScala.last.getFileStatus.getPath fs.delete(path, true) @@ -619,8 +621,9 @@ class DeltaLogSuite extends QueryTest // file. val oc = CommitCoordinatorProvider.getCommitCoordinatorClient( "tracking-in-memory", Map.empty[String, String], spark) - val commitResponse = oc.getCommits( - log.logPath, Map.empty[String, String].asJava, 1, null) + val tableDesc = + new TableDescriptor(log.logPath, Optional.empty(), Map.empty[String, String].asJava) + val commitResponse = oc.getCommits(tableDesc, 1, null) if (!commitResponse.getCommits.isEmpty) { commitFilePath = commitResponse.getCommits.asScala.head.getFileStatus.getPath } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolTransitionsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolTransitionsSuite.scala index b492ca3e242..ec36c423e04 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolTransitionsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolTransitionsSuite.scala @@ -156,6 +156,38 @@ class DeltaProtocolTransitionsSuite extends DeltaProtocolTransitionsBaseSuite { expectedProtocol = Protocol(3, 7).withFeature(TestRemovableReaderWriterFeature)) } + test("Setting partial versions") { + testProtocolTransition( + createTableProperties = Seq( + ("delta.minWriterVersion", 3.toString)), + expectedProtocol = Protocol(1, 3)) + + testProtocolTransition( + alterTableProperties = Seq( + ("delta.minWriterVersion", 3.toString)), + expectedProtocol = Protocol(1, 3)) + + testProtocolTransition( + createTableProperties = Seq( + ("delta.minWriterVersion", 3.toString), + (s"delta.feature.${DeletionVectorsTableFeature.name}", "supported")), + expectedProtocol = Protocol(3, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + CheckConstraintsTableFeature, + DeletionVectorsTableFeature))) + + testProtocolTransition( + alterTableProperties = Seq( + ("delta.minWriterVersion", 3.toString), + (s"delta.feature.${DeletionVectorsTableFeature.name}", "supported")), + expectedProtocol = Protocol(3, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + CheckConstraintsTableFeature, + DeletionVectorsTableFeature))) + } + for ((readerVersion, writerVersion) <- Seq((2, 1), (2, 2), (2, 3), (2, 4), (1, 5))) test("Invalid legacy protocol normalization" + s" - invalidProtocol($readerVersion, $writerVersion)") { @@ -448,17 +480,42 @@ class DeltaProtocolTransitionsSuite extends 
DeltaProtocolTransitionsBaseSuite { test("Default Enabled legacy features") { testProtocolTransition( createTableProperties = Seq((DeltaConfigs.CHANGE_DATA_FEED.key, true.toString)), - expectedProtocol = Protocol(1, 4)) + expectedProtocol = Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature))) testProtocolTransition( createTableProperties = Seq( ("delta.minReaderVersion", 1.toString), ("delta.minWriterVersion", 3.toString), (DeltaConfigs.CHANGE_DATA_FEED.key, true.toString)), + expectedProtocol = Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + CheckConstraintsTableFeature, + ChangeDataFeedTableFeature))) + + testProtocolTransition( + createTableProperties = Seq( + ("delta.minReaderVersion", 1.toString), + ("delta.minWriterVersion", 4.toString), + (DeltaConfigs.CHANGE_DATA_FEED.key, true.toString)), + expectedProtocol = Protocol(1, 4)) + + testProtocolTransition( + alterTableProperties = Seq( + ("delta.minReaderVersion", 1.toString), + ("delta.minWriterVersion", 4.toString), + (DeltaConfigs.CHANGE_DATA_FEED.key, true.toString)), expectedProtocol = Protocol(1, 4)) withSQLConf(DeltaConfigs.CHANGE_DATA_FEED.defaultTablePropertyKey -> "true") { - testProtocolTransition(expectedProtocol = Protocol(1, 4)) + testProtocolTransition( + expectedProtocol = Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature))) } testProtocolTransition( @@ -497,7 +554,10 @@ class DeltaProtocolTransitionsSuite extends DeltaProtocolTransitionsBaseSuite { testProtocolTransition( createTableColumns = Seq(("id", "INT")), createTableGeneratedColumns = Seq(("id2", "INT", "id + 1")), - expectedProtocol = Protocol(1, 4)) + expectedProtocol = Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + GeneratedColumnsTableFeature))) testProtocolTransition( createTableColumns = Seq(("id", "INT")), @@ -516,10 +576,6 @@ class DeltaProtocolTransitionsSuite extends DeltaProtocolTransitionsBaseSuite { expectedProtocol = Protocol(1, 7).withFeature(GeneratedColumnsTableFeature)) } - testProtocolTransition( - alterTableProperties = Seq((DeltaConfigs.CHANGE_DATA_FEED.key, "true")), - expectedProtocol = Protocol(1, 4)) - testProtocolTransition( alterTableProperties = Seq( ("delta.minReaderVersion", 1.toString), @@ -534,7 +590,27 @@ class DeltaProtocolTransitionsSuite extends DeltaProtocolTransitionsBaseSuite { test("Column Mapping does not require a manual protocol versions upgrade") { testProtocolTransition( createTableProperties = Seq((DeltaConfigs.COLUMN_MAPPING_MODE.key, "name")), - expectedProtocol = Protocol(2, 5)) + expectedProtocol = Protocol(2, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ColumnMappingTableFeature))) + + withSQLConf(DeltaSQLConf.TABLE_FEATURES_TEST_FEATURES_ENABLED.key -> false.toString) { + testProtocolTransition( + createTableProperties = Seq( + ("delta.minReaderVersion", 1.toString), + ("delta.minWriterVersion", 4.toString), + (DeltaConfigs.COLUMN_MAPPING_MODE.key, "name")), + expectedProtocol = Protocol(2, 5)) + + testProtocolTransition( + createTableProperties = Seq( + ("delta.minReaderVersion", 1.toString), + ("delta.minWriterVersion", 4.toString)), + alterTableProperties = Seq( + (DeltaConfigs.COLUMN_MAPPING_MODE.key, "name")), + expectedProtocol = Protocol(2, 5)) + } testProtocolTransition( createTableProperties = Seq( @@ -553,7 +629,10 @@ class DeltaProtocolTransitionsSuite extends 
DeltaProtocolTransitionsBaseSuite { testProtocolTransition( alterTableProperties = Seq((DeltaConfigs.COLUMN_MAPPING_MODE.key, "name")), - expectedProtocol = Protocol(2, 5)) + expectedProtocol = Protocol(2, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ColumnMappingTableFeature))) testProtocolTransition( alterTableProperties = Seq( diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolVersionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolVersionSuite.scala index b7465329ddb..84aebb36023 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolVersionSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaProtocolVersionSuite.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.delta.util.FileNames import org.apache.spark.sql.delta.util.FileNames.{unsafeDeltaFile, DeltaFile} import org.apache.spark.sql.delta.util.JsonUtils import io.delta.storage.LogStore +import io.delta.storage.commit.TableDescriptor import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -102,7 +103,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest readerVersion = 1, writerVersion = 1, sqlConfs = Seq((DeltaConfigs.CHANGE_DATA_FEED.defaultTablePropertyKey, "true")), - expectedProtocol = Protocol(1, 1).merge(ChangeDataFeedTableFeature.minProtocolVersion)) + expectedProtocol = Protocol(1, 7).withFeature(ChangeDataFeedTableFeature)) testEmptyFolder( readerVersion = 1, writerVersion = 1, @@ -213,39 +214,38 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest } test("upgrade to support table features - many features") { - withTempDir { path => - val log = createTableWithProtocol(Protocol(2, 5), path) - assert(log.update().protocol === Protocol(2, 5)) - val table = io.delta.tables.DeltaTable.forPath(spark, path.getCanonicalPath) - table.upgradeTableProtocol(2, TABLE_FEATURES_MIN_WRITER_VERSION) - // Setting table feature versions to a protocol without table features is a noop. - assert(log.update().protocol === Protocol(2, 5)) - spark.sql( - s"ALTER TABLE delta.`${path.getPath}` SET TBLPROPERTIES (" + - s" delta.feature.${TestWriterFeature.name}='enabled'" + - s")") - table.upgradeTableProtocol( - TABLE_FEATURES_MIN_READER_VERSION, - TABLE_FEATURES_MIN_WRITER_VERSION) - assert( - log.snapshot.protocol === Protocol( - minReaderVersion = 2, - minWriterVersion = TABLE_FEATURES_MIN_WRITER_VERSION, - readerFeatures = None, - writerFeatures = Some( - Set( - AppendOnlyTableFeature, - ChangeDataFeedTableFeature, - CheckConstraintsTableFeature, - ColumnMappingTableFeature, - GeneratedColumnsTableFeature, - InvariantsTableFeature, - TestLegacyWriterFeature, - TestRemovableLegacyWriterFeature, - TestLegacyReaderWriterFeature, - TestRemovableLegacyReaderWriterFeature, - TestWriterFeature) - .map(_.name)))) + withSQLConf(DeltaSQLConf.TABLE_FEATURES_TEST_FEATURES_ENABLED.key -> false.toString) { + withTempDir { path => + val log = createTableWithProtocol(Protocol(2, 5), path) + assert(log.update().protocol === Protocol(2, 5)) + val table = io.delta.tables.DeltaTable.forPath(spark, path.getCanonicalPath) + table.upgradeTableProtocol(2, TABLE_FEATURES_MIN_WRITER_VERSION) + // Setting table feature versions to a protocol without table features is a noop. 
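The noop assertion continues right below. As a side note on the column mapping transitions above: column mapping is a reader-writer feature, so once a table lands on a table-features protocol the feature is expected on the reader side of the protocol as well as the writer side (the new "Column mapping appears in reader features" test later in this diff asserts exactly that). A minimal sketch, assuming the Delta Spark test classpath and this patch's protocol semantics:

```scala
import org.apache.spark.sql.delta.{AppendOnlyTableFeature, ColumnMappingTableFeature, InvariantsTableFeature}
import org.apache.spark.sql.delta.actions.Protocol

// Column mapping needs reader version 2; on a table-features protocol it is listed
// as a supported feature instead of being implied by the legacy (2, 5) version pair.
val protocol = Protocol(2, 7).withFeatures(Seq(
  AppendOnlyTableFeature,
  InvariantsTableFeature,
  ColumnMappingTableFeature))

assert(protocol.readerAndWriterFeatureNames.contains(ColumnMappingTableFeature.name))
assert(protocol.minReaderVersion == 2 && protocol.minWriterVersion == 7)
```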
+ assert(log.update().protocol === Protocol(2, 5)) + spark.sql( + s"ALTER TABLE delta.`${path.getPath}` SET TBLPROPERTIES (" + + s" delta.feature.${RowTrackingFeature.name}='enabled'" + + s")") + table.upgradeTableProtocol( + TABLE_FEATURES_MIN_READER_VERSION, + TABLE_FEATURES_MIN_WRITER_VERSION) + assert( + log.snapshot.protocol === Protocol( + minReaderVersion = 2, + minWriterVersion = TABLE_FEATURES_MIN_WRITER_VERSION, + readerFeatures = Some(Set(ColumnMappingTableFeature.name)), + writerFeatures = Some( + Set( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature, + CheckConstraintsTableFeature, + ColumnMappingTableFeature, + GeneratedColumnsTableFeature, + DomainMetadataTableFeature, + RowTrackingFeature) + .map(_.name)))) + } } } @@ -1052,7 +1052,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest "creating a new table with default protocol - requiring more recent protocol version") { val tableName = "delta_test" def testTableCreation(fn: String => Unit, tableInitialized: Boolean = false): Unit = - testCreation(tableName, 2, tableInitialized)(fn) + testCreation(tableName, 7, tableInitialized)(fn) testTableCreation { dir => spark.range(10).writeTo(tableName).using("delta") @@ -1114,7 +1114,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest sql(s"CREATE TABLE $tbl (id bigint) USING delta LOCATION '${dir.getCanonicalPath}'") } val deltaLog = DeltaLog.forTable(spark, dir) - assert(deltaLog.snapshot.protocol.minWriterVersion === 1, + assert(deltaLog.update().protocol.minWriterVersion === 1, "Should've picked up the protocol from the configuration") // Replace the table and make sure the config is picked up @@ -1122,13 +1122,13 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest spark.range(10).writeTo(tbl).using("delta") .tableProperty("location", dir.getCanonicalPath).replace() } - assert(deltaLog.snapshot.protocol.minWriterVersion === 2, + assert(deltaLog.update().protocol.minWriterVersion === 2, "Should've picked up the protocol from the configuration") // Will not downgrade without special flag. 
withSQLConf(DeltaSQLConf.DELTA_PROTOCOL_DEFAULT_WRITER_VERSION.key -> "1") { sql(s"REPLACE TABLE $tbl (id bigint) USING delta LOCATION '${dir.getCanonicalPath}'") - assert(deltaLog.snapshot.protocol.minWriterVersion === 2, + assert(deltaLog.update().protocol.minWriterVersion === 2, "Should not pick up the protocol from the configuration") } @@ -1137,23 +1137,23 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest DeltaSQLConf.DELTA_PROTOCOL_DEFAULT_WRITER_VERSION.key -> "1", DeltaSQLConf.REPLACE_TABLE_PROTOCOL_DOWNGRADE_ALLOWED.key -> "true") { sql(s"REPLACE TABLE $tbl (id bigint) USING delta LOCATION '${dir.getCanonicalPath}'") - assert(deltaLog.snapshot.protocol.minWriterVersion === 1, + assert(deltaLog.update().protocol.minWriterVersion === 1, "Should've created a new protocol") sql(s"CREATE OR REPLACE TABLE $tbl (id bigint NOT NULL) USING delta " + s"LOCATION '${dir.getCanonicalPath}'") - assert(deltaLog.snapshot.protocol.minWriterVersion === 2, + assert(deltaLog.update().protocol === Protocol(1, 7).withFeature(InvariantsTableFeature), "Invariant should require the higher protocol") // Go back to version 1 sql(s"REPLACE TABLE $tbl (id bigint) USING delta LOCATION '${dir.getCanonicalPath}'") - assert(deltaLog.snapshot.protocol.minWriterVersion === 1, + assert(deltaLog.update().protocol.minWriterVersion === 1, "Should've created a new protocol") // Check table properties with different syntax spark.range(10).writeTo(tbl).tableProperty("location", dir.getCanonicalPath) .tableProperty("delta.appendOnly", "true").using("delta").createOrReplace() - assert(deltaLog.snapshot.protocol.minWriterVersion === 2, + assert(deltaLog.update().protocol === Protocol(1, 7).withFeature(AppendOnlyTableFeature), "appendOnly should require the higher protocol") } } @@ -1312,7 +1312,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest deltaLog.snapshot.protocol === Protocol( minReaderVersion = 2, minWriterVersion = TABLE_FEATURES_MIN_WRITER_VERSION, - readerFeatures = None, + readerFeatures = Some(Set.empty), writerFeatures = Some(Set(TestLegacyReaderWriterFeature.name)))) assertPropertiesAndShowTblProperties(deltaLog, tableHasFeatures = true) } @@ -1367,8 +1367,10 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest " delta.minWriterVersion='2'," + " delta.enableChangeDataFeed='true'" + ")") - assert(deltaLog.snapshot.protocol.minReaderVersion === 1) - assert(deltaLog.snapshot.protocol.minWriterVersion === 4) + assert(deltaLog.update().protocol === Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature))) } } @@ -1700,12 +1702,15 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest DeltaConfigs.MIN_READER_VERSION.key -> "1", DeltaConfigs.MIN_WRITER_VERSION.key -> "1", DeltaConfigs.CHANGE_DATA_FEED.key -> "true"), - expectedFinalProtocol = Some(Protocol(1, 4))) + expectedFinalProtocol = Some(Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature)))) testAlterTable( "legacy protocol, legacy feature, metadata", Map("delta.appendOnly" -> "true"), - expectedFinalProtocol = Some(Protocol(1, 2))) + expectedFinalProtocol = Some(Protocol(1, 7).withFeature(AppendOnlyTableFeature))) testAlterTable( "legacy protocol, legacy feature, feature property", @@ -1924,7 +1929,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest .tableProperty("delta.appendOnly", "true") .using("delta") .create() - val protocolOfNewTable = Protocol(1, 2) + val protocolOfNewTable = Protocol(1, 
7).withFeature(AppendOnlyTableFeature) assert(deltaLog.update().protocol === protocolOfNewTable) val e = intercept[DeltaTableFeatureException] { @@ -1975,8 +1980,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest sql(s"CREATE TABLE delta.`${dir.getCanonicalPath}` (id bigint) USING delta " + "TBLPROPERTIES (delta.minWriterVersion=1, delta.appendOnly=true)") - assert(deltaLog.snapshot.protocol.minWriterVersion === 2) - assertPropertiesAndShowTblProperties(deltaLog) + assert(deltaLog.update().protocol === Protocol(1, 7).withFeature(AppendOnlyTableFeature)) + assertPropertiesAndShowTblProperties(deltaLog, tableHasFeatures = true) } } } @@ -1991,8 +1996,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest .using("delta") .create() - assert(deltaLog.snapshot.protocol.minWriterVersion === 2) - assertPropertiesAndShowTblProperties(deltaLog) + assert(deltaLog.update().protocol === Protocol(1, 7).withFeature(AppendOnlyTableFeature)) + assertPropertiesAndShowTblProperties(deltaLog, tableHasFeatures = true) } } } @@ -2074,7 +2079,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest |ALTER TABLE delta.`${log.dataPath.toString}` |SET TBLPROPERTIES ('delta.appendOnly' = 'true') """.stripMargin) - assert(log.snapshot.protocol.minWriterVersion === 2) + assert(log.update().protocol === Protocol(1, 7).withFeature(AppendOnlyTableFeature)) } } @@ -2100,7 +2105,10 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest | 'delta.minWriterVersion' = '2', | 'delta.enableChangeDataFeed' = 'true' |)""".stripMargin) - assert(log.snapshot.protocol.minWriterVersion === 4) + assert(log.update().protocol === Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ChangeDataFeedTableFeature))) } } @@ -2127,7 +2135,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest assert(log.snapshot.protocol === Protocol( 2, TABLE_FEATURES_MIN_WRITER_VERSION, - readerFeatures = None, + readerFeatures = Some(Set.empty), writerFeatures = Some(Set(TestLegacyReaderWriterFeature.name)))) } } @@ -2368,6 +2376,27 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest } } + test("Column mapping appears in reader features") { + withTempDir { dir => + val deltaLog = DeltaLog.forTable(spark, dir) + sql( + s"""CREATE TABLE delta.`${deltaLog.dataPath}` (id bigint) USING delta + |TBLPROPERTIES ( + |delta.feature.${ColumnMappingTableFeature.name} = 'supported', + |delta.feature.${TestWriterFeature.name} = 'supported' + |)""".stripMargin) + assert(deltaLog.update().protocol === Protocol( + minReaderVersion = 2, + minWriterVersion = 7, + readerFeatures = Some(Set(ColumnMappingTableFeature.name)), + writerFeatures = Some(Set( + InvariantsTableFeature.name, + AppendOnlyTableFeature.name, + ColumnMappingTableFeature.name, + TestWriterFeature.name)))) + } + } + def protocolWithFeatures( readerFeatures: Seq[TableFeature] = Seq.empty, writerFeatures: Seq[TableFeature] = Seq.empty): Protocol = { @@ -2425,8 +2454,17 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest val readerVersion = Math.max(feature.minReaderVersion, 1) val expectedWriterFeatures = Some(Set(feature.name, InvariantsTableFeature.name, AppendOnlyTableFeature.name)) + val supportsColumnMapping = + canSupportColumnMappingFeature(readerVersion, TABLE_FEATURES_MIN_WRITER_VERSION) val expectedReaderFeatures: Option[Set[String]] = - if (supportsReaderFeatures(readerVersion)) Some(Set(feature.name)) else None + if ((feature == ColumnMappingTableFeature && supportsColumnMapping) || + supportsReaderFeatures(readerVersion)) { + 
Some(Set(feature.name)) + } else if (supportsColumnMapping) { + Some(Set.empty) + } else { + None + } assert( deltaLog.update().protocol === Protocol( @@ -2493,8 +2531,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest feature.name).run(spark) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e1, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> feature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -2512,8 +2550,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest feature.name).run(spark) } checkError( - exception = e2, - errorClass = "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", + e2, + "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", parameters = Map( "feature" -> feature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -2554,8 +2592,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest dropCommand.run(spark) } checkError( - exception = e3, - errorClass = "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", + e3, + "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", parameters = Map( "feature" -> feature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -2669,8 +2707,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest command.run(spark) } checkError( - exception = e, - errorClass = "DELTA_FEATURE_DROP_NONREMOVABLE_FEATURE", + e, + "DELTA_FEATURE_DROP_NONREMOVABLE_FEATURE", parameters = Map("feature" -> TestWriterMetadataNoAutoUpdateFeature.name)) } } @@ -2692,8 +2730,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest command.run(spark) } checkError( - exception = e, - errorClass = "DELTA_FEATURE_DROP_NONREMOVABLE_FEATURE", + e, + "DELTA_FEATURE_DROP_NONREMOVABLE_FEATURE", parameters = Map("feature" -> AppendOnlyTableFeature.name)) } } @@ -2718,8 +2756,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest command.run(spark) } checkError( - exception = e, - errorClass = "DELTA_FEATURE_DROP_UNSUPPORTED_CLIENT_FEATURE", + e, + "DELTA_FEATURE_DROP_UNSUPPORTED_CLIENT_FEATURE", parameters = Map("feature" -> "NonSupportedFeature")) } } @@ -2744,8 +2782,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest command.run(spark) } checkError( - exception = e, - errorClass = "DELTA_FEATURE_DROP_FEATURE_NOT_PRESENT", + e, + "DELTA_FEATURE_DROP_FEATURE_NOT_PRESENT", parameters = Map("feature" -> TestRemovableWriterFeature.name)) } } @@ -2825,8 +2863,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableWriterFeature.name).run(spark) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_DEPENDENT_FEATURE", + e1, + "DELTA_FEATURE_DROP_DEPENDENT_FEATURE", parameters = Map( "feature" -> TestRemovableWriterFeature.name, "dependentFeatures" -> TestRemovableWriterFeatureWithDependency.name)) @@ -2866,8 +2904,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest |TRUNCATE HISTORY""".stripMargin) } checkError( - exception = e, - errorClass = "DELTA_FEATURE_DROP_HISTORY_TRUNCATION_NOT_ALLOWED", + e, + "DELTA_FEATURE_DROP_HISTORY_TRUNCATION_NOT_ALLOWED", parameters = Map.empty) } } @@ -2894,8 +2932,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableReaderWriterFeature.name).run(spark) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e1, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TestRemovableReaderWriterFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -2925,8 +2963,8 @@ trait 
DeltaProtocolVersionSuiteBase extends QueryTest } checkError( - exception = e2, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e2, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TestRemovableReaderWriterFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -2953,8 +2991,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableReaderWriterFeature.name).run(spark) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e1, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TestRemovableReaderWriterFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -2973,8 +3011,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableReaderWriterFeature.name).run(spark) } checkError( - exception = e2, - errorClass = "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", + e2, + "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", parameters = Map( "feature" -> TestRemovableReaderWriterFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -3025,8 +3063,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableReaderWriterFeature.name).run(spark) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e1, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TestRemovableReaderWriterFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -3048,8 +3086,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableReaderWriterFeature.name).run(spark) } checkError( - exception = e2, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e2, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TestRemovableReaderWriterFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -3076,8 +3114,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableReaderWriterFeature.name).run(spark) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e1, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TestRemovableReaderWriterFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -3493,8 +3531,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest TestRemovableWriterWithHistoryTruncationFeature.name).run(spark) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e1, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TestRemovableWriterWithHistoryTruncationFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -3573,8 +3611,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest dropV2CheckpointsTableFeature(spark, targetLog) } checkError( - exception = e1, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + e1, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> V2CheckpointTableFeature.name, "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -3607,8 +3645,8 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest dropV2CheckpointsTableFeature(spark, targetLog) } checkError( - exception = e2, - errorClass = "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", + e2, + "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", parameters = Map( "feature" -> V2CheckpointTableFeature.name, "logRetentionPeriodKey" -> 
"delta.logRetentionDuration", @@ -3985,12 +4023,11 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest val alternatingFailureBackfillClient = new TrackingCommitCoordinatorClient(new InMemoryCommitCoordinator(1000) { override def backfillToVersion( - logStore: LogStore, - hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], - startVersion: Long, - endVersionOpt: java.lang.Long): Unit = { + logStore: LogStore, + hadoopConf: Configuration, + tableDesc: TableDescriptor, + startVersion: Long, + endVersionOpt: java.lang.Long): Unit = { // Backfill fails on every other attempt. if (shouldFailBackfill) { shouldFailBackfill = !shouldFailBackfill @@ -3999,8 +4036,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest super.backfillToVersion( logStore, hadoopConf, - logPath, - coordinatedCommitsTableConf, + tableDesc, startVersion, endVersionOpt) } @@ -4033,7 +4069,7 @@ trait DeltaProtocolVersionSuiteBase extends QueryTest assert(!backfilledCommitExists(3)) // The commit coordinator still tracks the commit that disables it. val commitsFromCommitCoordinator = - log.snapshot.tableCommitCoordinatorClientOpt.get.getCommits(Some(3)) + log.snapshot.tableCommitCoordinatorClientOpt.get.getCommits(Some(3L)) assert(commitsFromCommitCoordinator.getCommits.asScala.exists(_.getVersion == 3)) // The next drop attempt will also trigger an explicit backfill. val usageLogs2 = Log4jUsageLogger.track { diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSinkImplicitCastSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSinkImplicitCastSuite.scala new file mode 100644 index 00000000000..0d2008e52ce --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSinkImplicitCastSuite.scala @@ -0,0 +1,539 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta + +import java.io.File +import java.sql.{Date, Timestamp} + +import org.apache.spark.sql.delta.sources.{DeltaSink, DeltaSQLConf} + +import org.apache.spark.{SparkArithmeticException, SparkThrowable} +import org.apache.spark.sql.{DataFrame, Encoder, Row} +import org.apache.spark.sql.errors.QueryExecutionErrors.toSQLType +import org.apache.spark.sql.execution.streaming.{MemoryStream, StreamExecution} +import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy +import org.apache.spark.sql.streaming.{OutputMode, StreamingQueryException, Trigger} +import org.apache.spark.sql.types._ + +/** + * Defines helper class & methods to test writing to a Delta streaming sink using data types that + * don't match the corresponding column type in the table schema. 
+ */ +abstract class DeltaSinkImplicitCastSuiteBase extends DeltaSinkTest { + + override def beforeAll(): Unit = { + super.beforeAll() + spark.conf.set(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS.key, "true") + spark.conf.set(SQLConf.ANSI_ENABLED.key, "true") + } + + /** + * Helper to write to and read from a Delta sink. Creates and runs a streaming query for each call + * to `write`. + */ + class TestDeltaStream[T: Encoder]( + outputDir: File, + checkpointDir: File) { + private val source = MemoryStream[T] + + def write(data: T*)(selectExpr: String*): Unit = + write(outputMode = OutputMode.Append, extraOptions = Map.empty)(data: _*)(selectExpr: _*) + + def write( + outputMode: OutputMode, + extraOptions: Map[String, String])( + data: T*)( + selectExpr: String*): Unit = { + source.addData(data) + val query = + source.toDF() + .selectExpr(selectExpr: _*) + .writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .outputMode(outputMode) + .options(extraOptions) + .format("delta") + .trigger(Trigger.AvailableNow()) + .start(outputDir.getCanonicalPath) + try { + failAfter(streamingTimeout) { + query.processAllAvailable() + } + } finally { + query.stop() + } + } + + def currentSchema: StructType = + spark.read.format("delta").load(outputDir.getCanonicalPath).schema + + def read(): DataFrame = + spark.read.format("delta").load(outputDir.getCanonicalPath) + + def deltaLog: DeltaLog = + DeltaLog.forTable(spark, outputDir.getCanonicalPath) + } + + /** Sets up a new [[TestDeltaStream]] to write to and read from a test Delta sink. */ + def withDeltaStream[T: Encoder](f: TestDeltaStream[T] => Unit): Unit = + withTempDirs { (outputDir, checkpointDir) => + f(new TestDeltaStream[T](outputDir, checkpointDir)) + } + + /** + * Validates that the table history for the test Delta sink matches the given list of operations. + */ + def checkOperationHistory[T](stream: TestDeltaStream[T], expectedOperations: Seq[String]) + : Unit = { + val history = sql(s"DESCRIBE HISTORY delta.`${stream.deltaLog.dataPath}`") + .sort("version") + .select("operation") + checkAnswer(history, expectedOperations.map(Row(_))) + } +} + +/** + * Covers handling implicit casting to handle type mismatches when writing data to a Delta sink. 
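+ * For example, a LONG value written to an INT column is cast down on write and the table
+ * schema keeps INT; how overflow is handled follows the configured store assignment policy.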
+ */ +class DeltaSinkImplicitCastSuite extends DeltaSinkImplicitCastSuiteBase { + import testImplicits._ + + test(s"write wider type - long -> int") { + withDeltaStream[Long] { stream => + stream.write(17)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17)) + + stream.write(23)("CAST(value AS LONG)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17) :: Row(23) :: Nil) + checkOperationHistory(stream, expectedOperations = Seq( + "STREAMING UPDATE", // First write + "STREAMING UPDATE" // Second write + )) + } + } + + test("write wider type - long -> int - overflow with " + + s"storeAssignmentPolicy=${StoreAssignmentPolicy.STRICT}") { + withDeltaStream[Long] { stream => + stream.write(17)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17)) + withSQLConf(SQLConf.STORE_ASSIGNMENT_POLICY.key -> StoreAssignmentPolicy.STRICT.toString) { + val ex = intercept[StreamingQueryException] { + stream.write(Long.MaxValue)("CAST(value AS LONG)") + } + checkError( + ex.getCause.asInstanceOf[SparkThrowable], + "CANNOT_UP_CAST_DATATYPE", + parameters = Map( + "expression" -> "value", + "sourceType" -> toSQLType("BIGINT"), + "targetType" -> toSQLType("INT"), + "details" -> ("The type path of the target object is:\n\nYou can either add an " + + "explicit cast to the input data or choose a higher precision type of the field in " + + "the target object") + ) + ) + } + } + } + + test("write wider type - long -> int - overflow with " + + s"storeAssignmentPolicy=${StoreAssignmentPolicy.ANSI}") { + withDeltaStream[Long] { stream => + stream.write(17)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17)) + withSQLConf(SQLConf.STORE_ASSIGNMENT_POLICY.key -> StoreAssignmentPolicy.ANSI.toString) { + val ex = intercept[StreamingQueryException] { + stream.write(Long.MaxValue)("CAST(value AS LONG)") + } + + def getSparkArithmeticException(ex: Throwable): SparkArithmeticException = ex match { + case e: SparkArithmeticException => e + case e: Throwable if e.getCause != null => getSparkArithmeticException(e.getCause) + case e => fail(s"Unexpected exception: $e") + } + checkError( + getSparkArithmeticException(ex), + "CAST_OVERFLOW_IN_TABLE_INSERT", + parameters = Map( + "sourceType" -> "\"BIGINT\"", + "targetType" -> "\"INT\"", + "columnName" -> "`value`") + ) + } + } + } + + test("write wider type - long -> int - overflow with " + + s"storeAssignmentPolicy=${StoreAssignmentPolicy.LEGACY}") { + withDeltaStream[Long] { stream => + stream.write(17)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17)) + withSQLConf(SQLConf.STORE_ASSIGNMENT_POLICY.key -> StoreAssignmentPolicy.LEGACY.toString) { + stream.write(Long.MaxValue)("CAST(value AS LONG)") + // LEGACY allows the value to silently overflow. 
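+ // Why -1: Long.MaxValue is 0x7FFFFFFFFFFFFFFF and the narrowing cast keeps only the low
+ // 32 bits (0xFFFFFFFF), which reads as -1 when interpreted as a signed Int
+ // (Long.MaxValue.toInt == -1).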
+ checkAnswer(stream.read(), Row(17) :: Row(-1) :: Nil) + } + } + } + + test("write wider type - Decimal(10, 4) -> Decimal(6, 2)") { + withDeltaStream[BigDecimal] { stream => + stream.write(BigDecimal(123456L, scale = 2))("CAST(value AS DECIMAL(6, 2))") + assert(stream.currentSchema("value").dataType === DecimalType(6, 2)) + checkAnswer(stream.read(), Row(BigDecimal(123456L, scale = 2))) + + stream.write(BigDecimal(987654L, scale = 4))("CAST(value AS DECIMAL(10, 4))") + assert(stream.currentSchema("value").dataType === DecimalType(6, 2)) + checkAnswer(stream.read(), + Row(BigDecimal(123456L, scale = 2)) :: Row(BigDecimal(9877L, scale = 2)) :: Nil + ) + } + } + + test("write narrower type - int -> long") { + withDeltaStream[Long] { stream => + stream.write(Long.MinValue)("CAST(value AS LONG)") + assert(stream.currentSchema("value").dataType === LongType) + checkAnswer(stream.read(), Row(Long.MinValue)) + + stream.write(23)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === LongType) + checkAnswer(stream.read(), Row(Long.MinValue) :: Row(23) :: Nil) + } + } + + test("write different type - date -> string") { + withDeltaStream[String] { stream => + stream.write("abc")("CAST(value AS STRING)") + assert(stream.currentSchema("value").dataType === StringType) + checkAnswer(stream.read(), Row("abc")) + + stream.write("2024-07-25")("CAST(value AS DATE)") + assert(stream.currentSchema("value").dataType === StringType) + checkAnswer(stream.read(), Row("abc") :: Row("2024-07-25") :: Nil) + } + } + + test("implicit cast in nested struct/array/map") { + withDeltaStream[Int] { stream => + stream.write(17)("named_struct('a', value) AS s") + assert(stream.currentSchema("s").dataType === new StructType().add("a", IntegerType)) + checkAnswer(stream.read(), Row(Row(17))) + + stream.write(-12)("named_struct('a', CAST(value AS LONG)) AS s") + assert(stream.currentSchema("s").dataType === new StructType().add("a", IntegerType)) + checkAnswer(stream.read(), Row(Row(17)) :: Row(Row(-12)) :: Nil) + } + + withDeltaStream[(Int, Int)] { stream => + stream.write((17, 57))("map(_1, _2) AS m") + assert(stream.currentSchema("m").dataType === MapType(IntegerType, IntegerType)) + checkAnswer(stream.read(), Row(Map(17 -> 57))) + stream.write((-12, 3))("map(CAST(_1 AS LONG), CAST(_2 AS STRING)) AS m") + assert(stream.currentSchema("m").dataType === MapType(IntegerType, IntegerType)) + checkAnswer(stream.read(), Row(Map(17 -> 57)) :: Row(Map(-12 -> 3)) :: Nil) + } + + withDeltaStream[(Int, Int)] { stream => + stream.write((17, 57))("array(_1, _2) AS a") + assert(stream.currentSchema("a").dataType === ArrayType(IntegerType)) + checkAnswer(stream.read(), Row(Seq(17, 57)) :: Nil) + stream.write((-12, 3))("array(_1, _2) AS a") + assert(stream.currentSchema("a").dataType === ArrayType(IntegerType)) + checkAnswer(stream.read(), Row(Seq(17, 57)) :: Row(Seq(-12, 3)) :: Nil) + } + } + + test("write invalid nested type - array -> struct") { + withDeltaStream[Int] { stream => + stream.write(17)("named_struct('a', value) AS s") + assert(stream.currentSchema("s").dataType === new StructType().add("a", IntegerType)) + checkAnswer(stream.read(), Row(Row(17))) + + val ex = intercept[StreamingQueryException] { + stream.write(-12)("array(value) AS s") + } + checkError( + ex.getCause.asInstanceOf[SparkThrowable], + "DELTA_FAILED_TO_MERGE_FIELDS", + parameters = Map( + "currentField" -> "s", + "updateField" -> "s") + ) + } + } + + test("implicit cast on partition value") { + withDeltaStream[(String, Int)] { stream => + sql( 
+ s""" + |CREATE TABLE delta.`${stream.deltaLog.dataPath}` (day date, value int) + |USING DELTA + |PARTITIONED BY (day) + """.stripMargin) + + stream.write(("2024-07-26", 1))("CAST(_1 AS DATE) AS day", "_2 AS value") + assert(stream.currentSchema === new StructType() + .add("day", DateType) + .add("value", IntegerType)) + checkAnswer(stream.read(), Row(Date.valueOf("2024-07-26"), 1)) + + stream.write(("2024-07-27", 2))( + "CAST(_1 AS TIMESTAMP) AS day", "CAST(_2 AS DECIMAL(4, 1)) AS value") + assert(stream.currentSchema === new StructType() + .add("day", DateType) + .add("value", IntegerType)) + checkAnswer(stream.read(), + Row(Date.valueOf("2024-07-26"), 1) :: Row(Date.valueOf("2024-07-27"), 2) :: Nil) + } + } + + test("implicit cast with schema evolution") { + withDeltaStream[(Long, String)] { stream => + stream.write((123, "unused"))("CAST(_1 AS DECIMAL(6, 3)) AS a") + assert(stream.currentSchema === new StructType() + .add("a", DecimalType(6, 3))) + checkAnswer(stream.read(), Row(BigDecimal(123000, scale = 3))) + + withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true") { + stream.write((678, "abc"))("CAST(_1 AS LONG) AS a", "_2 AS b") + assert(stream.currentSchema === new StructType() + .add("a", DecimalType(6, 3)) + .add("b", StringType)) + checkAnswer(stream.read(), + Row(BigDecimal(123000, scale = 3), null) :: + Row(BigDecimal(678000, scale = 3), "abc") :: Nil) + } + } + } + + test("implicit cast with schema overwrite") { + withTempDirs { (outputDir, checkpointDir) => + val source = MemoryStream[Long] + + def write(streamingDF: DataFrame, data: Long*): Unit = { + val query = streamingDF.writeStream + .option("checkpointLocation", checkpointDir.getCanonicalPath) + .outputMode(OutputMode.Complete) + .option(DeltaOptions.OVERWRITE_SCHEMA_OPTION, "true") + .format("delta") + .start(outputDir.getCanonicalPath) + try { + source.addData(data: _*) + failAfter(streamingTimeout) { + query.processAllAvailable() + } + } finally { + query.stop() + } + } + + // Initial write to the sink with columns a, count, b, c. + val initialDF = source.toDF() + .selectExpr("CAST(value AS DECIMAL(6, 3)) AS a") + .groupBy("a") + .count() + .withColumn("b", col("count").cast("INT")) + .withColumn("c", lit(11).cast("STRING")) + write(initialDF, 10) + val initialResult = spark.read.format("delta").load(outputDir.getCanonicalPath) + assert(initialResult.schema === new StructType() + .add("a", DecimalType(6, 3)) + .add("count", LongType) + .add("b", IntegerType) + .add("c", StringType)) + checkAnswer(initialResult, Row(BigDecimal(10000, scale = 3), 1, 1, "11")) + + // Second write with overwrite schema: change type of column b and replace c with d. + val overwriteDF = source.toDF() + .selectExpr("CAST(value AS DECIMAL(6, 3)) AS a") + .groupBy("a") + .count() + .withColumn("b", col("count").cast("LONG")) + .withColumn("d", lit(21).cast("STRING")) + write(overwriteDF, 20) + val overwriteResult = spark.read.format("delta").load(outputDir.getCanonicalPath) + assert(overwriteResult.schema === new StructType() + .add("a", DecimalType(6, 3)) + .add("count", LongType) + .add("b", LongType) + .add("d", StringType)) + checkAnswer(overwriteResult, + Row(BigDecimal(10000, scale = 3), 1, 1, "21") :: + Row(BigDecimal(20000, scale = 3), 1, 1, "21") :: Nil + ) + } + } + + // Writing to a delta sink is always case insensitive and ignores the value of + // 'spark.sql.caseSensitive'. 
+ for (caseSensitive <- Seq(true, false)) + test(s"implicit cast with case sensitivity, caseSensitive=$caseSensitive") { + withDeltaStream[Long] { stream => + stream.write(17)("CAST(value AS LONG) AS value") + assert(stream.currentSchema === new StructType().add("value", LongType)) + checkAnswer(stream.read(), Row(17)) + + withSQLConf(SQLConf.CASE_SENSITIVE.key -> caseSensitive.toString) { + stream.write(23)("CAST(value AS INT) AS VALUE") + assert(stream.currentSchema === new StructType().add("value", LongType)) + checkAnswer(stream.read(), Row(17) :: Row(23) :: Nil) + } + } + } + + test("implicit cast and missing column") { + withDeltaStream[(String, String)] { stream => + stream.write(("2024-07-28 12:00:00", "abc"))("CAST(_1 AS TIMESTAMP) AS a", "_2 AS b") + assert(stream.currentSchema === new StructType() + .add("a", TimestampType) + .add("b", StringType)) + checkAnswer(stream.read(), Row(Timestamp.valueOf("2024-07-28 12:00:00"), "abc")) + + stream.write(("2024-07-29", "unused"))("CAST(_1 AS DATE) AS a") + assert(stream.currentSchema === new StructType() + .add("a", TimestampType) + .add("b", StringType)) + checkAnswer(stream.read(), + Row(Timestamp.valueOf("2024-07-28 12:00:00"), "abc") :: + Row(Timestamp.valueOf("2024-07-29 00:00:00"), null) :: Nil) + checkOperationHistory(stream, expectedOperations = Seq( + "STREAMING UPDATE", // First write + "STREAMING UPDATE" // Second write + )) + } + } + + test("implicit cast after renaming/dropping columns with column mapping") { + withDeltaStream[(Int, Int)] { stream => + stream.write((1, 100))("_1 AS a", "CAST(_2 AS LONG) AS b") + assert(stream.currentSchema === new StructType() + .add("a", IntegerType) + .add("b", LongType)) + checkAnswer(stream.read(), Row(1, 100)) + sql( + s""" + |ALTER TABLE delta.`${stream.deltaLog.dataPath}` SET TBLPROPERTIES ( + | 'delta.columnMapping.mode' = 'name', + | 'delta.minReaderVersion' = '2', + | 'delta.minWriterVersion' = '5' + |) + """.stripMargin) + + sql(s"ALTER TABLE delta.`${stream.deltaLog.dataPath}` DROP COLUMN a") + sql(s"ALTER TABLE delta.`${stream.deltaLog.dataPath}` RENAME COLUMN b to a") + assert(stream.currentSchema === new StructType() + .add("a", LongType)) + + stream.write((17, -1))("CAST(_1 AS STRING) AS a") + assert(stream.currentSchema === new StructType() + .add("a", LongType)) + checkAnswer(stream.read(), Row(100) :: Row(17) :: Nil) + + checkOperationHistory(stream, expectedOperations = Seq( + "STREAMING UPDATE", // First write + "SET TBLPROPERTIES", // Enable column mapping + "DROP COLUMNS", // Drop column + "RENAME COLUMN", // Rename Column + "STREAMING UPDATE" // Second write + )) + } + } + + test("disallow implicit cast with spark.databricks.delta.streaming.sink.allowImplicitCasts") { + withSQLConf(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS.key -> "false") { + withDeltaStream[Long] { stream => + stream.write(17)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17)) + + val ex = intercept[StreamingQueryException] { + stream.write(23)("CAST(value AS LONG)") + } + checkError( + ex.getCause.asInstanceOf[SparkThrowable], + "DELTA_FAILED_TO_MERGE_FIELDS", + parameters = Map( + "currentField" -> "value", + "updateField" -> "value") + ) + } + } + } + + for (allowImplicitCasts <- Seq(true, false)) + test(s"schema evolution with case sensitivity and without type mismatch, " + + s"allowImplicitCasts=$allowImplicitCasts") { + withSQLConf( + DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS.key -> 
allowImplicitCasts.toString, + SQLConf.CASE_SENSITIVE.key -> "true", + DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true" + ) { + withDeltaStream[(Long, Long)] { stream => + stream.write((17, -1))("CAST(_1 AS INT) AS a") + assert(stream.currentSchema == new StructType().add("a", IntegerType)) + checkAnswer(stream.read(), Row(17)) + + stream.write((21, 22))("CAST(_1 AS INT) AS A", "_2 AS b") + assert(stream.currentSchema == new StructType() + .add("a", IntegerType) + .add("b", LongType)) + checkAnswer(stream.read(), Row(17, null) :: Row(21, 22) :: Nil) + } + } + } + + test("handling type mismatch in addBatch") { + withTempDir { tempDir => + val tablePath = tempDir.getAbsolutePath + val deltaLog = DeltaLog.forTable(spark, tablePath) + sqlContext.sparkContext.setLocalProperty(StreamExecution.QUERY_ID_KEY, "streaming_query") + val sink = DeltaSink( + sqlContext, + path = deltaLog.dataPath, + partitionColumns = Seq.empty, + outputMode = OutputMode.Append(), + options = new DeltaOptions(options = Map.empty, conf = spark.sessionState.conf) + ) + + val schema = new StructType().add("value", IntegerType) + + { + val data = Seq(0, 1).toDF("value").selectExpr("CAST(value AS INT)") + sink.addBatch(0, data) + val df = spark.read.format("delta").load(tablePath) + assert(df.schema === schema) + checkAnswer(df, Row(0) :: Row(1) :: Nil) + } + { + val data = Seq(2, 3).toDF("value").selectExpr("CAST(value AS LONG)") + sink.addBatch(1, data) + val df = spark.read.format("delta").load(tablePath) + assert(df.schema === schema) + checkAnswer(df, Row(0) :: Row(1) :: Row(2) :: Row(3) :: Nil) + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSinkSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSinkSuite.scala index d48dca68110..1578c88c120 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSinkSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSinkSuite.scala @@ -37,9 +37,8 @@ import org.apache.spark.sql.streaming._ import org.apache.spark.sql.types._ import org.apache.spark.util.Utils -class DeltaSinkSuite +abstract class DeltaSinkTest extends StreamTest - with DeltaColumnMappingTestUtils with DeltaSQLCommandTest { override val streamingTimeout = 60.seconds @@ -68,6 +67,13 @@ class DeltaSinkSuite } } } +} + +class DeltaSinkSuite + extends DeltaSinkTest + with DeltaColumnMappingTestUtils { + + import testImplicits._ test("append mode") { failAfter(streamingTimeout) { @@ -401,8 +407,8 @@ class DeltaSinkSuite .save(outputDir.getCanonicalPath) } checkError( - exception = e, - errorClass = "DELTA_FAILED_TO_MERGE_FIELDS", + e, + "DELTA_FAILED_TO_MERGE_FIELDS", parameters = Map("currentField" -> "id", "updateField" -> "id")) } finally { query.stop() @@ -426,16 +432,20 @@ class DeltaSinkSuite .mode("append") .save(outputDir.getCanonicalPath) - val wrapperException = intercept[StreamingQueryException] { - val q = dsWriter.start(outputDir.getCanonicalPath) - inputData.addData(1, 2, 3) - q.processAllAvailable() + // More tests covering type changes can be found in [[DeltaSinkImplicitCastSuite]]. This only + // covers type changes disabled. 
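+ // With implicit casts disabled, the mismatched batch fails schema merging
+ // (DELTA_FAILED_TO_MERGE_FIELDS) instead of being cast to the table's column type.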
+ withSQLConf(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS.key -> "false") { + val wrapperException = intercept[StreamingQueryException] { + val q = dsWriter.start(outputDir.getCanonicalPath) + inputData.addData(1, 2, 3) + q.processAllAvailable() + } + assert(wrapperException.cause.isInstanceOf[AnalysisException]) + checkError( + wrapperException.cause.asInstanceOf[AnalysisException], + "DELTA_FAILED_TO_MERGE_FIELDS", + parameters = Map("currentField" -> "id", "updateField" -> "id")) } - assert(wrapperException.cause.isInstanceOf[AnalysisException]) - checkError( - exception = wrapperException.cause.asInstanceOf[AnalysisException], - errorClass = "DELTA_FAILED_TO_MERGE_FIELDS", - parameters = Map("currentField" -> "id", "updateField" -> "id")) } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSourceColumnMappingSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSourceColumnMappingSuite.scala index cae6b29f4c2..36c142ace29 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSourceColumnMappingSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSourceColumnMappingSuite.scala @@ -303,7 +303,7 @@ trait ColumnMappingStreamingBlockedWorkflowSuiteBase extends ColumnMappingStream // upgrade to name mode val protocol = deltaLog.snapshot.protocol - val (r, w) = if (protocol.supportsReaderFeatures || protocol.supportsWriterFeatures) { + val (r, w) = if (protocol.supportsTableFeatures) { (TableFeatureProtocolUtils.TABLE_FEATURES_MIN_READER_VERSION, TableFeatureProtocolUtils.TABLE_FEATURES_MIN_WRITER_VERSION) } else { diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala index dc013656bb1..d87b2eebd3c 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaSuite.scala @@ -1305,8 +1305,8 @@ class DeltaSuite extends QueryTest .save(tempDir.toString) } checkError( - exception = e, - errorClass = "DELTA_FAILED_TO_MERGE_FIELDS", + e, + "DELTA_FAILED_TO_MERGE_FIELDS", parameters = Map("currentField" -> "value", "updateField" -> "value")) } } @@ -3000,7 +3000,7 @@ class DeltaNameColumnMappingSuite extends DeltaSuite .save(tempDir.getCanonicalPath) val protocol = DeltaLog.forTable(spark, tempDir).snapshot.protocol - val (r, w) = if (protocol.supportsReaderFeatures || protocol.supportsWriterFeatures) { + val (r, w) = if (protocol.supportsTableFeatures) { (TableFeatureProtocolUtils.TABLE_FEATURES_MIN_READER_VERSION, TableFeatureProtocolUtils.TABLE_FEATURES_MIN_WRITER_VERSION) } else { diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableCreationTests.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableCreationTests.scala index ee98efbff9b..5b5db01d387 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableCreationTests.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableCreationTests.scala @@ -1207,7 +1207,7 @@ trait DeltaTableCreationTests test("create datasource table with a non-existing location") { withTempPath { dir => withTable("t") { - spark.sql(s"CREATE TABLE t(a int, b int) USING delta LOCATION '${dir.toURI}'") + spark.sql(s"CREATE TABLE t(a int, b int) USING delta LOCATION '${dir.getAbsolutePath}'") val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) assert(table.location == makeQualifiedPath(dir.getAbsolutePath)) @@ -1225,17 +1225,21 @@ trait 
DeltaTableCreationTests withTempPath { dir => withTable("t1") { spark.sql( - s"CREATE TABLE t1(a int, b int) USING delta PARTITIONED BY(a) LOCATION '${dir.toURI}'") + s""" + |CREATE TABLE t1(a int, b int) USING delta PARTITIONED BY(a) + |LOCATION '${dir.getAbsolutePath}' + |""".stripMargin) val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t1")) assert(table.location == makeQualifiedPath(dir.getAbsolutePath)) Seq((1, 2)).toDF("a", "b") - .write.format("delta").mode("append").save(table.location.toString) - val read = spark.read.format("delta").load(table.location.toString) + .write.format("delta").mode("append").save(table.location.getPath) + val read = spark.read.format("delta").load(table.location.getPath) checkAnswer(read, Seq(Row(1, 2))) - val deltaLog = loadDeltaLog(table.location.toString) + val deltaLog = loadDeltaLog(table.location.getPath) + assert(deltaLog.update().version > 0) assertPartitionWithValueExists("a", "1", deltaLog) } } @@ -1252,7 +1256,7 @@ trait DeltaTableCreationTests s""" |CREATE TABLE t |USING delta - |LOCATION '${dir.toURI}' + |LOCATION '${dir.getAbsolutePath}' |AS SELECT 3 as a, 4 as b, 1 as c, 2 as d """.stripMargin) val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t")) @@ -1262,6 +1266,7 @@ trait DeltaTableCreationTests // Query the data and the metadata directly via the DeltaLog val deltaLog = getDeltaLog(table) + assert(deltaLog.update().version >= 0) assertEqual(deltaLog.snapshot.schema, new StructType() .add("a", "integer").add("b", "integer") @@ -1290,7 +1295,7 @@ trait DeltaTableCreationTests |CREATE TABLE t1 |USING delta |PARTITIONED BY(a, b) - |LOCATION '${dir.toURI}' + |LOCATION '${dir.getAbsolutePath}' |AS SELECT 3 as a, 4 as b, 1 as c, 2 as d """.stripMargin) val table = spark.sessionState.catalog.getTableMetadata(TableIdentifier("t1")) @@ -1380,12 +1385,12 @@ trait DeltaTableCreationTests dir.delete() Seq((3, 4)).toDF("a", "b") .write.format("delta") - .save(dir.toString) + .save(dir.getAbsolutePath) val ex = intercept[AnalysisException](spark.sql( s""" |CREATE TABLE t |USING delta - |LOCATION '${dir.toURI}' + |LOCATION '${dir.getAbsolutePath}' |AS SELECT 1 as a, 2 as b """.stripMargin)) assert(ex.getMessage.contains("Cannot create table")) @@ -1395,14 +1400,12 @@ trait DeltaTableCreationTests withTable("t") { withTempDir { dir => dir.delete() - Seq((3, 4)).toDF("a", "b") - .write.format("parquet") - .save(dir.toString) + Seq((3, 4)).toDF("a", "b").write.format("parquet").save(dir.getCanonicalPath) val ex = intercept[AnalysisException](spark.sql( s""" |CREATE TABLE t |USING delta - |LOCATION '${dir.toURI}' + |LOCATION '${dir.getAbsolutePath}' |AS SELECT 1 as a, 2 as b """.stripMargin)) assert(ex.getMessage.contains("Cannot create table")) @@ -1474,13 +1477,14 @@ trait DeltaTableCreationTests |CREATE TABLE t(a string, `$specialChars` string) |USING delta |PARTITIONED BY(`$specialChars`) - |LOCATION '${dir.toURI}' + |LOCATION '${dir.getAbsolutePath}' """.stripMargin) assert(dir.listFiles().forall(_.toString.contains("_delta_log"))) spark.sql(s"INSERT INTO TABLE t SELECT 1, 2") - val deltaLog = loadDeltaLog(dir.toString) + val deltaLog = loadDeltaLog(dir.getAbsolutePath) + assert(deltaLog.update().version > 0) assertPartitionWithValueExists(specialChars, "2", deltaLog) checkAnswer(spark.table("t"), Row("1", "2") :: Nil) @@ -2387,8 +2391,8 @@ class DeltaTableCreationSuite s" LOCATION '${subdir.getCanonicalPath}'") } checkError( - exception = e, - errorClass = 
"DELTA_METADATA_ABSENT_EXISTING_CATALOG_TABLE", + e, + "DELTA_METADATA_ABSENT_EXISTING_CATALOG_TABLE", parameters = Map( "tableName" -> tableName, "tablePath" -> deltaLog.logPath.toString, @@ -2428,8 +2432,6 @@ trait DeltaTableCreationColumnMappingSuiteBase extends DeltaColumnMappingSelecte class DeltaTableCreationIdColumnMappingSuite extends DeltaTableCreationSuite with DeltaColumnMappingEnableIdMode { - override val defaultTempDirPrefix = "spark" - override protected def getTableProperties(tableName: String): Map[String, String] = { // ignore comparing column mapping properties dropColumnMappingConfigurations(super.getTableProperties(tableName)) @@ -2438,7 +2440,6 @@ class DeltaTableCreationIdColumnMappingSuite extends DeltaTableCreationSuite class DeltaTableCreationNameColumnMappingSuite extends DeltaTableCreationSuite with DeltaColumnMappingEnableNameMode { - override val defaultTempDirPrefix = "spark" override protected def getTableProperties(tableName: String): Map[String, String] = { // ignore comparing column mapping properties diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableFeatureSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableFeatureSuite.scala index 261aa06c6ee..38f1e275ad3 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableFeatureSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaTableFeatureSuite.scala @@ -197,9 +197,8 @@ class DeltaTableFeatureSuite val protocol = Protocol(2, TABLE_FEATURES_MIN_WRITER_VERSION).withFeature(TestLegacyReaderWriterFeature) - assert(!protocol.readerFeatures.isDefined) - assert( - protocol.writerFeatures.get === Set(TestLegacyReaderWriterFeature.name)) + assert(protocol.readerFeatures.get === Set.empty) + assert(protocol.writerFeatures.get === Set(TestLegacyReaderWriterFeature.name)) } test("merge protocols") { @@ -406,9 +405,10 @@ class DeltaTableFeatureSuite withTable("tbl") { spark.range(0).write.format("delta").saveAsTable("tbl") val log = DeltaLog.forTable(spark, TableIdentifier("tbl")) - val protocol = log.update().protocol - assert(protocol.minReaderVersion === 2) - assert(protocol.minWriterVersion === 5) + assert(log.update().protocol === Protocol(2, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + ColumnMappingTableFeature))) val tblProperties = Seq(s"'$FEATURE_PROP_PREFIX${TestWriterFeature.name}' = 'enabled'", s"'delta.minWriterVersion' = $TABLE_FEATURES_MIN_WRITER_VERSION") sql(buildTablePropertyModifyingCommand( @@ -416,16 +416,9 @@ class DeltaTableFeatureSuite val newProtocol = log.update().protocol assert(newProtocol.readerAndWriterFeatureNames === Set( AppendOnlyTableFeature.name, - ColumnMappingTableFeature.name, InvariantsTableFeature.name, - CheckConstraintsTableFeature.name, - ChangeDataFeedTableFeature.name, - GeneratedColumnsTableFeature.name, - TestWriterFeature.name, - TestLegacyWriterFeature.name, - TestLegacyReaderWriterFeature.name, - TestRemovableLegacyWriterFeature.name, - TestRemovableLegacyReaderWriterFeature.name)) + ColumnMappingTableFeature.name, + TestWriterFeature.name)) } } } @@ -469,7 +462,8 @@ class DeltaTableFeatureSuite // Add coordinated commits table feature to the table CommitCoordinatorProvider.registerBuilder(InMemoryCommitCoordinatorBuilder(batchSize = 100)) val tblProperties1 = - Seq(s"'${DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'in-memory'") + Seq(s"'${DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'in-memory'", + 
s"'${DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '{}'") sql(buildTablePropertyModifyingCommand( "ALTER", targetTableName = table, sourceTableName = table, tblProperties1)) diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaVacuumSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaVacuumSuite.scala index 63122d66bd3..a4851054a54 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaVacuumSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaVacuumSuite.scala @@ -28,6 +28,7 @@ import org.apache.spark.sql.delta.DeltaTestUtils.createTestAddFile import org.apache.spark.sql.delta.DeltaVacuumSuiteShims._ import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, Metadata, RemoveFile} import org.apache.spark.sql.delta.commands.VacuumCommand +import org.apache.spark.sql.delta.coordinatedcommits.CoordinatedCommitsBaseSuite import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.spark.sql.delta.test.DeltaSQLTestUtils @@ -56,7 +57,8 @@ trait DeltaVacuumSuiteBase extends QueryTest with GivenWhenThen with DeltaSQLTestUtils with DeletionVectorsTestUtils - with DeltaTestUtilsForTempViews { + with DeltaTestUtilsForTempViews + with CoordinatedCommitsBaseSuite { private def executeWithEnvironment(file: File)(f: (File, ManualClock) => Unit): Unit = { val clock = new ManualClock() @@ -1355,3 +1357,7 @@ class DeltaVacuumSuite timeGapHours = 10 ) } + +class DeltaVacuumWithCoordinatedCommitsBatch100Suite extends DeltaVacuumSuite { + override val coordinatedCommitsBackfillBatchSize: Option[Int] = Some(100) +} diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaWithNewTransactionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaWithNewTransactionSuite.scala index 79fa8a3fb63..981295ba8cd 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DeltaWithNewTransactionSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DeltaWithNewTransactionSuite.scala @@ -263,16 +263,38 @@ trait DeltaWithNewTransactionSuiteBase extends QueryTest withTempDir { dir => val log = DeltaLog.forTable(spark, dir.getCanonicalPath) log.withNewTransaction { txn => - - require(OptimisticTransaction.getActive().nonEmpty) + assert(OptimisticTransaction.getActive() === Some(txn)) intercept[IllegalStateException] { - OptimisticTransaction.setActive(txn) + log.withNewTransaction { txn2 => } + } + assert(OptimisticTransaction.getActive() === Some(txn)) + } + assert(OptimisticTransaction.getActive().isEmpty) + } + } + + test("withActiveTxn idempotency") { + withTempDir { dir => + val log = DeltaLog.forTable(spark, dir.getCanonicalPath) + val txn = log.startTransaction() + assert(OptimisticTransaction.getActive().isEmpty) + OptimisticTransaction.withActive(txn) { + assert(OptimisticTransaction.getActive() === Some(txn)) + OptimisticTransaction.withActive(txn) { + assert(OptimisticTransaction.getActive() === Some(txn)) } + assert(OptimisticTransaction.getActive() === Some(txn)) + val txn2 = log.startTransaction() intercept[IllegalStateException] { - log.withNewTransaction { txn2 => } + OptimisticTransaction.withActive(txn2) { } + } + intercept[IllegalStateException] { + OptimisticTransaction.setActive(txn2) } + assert(OptimisticTransaction.getActive() === Some(txn)) } + assert(OptimisticTransaction.getActive().isEmpty) } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/DescribeDeltaHistorySuite.scala 
b/spark/src/test/scala/org/apache/spark/sql/delta/DescribeDeltaHistorySuite.scala index 917a04ef5f7..20fdd92d18a 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/DescribeDeltaHistorySuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/DescribeDeltaHistorySuite.scala @@ -424,7 +424,7 @@ trait DescribeDeltaHistorySuiteBase Seq("UPGRADE PROTOCOL", s"""{"minReaderVersion":$readerVersion,""" + s""""minWriterVersion":$writerVersion,""" + - s""""readerFeatures":["${TestLegacyReaderWriterFeature.name}"],""" + + s""""readerFeatures":[],""" + s""""writerFeatures":["${TestLegacyReaderWriterFeature.name}"]}"""), Seq($"operation", $"operationParameters.newProtocol")) // scalastyle:on line.size.limit diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/GeneratedColumnSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/GeneratedColumnSuite.scala index 044f7adb83d..218b51b677f 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/GeneratedColumnSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/GeneratedColumnSuite.scala @@ -21,6 +21,7 @@ import java.io.PrintWriter import scala.collection.JavaConverters._ +import org.apache.spark.sql.delta.actions.Protocol import org.apache.spark.sql.delta.commands.cdc.CDCReader import org.apache.spark.sql.delta.schema.{DeltaInvariantViolationException, InvariantViolationException, SchemaUtils} import org.apache.spark.sql.delta.sources.DeltaSourceUtils.GENERATION_EXPRESSION_METADATA_KEY @@ -718,12 +719,12 @@ trait GeneratedColumnSuiteBase assert(tableSchema == spark.table(table).schema) // Insert a LONG to `c1` should fail rather than changing the `c1` type to LONG. checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { Seq(32767.toLong).toDF("c1").write.format("delta").mode("append") .option("mergeSchema", "true") .saveAsTable(table) }, - errorClass = "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", + "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", parameters = Map( "columnName" -> "c1", "columnType" -> "INT", @@ -753,14 +754,14 @@ trait GeneratedColumnSuiteBase // Insert an INT to `a` should fail rather than changing the `a` type to INT checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { Seq((32767, 32767)).toDF("a", "c1a") .selectExpr("a", "named_struct('a', c1a) as c1") .write.format("delta").mode("append") .option("mergeSchema", "true") .saveAsTable(table) }, - errorClass = "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", + "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", parameters = Map( "columnName" -> "a", "columnType" -> "SMALLINT", @@ -787,25 +788,17 @@ trait GeneratedColumnSuiteBase test("changing the type of nested field not referenced by a generated col") { withTableName("disallow_column_type_evolution") { table => - createTable(table, None, "t STRUCT, gen SMALLINT", - Map("gen" -> "CAST(HASH(t.a - 10s) AS SMALLINT)"), Nil) + createTable(table, None, "t STRUCT, gen INT", + Map("gen" -> "HASH(t.a)"), Nil) - checkError( - exception = intercept[AnalysisException] { - Seq((32767.toShort, 32767)).toDF("a", "b") - .selectExpr("named_struct('a', a, 'b', b) as t") - .write.format("delta").mode("append") - .option("mergeSchema", "true") - .saveAsTable(table) - }, - errorClass = "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", - parameters = Map( - "columnName" -> "t", - "columnType" -> "STRUCT", - "dataType" -> "STRUCT", - "generatedColumns" -> "gen -> CAST(HASH(t.a - 10s) AS SMALLINT)" - ) - ) + // changing the type of `t.b` should succeed 
since it is not being + // referenced by any CHECK constraints or generated columns. + Seq((32767.toShort, 32767)).toDF("a", "b") + .selectExpr("named_struct('a', a, 'b', b) as t") + .write.format("delta").mode("append") + .option("mergeSchema", "true") + .saveAsTable(table) + checkAnswer(spark.table(table), Row(Row(32767, 32767), 1249274084) :: Nil) } } @@ -1006,28 +999,25 @@ trait GeneratedColumnSuiteBase test("using generated columns should upgrade the protocol") { withTableName("upgrade_protocol") { table => - def protocolVersions: (Int, Int) = { - sql(s"DESC DETAIL $table") - .select("minReaderVersion", "minWriterVersion") - .as[(Int, Int)] - .head() - } - - // Use the default protocol versions when not using computed partitions + // Use the default protocol versions when not using computed partitions. createTable(table, None, "i INT", Map.empty, Seq.empty) - assert(protocolVersions == (1, 2)) + val deltaLog = DeltaLog.forTable(spark, TableIdentifier(tableName = table)) + assert(deltaLog.update().protocol == Protocol(1, 2)) assert(DeltaLog.forTable(spark, TableIdentifier(tableName = table)).snapshot.version == 0) - // Protocol versions should be upgraded when using computed partitions + // Protocol versions should be upgraded when using computed partitions. replaceTable( table, None, defaultTestTableSchema, defaultTestTableGeneratedColumns, defaultTestTablePartitionColumns) - assert(protocolVersions == (1, 4)) + assert(deltaLog.update().protocol == Protocol(1, 7).withFeatures(Seq( + AppendOnlyTableFeature, + InvariantsTableFeature, + GeneratedColumnsTableFeature))) // Make sure we did overwrite the table rather than deleting and re-creating. - assert(DeltaLog.forTable(spark, TableIdentifier(tableName = table)).snapshot.version == 1) + assert(DeltaLog.forTable(spark, TableIdentifier(tableName = table)).update().version == 1) } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/HiveConvertToDeltaSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/HiveConvertToDeltaSuite.scala index 597d4a63126..d2d826e514f 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/HiveConvertToDeltaSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/HiveConvertToDeltaSuite.scala @@ -130,7 +130,7 @@ abstract class HiveConvertToDeltaSuiteBase test("convert a Hive based external parquet table") { val tbl = "hive_parquet" - withTempDir(prefix = "spark") { dir => + withTempDir { dir => withTable(tbl) { sql( s""" diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnConflictSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnConflictSuite.scala index 0caff6363b3..e56b57aa21b 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnConflictSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnConflictSuite.scala @@ -162,7 +162,7 @@ trait IdentityColumnConflictSuiteBase if (currentTxn.isInstanceOf[RowTrackingEnablementOnlyTestCase]) { txnObserver.setNextObserver(metadataUpdateObserver, autoAdvance = true) unblockAllPhases(txnObserver) - txnObserver.phases.backfillPhase.exitBarrier.unblock() + txnObserver.phases.postCommitPhase.exitBarrier.unblock() txnObserver = metadataUpdateObserver } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnSuite.scala index 00a146dd09d..7aac422bd80 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnSuite.scala +++ 
b/spark/src/test/scala/org/apache/spark/sql/delta/IdentityColumnSuite.scala @@ -333,7 +333,7 @@ trait IdentityColumnSuiteBase extends IdentityColumnTestUtils { TestColumnSpec(colName = "id", dataType = LongType), TestColumnSpec(colName = "value", dataType = IntegerType)) ) - assert(getProtocolVersions == (1, 2) || getProtocolVersions == (2, 5)) + assert(getProtocolVersions == (1, 2) || getProtocolVersions == (2, 7)) assert(DeltaLog.forTable(spark, TableIdentifier(tblName)).snapshot.version == 0) replaceTable( @@ -347,8 +347,11 @@ trait IdentityColumnSuiteBase extends IdentityColumnTestUtils { TestColumnSpec(colName = "value", dataType = IntegerType) ) ) - assert(getProtocolVersions == (1, 6) || getProtocolVersions == (2, 6)) - assert(DeltaLog.forTable(spark, TableIdentifier(tblName)).snapshot.version == 1) + val deltaLog = DeltaLog.forTable(spark, TableIdentifier(tblName)) + val protocol = deltaLog.update().protocol + assert(getProtocolVersions == (1, 7) || + protocol.readerAndWriterFeatures.contains(IdentityColumnsTableFeature)) + assert(deltaLog.update().version == 1) } } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/InCommitTimestampSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/InCommitTimestampSuite.scala index 15f162224dc..14e80aeea0d 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/InCommitTimestampSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/InCommitTimestampSuite.scala @@ -29,6 +29,7 @@ import org.apache.spark.sql.delta.actions.{Action, CommitInfo} import org.apache.spark.sql.delta.coordinatedcommits.{CommitCoordinatorProvider, CoordinatedCommitsBaseSuite, CoordinatedCommitsTestUtils, TrackingInMemoryCommitCoordinatorBuilder} import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.test.DeltaSQLCommandTest +import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.{DeltaCommitFileProvider, FileNames, JsonUtils} import org.apache.hadoop.fs.Path @@ -204,8 +205,8 @@ class InCommitTimestampSuite latestSnapshot.timestamp } checkError( - exception = e, - errorClass = "DELTA_MISSING_COMMIT_INFO", + e, + "DELTA_MISSING_COMMIT_INFO", parameters = Map( "featureName" -> InCommitTimestampTableFeature.name, "version" -> "1")) @@ -243,8 +244,8 @@ class InCommitTimestampSuite latestSnapshot.timestamp } checkError( - exception = e, - errorClass = "DELTA_MISSING_COMMIT_TIMESTAMP", + e, + "DELTA_MISSING_COMMIT_TIMESTAMP", parameters = Map("featureName" -> InCommitTimestampTableFeature.name, "version" -> "1")) } } @@ -261,24 +262,26 @@ class InCommitTimestampSuite } test("CREATE OR REPLACE should not disable ICT") { - withSQLConf( - DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED.defaultTablePropertyKey -> false.toString - ) { - withTempDir { tempDir => - spark.range(10).write.format("delta").save(tempDir.getAbsolutePath) - spark.sql( - s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` " + - s"SET TBLPROPERTIES ('${DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED.key}' = 'true')") - - spark.sql( - s"CREATE OR REPLACE TABLE delta.`${tempDir.getAbsolutePath}` (id long) USING delta") - - val deltaLogAfterCreateOrReplace = - DeltaLog.forTable(spark, new Path(tempDir.getCanonicalPath)) - val snapshot = deltaLogAfterCreateOrReplace.snapshot - assert(DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED.fromMetaData(snapshot.metadata)) - assert(snapshot.timestamp == - getInCommitTimestamp(deltaLogAfterCreateOrReplace, snapshot.version)) + withoutCoordinatedCommitsDefaultTableProperties { + 
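+ // Opt out of the coordinated-commits default table properties here so that the
+ // ICT-disabled starting state set up below is preserved for this test.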
withSQLConf( + DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED.defaultTablePropertyKey -> false.toString + ) { + withTempDir { tempDir => + spark.range(10).write.format("delta").save(tempDir.getAbsolutePath) + spark.sql( + s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` " + + s"SET TBLPROPERTIES ('${DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED.key}' = 'true')") + + spark.sql( + s"CREATE OR REPLACE TABLE delta.`${tempDir.getAbsolutePath}` (id long) USING delta") + + val deltaLogAfterCreateOrReplace = + DeltaLog.forTable(spark, new Path(tempDir.getCanonicalPath)) + val snapshot = deltaLogAfterCreateOrReplace.snapshot + assert(DeltaConfigs.IN_COMMIT_TIMESTAMPS_ENABLED.fromMetaData(snapshot.metadata)) + assert(snapshot.timestamp == + getInCommitTimestamp(deltaLogAfterCreateOrReplace, snapshot.version)) + } } } } @@ -1016,7 +1019,7 @@ class InCommitTimestampWithCoordinatedCommitsSuite val commitFileProvider = DeltaCommitFileProvider(deltaLog.update()) val unbackfilledCommits = tableCommitCoordinatorClient - .getCommits(Some(1)) + .getCommits(Some(1L)) .getCommits.asScala .map { commit => DeltaHistoryManager.Commit(commit.getVersion, commit.getCommitTimestamp)} val commits = (Seq(commit0) ++ unbackfilledCommits).toList diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/OptimisticTransactionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/OptimisticTransactionSuite.scala index 851d5545104..4daff9811e3 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/OptimisticTransactionSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/OptimisticTransactionSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.{FileNames, JsonUtils} import io.delta.storage.LogStore -import io.delta.storage.commit.{CommitCoordinatorClient, CommitFailedException, CommitResponse, UpdatedActions} +import io.delta.storage.commit.{CommitCoordinatorClient, CommitFailedException, CommitResponse, TableDescriptor, UpdatedActions} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -501,15 +501,19 @@ class OptimisticTransactionSuite override def commit( logStore: LogStore, hadoopConf: Configuration, - tablePath: Path, - tableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { // Fail all commits except first one if (commitVersion == 0) { return super.commit( - logStore, hadoopConf, tablePath, tableConf, commitVersion, actions, updatedActions) + logStore, + hadoopConf, + tableDesc, + commitVersion, + actions, + updatedActions) } commitAttempts += 1 throw new CommitFailedException( @@ -557,15 +561,19 @@ class OptimisticTransactionSuite override def commit( logStore: LogStore, hadoopConf: Configuration, - tablePath: Path, - tableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { // Fail all commits except first one if (commitVersion == 0) { return super.commit( - logStore, hadoopConf, tablePath, tableConf, commitVersion, actions, updatedActions) + logStore, + hadoopConf, + tableDesc, + commitVersion, + actions, + updatedActions) } commitAttempts += 1 throw new FileAlreadyExistsException("Commit-File Already Exists") @@ -868,8 +876,7 @@ class OptimisticTransactionSuite override def commit( logStore: 
LogStore, hadoopConf: Configuration, - tablePath: Path, - tableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { @@ -878,8 +885,7 @@ class OptimisticTransactionSuite deltaLog.startTransaction().commit(addB :: Nil, ManualUpdate) throw new CommitFailedException(true, conflict, "") } - super.commit( - logStore, hadoopConf, tablePath, tableConf, commitVersion, actions, updatedActions) + super.commit(logStore, hadoopConf, tableDesc, commitVersion, actions, updatedActions) } } object RetryableConflictCommitCoordinatorBuilder$ extends CommitCoordinatorBuilder { diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/RestoreTableSuiteBase.scala b/spark/src/test/scala/org/apache/spark/sql/delta/RestoreTableSuiteBase.scala index f52307ef39e..3f438d2a04a 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/RestoreTableSuiteBase.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/RestoreTableSuiteBase.scala @@ -251,11 +251,11 @@ trait RestoreTableSuiteBase extends QueryTest with SharedSparkSession deltaLog.upgradeProtocol( Protocol(TABLE_FEATURES_MIN_READER_VERSION, TABLE_FEATURES_MIN_WRITER_VERSION) .withFeatures(Seq(TestLegacyReaderWriterFeature)) - .withFeatures(oldProtocolVersion.implicitlySupportedFeatures)) + .withFeatures(oldProtocolVersion.implicitlyAndExplicitlySupportedFeatures)) val newProtocolVersion = deltaLog.snapshot.protocol assert( newProtocolVersion.minReaderVersion > oldProtocolVersion.minReaderVersion && - newProtocolVersion.minWriterVersion > oldProtocolVersion.minWriterVersion, + newProtocolVersion.minWriterVersion >= oldProtocolVersion.minWriterVersion, s"newProtocolVersion=$newProtocolVersion is not strictly greater than" + s" oldProtocolVersion=$oldProtocolVersion") diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/S3SingleDriverLogStoreSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/S3SingleDriverLogStoreSuite.scala index 7c869bab726..ba65bd9b886 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/S3SingleDriverLogStoreSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/S3SingleDriverLogStoreSuite.scala @@ -66,86 +66,6 @@ trait S3SingleDriverLogStoreSuiteBase extends LogStoreSuiteBase { } } - test("cache works") { - withTempDir { dir => - val store = createLogStore(spark) - val deltas = - Seq(0, 1, 2, 3, 4).map(i => FileNames.unsafeDeltaFile(new Path(dir.toURI), i)) - store.write(deltas(0), Iterator("zero"), overwrite = false, sessionHadoopConf) - store.write(deltas(1), Iterator("one"), overwrite = false, sessionHadoopConf) - store.write(deltas(2), Iterator("two"), overwrite = false, sessionHadoopConf) - - // delete delta file 2 from file system - val fs = new Path(dir.getCanonicalPath).getFileSystem(sessionHadoopConf) - fs.delete(deltas(2), true) - - // file system listing doesn't see file 2 - checkFileSystemList(fs, deltas(0), Seq(0, 1)) - - // can't re-write because cache says it still exists - intercept[java.nio.file.FileAlreadyExistsException] { - store.write(deltas(2), Iterator("two"), overwrite = false, sessionHadoopConf) - } - - // log store list still sees file 2 as it's cached - checkLogStoreList(store, deltas(0), Seq(0, 1, 2), sessionHadoopConf) - - if (canInvalidateCache) { - // clear the cache - store.invalidateCache() - - // log store list doesn't see file 2 anymore - checkLogStoreList(store, deltas(0), Seq(0, 1), sessionHadoopConf) - - // write a new file 2 - 
store.write(deltas(2), Iterator("two"), overwrite = false, sessionHadoopConf) - } - - // add a file 3 to cache only - store.write(deltas(3), Iterator("three"), overwrite = false, sessionHadoopConf) - fs.delete(deltas(3), true) - - // log store listing returns a union of: - // 1) file system listing: 0, 1, 2 - // 2a) cache listing - canInvalidateCache=true: 2, 3 - // 2b) cache listing - canInvalidateCache=false: 0, 1, 2, 3 - checkLogStoreList(store, deltas(0), Seq(0, 1, 2, 3), sessionHadoopConf) - } - } - - test("cache works correctly when writing an initial log version") { - withTempDir { rootDir => - val dir = new File(rootDir, "_delta_log") - dir.mkdir() - val store = createLogStore(spark) - val deltas = - Seq(0, 1, 2).map(i => FileNames.unsafeDeltaFile(new Path(dir.toURI), i)) - store.write(deltas(0), Iterator("log version 0"), overwrite = false, sessionHadoopConf) - store.write(deltas(1), Iterator("log version 1"), overwrite = false, sessionHadoopConf) - store.write(deltas(2), Iterator("log version 2"), overwrite = false, sessionHadoopConf) - - val fs = new Path(dir.getCanonicalPath).getFileSystem(sessionHadoopConf) - // delete all log files - fs.delete(deltas(2), true) - fs.delete(deltas(1), true) - fs.delete(deltas(0), true) - - // can't write a new version 1 as it's in cache - intercept[java.nio.file.FileAlreadyExistsException] { - store.write(deltas(1), Iterator("new log version 1"), overwrite = false, sessionHadoopConf) - } - - // all three log files still in cache - checkLogStoreList(store, deltas(0), Seq(0, 1, 2), sessionHadoopConf) - - // can write a new version 0 as it's the initial version of the log - store.write(deltas(0), Iterator("new log version 0"), overwrite = false, sessionHadoopConf) - - // writing a new initial version invalidates all files in that log - checkLogStoreList(store, deltas(0), Seq(0), sessionHadoopConf) - } - } - protected def shouldUseRenameToWriteCheckpoint: Boolean = false /** diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/SchemaValidationSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/SchemaValidationSuite.scala index 0e9e1c47515..70924195f35 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/SchemaValidationSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/SchemaValidationSuite.scala @@ -348,8 +348,8 @@ class SchemaValidationSuite } checkErrorMatchPVals( - exception = e, - errorClass = "DELTA_SCHEMA_CHANGE_SINCE_ANALYSIS", + e, + "DELTA_SCHEMA_CHANGE_SINCE_ANALYSIS", parameters = Map( "schemaDiff" -> ".*id.*", "legacyFlagMessage" -> "" @@ -383,8 +383,8 @@ class SchemaValidationSuite .execute() } checkErrorMatchPVals( - exception = e, - errorClass = "DELTA_SCHEMA_CHANGE_SINCE_ANALYSIS", + e, + "DELTA_SCHEMA_CHANGE_SINCE_ANALYSIS", parameters = Map( "schemaDiff" -> ".*col2.*", "legacyFlagMessage" -> "" diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/SnapshotManagementSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/SnapshotManagementSuite.scala index 785d4fb0f02..02b5e58a499 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/SnapshotManagementSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/SnapshotManagementSuite.scala @@ -34,7 +34,7 @@ import org.apache.spark.sql.delta.test.DeltaSQLTestUtils import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.{DeltaCommitFileProvider, FileNames, JsonUtils} import io.delta.storage.LogStore -import io.delta.storage.commit.{Commit, CommitCoordinatorClient, 
GetCommitsResponse} +import io.delta.storage.commit.{Commit, CommitCoordinatorClient, GetCommitsResponse, TableDescriptor} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileStatus import org.apache.hadoop.fs.Path @@ -546,8 +546,7 @@ case class ConcurrentBackfillCommitCoordinatorClient( ) extends InMemoryCommitCoordinator(batchSize) { private val deferredBackfills: mutable.Map[Long, () => Unit] = mutable.Map.empty override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, startVersion: java.lang.Long, endVersion: java.lang.Long): GetCommitsResponse = { if (ConcurrentBackfillCommitCoordinatorClient.beginConcurrentBackfills) { @@ -556,7 +555,7 @@ case class ConcurrentBackfillCommitCoordinatorClient( deferredBackfills.keys.toSeq.sorted.foreach((version: Long) => deferredBackfills(version)()) deferredBackfills.clear() } - super.getCommits(logPath, coordinatedCommitsTableConf, startVersion, endVersion) + super.getCommits(tableDesc, startVersion, endVersion) } override def backfill( logStore: LogStore, diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/TightBoundsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/TightBoundsSuite.scala index f9e515bda8b..1b771351c37 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/TightBoundsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/TightBoundsSuite.scala @@ -25,6 +25,8 @@ import org.apache.spark.sql.delta.stats.DeltaStatistics.{MIN, NULL_COUNT, NUM_RE import org.apache.spark.sql.delta.stats.StatisticsCollection import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.spark.sql.delta.test.DeltaTestImplicits._ +import org.apache.spark.sql.delta.util.JsonUtils +import com.fasterxml.jackson.databind.node.ObjectNode import org.apache.spark.sql.{DataFrame, QueryTest, Row} import org.apache.spark.sql.functions.{col, lit, map_values, when} @@ -113,13 +115,21 @@ class TightBoundsSuite } val exception = intercept[DeltaIllegalStateException] { - txn.commitManually(addFiles: _*) + txn.commitActions(DeltaOperations.TestOperation(), addFiles: _*) } assert(exception.getErrorClass === "DELTA_ADDING_DELETION_VECTORS_WITH_TIGHT_BOUNDS_DISALLOWED") } } + protected def getStats(snapshot: Snapshot, statName: String): Array[Row] = { + val statsColumnName = snapshot.getBaseStatsColumnName + snapshot + .withStatsDeduplicated + .select(s"$statsColumnName.$statName") + .collect() + } + protected def getStatFromLastFile(snapshot: Snapshot, statName: String): Row = { val statsColumnName = snapshot.getBaseStatsColumnName snapshot @@ -284,6 +294,79 @@ class TightBoundsSuite assert(statsAfterDelete === expectedStatsAfterDelete) } } + + def tableAddDVAndTightStats( + targetTable: () => io.delta.tables.DeltaTable, + targetLog: DeltaLog, + deleteCond: String): Unit = { + // Add DVs. Stats should have tightBounds = false afterwards. + targetTable().delete(deleteCond) + val initialStats = getStats(targetLog.update(), "*") + assert(initialStats.forall(_.get(4) === false)) // tightBounds + + // Other systems may support Compute Stats that recomputes tightBounds stats on tables with DVs. + // Simulate this with a manual update commit that introduces tight stats. 
+ val txn = targetLog.startTransaction() + val addFiles = txn.snapshot.allFiles.collect().toSeq.map { action => + val node = JsonUtils.mapper.readTree(action.stats).asInstanceOf[ObjectNode] + assert(node.has("numRecords")) + val numRecords = node.get("numRecords").asInt() + action.copy(stats = s"""{ "numRecords" : $numRecords, "tightBounds" : true }""") + } + txn.commitActions(DeltaOperations.ManualUpdate, addFiles: _*) + } + + test("CLONE on table with DVs and tightBound stats") { + val targetDF = spark.range(0, 100, 1, 1).toDF() + withTempDeltaTable(targetDF) { (targetTable, targetLog) => + val targetPath = targetLog.dataPath.toString + tableAddDVAndTightStats(targetTable, targetLog, "id >= 80") + // CLONE shouldn't throw + // DELTA_ADDING_DELETION_VECTORS_WITH_TIGHT_BOUNDS_DISALLOWED + withTempPath("cloned") { clonedPath => + sql(s"CREATE TABLE delta.`$clonedPath` SHALLOW CLONE delta.`$targetPath`") + } + } + } + + test("RESTORE TABLE on table with DVs and tightBound stats") { + val targetDF = spark.range(0, 100, 1, 1).toDF() + withTempDeltaTable(targetDF) { (targetTable, targetLog) => + val targetPath = targetLog.dataPath.toString + // adds version 1 (delete) and 2 (compute stats) + tableAddDVAndTightStats(targetTable, targetLog, "id >= 80") + // adds version 3 (delete more) + targetTable().delete("id < 20") + // Restore back to version 2 (after compute stats) + // After 2nd delete, new DVs are added to the file, so the restore will + // have to recommit the file with old DVs. + targetTable().restoreToVersion(2) + // Verify that the restored table has DVs and tight bounds. + val stats = getStatFromLastFileWithDVs(targetLog.update(), "*") + assert(stats.get(4) === true) // tightBounds + } + } + + test("Row Tracking backfill on table with DVs and tightBound stats") { + // Enabling Row Tracking and backfill shouldn't throw + // DELTA_ADDING_DELETION_VECTORS_WITH_TIGHT_BOUNDS_DISALLOWED + withSQLConf(DeltaConfigs.ROW_TRACKING_ENABLED.defaultTablePropertyKey -> "false") { + val targetDF = spark.range(0, 100, 1, 1).toDF() + withTempDeltaTable(targetDF) { (targetTable, targetLog) => + val targetPath = targetLog.dataPath.toString + tableAddDVAndTightStats(targetTable, targetLog, "id >= 80") + // Make sure that we start with no RowTracking feature. 
+ assert(!RowTracking.isSupported(targetLog.unsafeVolatileSnapshot.protocol)) + assert(!RowId.isEnabled(targetLog.unsafeVolatileSnapshot.protocol, + targetLog.unsafeVolatileSnapshot.metadata)) + + sql(s"ALTER TABLE delta.`$targetPath` SET TBLPROPERTIES " + + "('delta.enableRowTracking' = 'true')") + assert(targetLog.history.getHistory(None) + .count(_.operation == DeltaOperations.ROW_TRACKING_BACKFILL_OPERATION_NAME) == 1) + } + } + } } class TightBoundsColumnMappingSuite extends TightBoundsSuite with DeltaColumnMappingEnableIdMode diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/UpdateSQLSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/UpdateSQLSuite.scala index fcfd59cca8b..fa046d00cbc 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/UpdateSQLSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/UpdateSQLSuite.scala @@ -117,10 +117,10 @@ class UpdateSQLSuite extends UpdateSuiteBase SQLConf.STORE_ASSIGNMENT_POLICY.key -> StoreAssignmentPolicy.STRICT.toString, DeltaSQLConf.UPDATE_AND_MERGE_CASTING_FOLLOWS_ANSI_ENABLED_FLAG.key -> "false") { checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { executeUpdate(target = s"delta.`$tempPath`", set = "value = 'false'") }, - errorClass = "CANNOT_UP_CAST_DATATYPE", + "CANNOT_UP_CAST_DATATYPE", parameters = Map( "expression" -> "'false'", "sourceType" -> toSQLType("STRING"), @@ -139,11 +139,11 @@ class UpdateSQLSuite extends UpdateSuiteBase SQLConf.STORE_ASSIGNMENT_POLICY.key -> StoreAssignmentPolicy.STRICT.toString, DeltaSQLConf.UPDATE_AND_MERGE_CASTING_FOLLOWS_ANSI_ENABLED_FLAG.key -> "false") { checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { executeUpdate(target = s"delta.`$tempPath`", set = "value = '5'") }, - errorClass = "CANNOT_UP_CAST_DATATYPE", - parameters = Map( + "CANNOT_UP_CAST_DATATYPE", + parameters = Map( "expression" -> "'5'", "sourceType" -> toSQLType("STRING"), "targetType" -> toSQLType("INT"), diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/columnmapping/DropColumnMappingFeatureSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/columnmapping/DropColumnMappingFeatureSuite.scala index 0e9ac064806..be5abc7f376 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/columnmapping/DropColumnMappingFeatureSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/columnmapping/DropColumnMappingFeatureSuite.scala @@ -58,8 +58,7 @@ class DropColumnMappingFeatureSuite extends RemoveColumnMappingSuiteUtils { dropColumnMappingTableFeature() } checkError(e, - errorClass = DeltaErrors.dropTableFeatureFeatureNotSupportedByProtocol(".") - .getErrorClass, + DeltaErrors.dropTableFeatureFeatureNotSupportedByProtocol(".").getErrorClass, parameters = Map("feature" -> "columnMapping")) } @@ -76,7 +75,7 @@ class DropColumnMappingFeatureSuite extends RemoveColumnMappingSuiteUtils { dropColumnMappingTableFeature() } checkError(e, - errorClass = "DELTA_INVALID_COLUMN_NAMES_WHEN_REMOVING_COLUMN_MAPPING", + "DELTA_INVALID_COLUMN_NAMES_WHEN_REMOVING_COLUMN_MAPPING", parameters = Map("invalidColumnNames" -> "col1 with special chars ,;{}()\n\t=")) } @@ -125,7 +124,7 @@ class DropColumnMappingFeatureSuite extends RemoveColumnMappingSuiteUtils { } checkError( e, - errorClass = "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", + "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", parameters = Map( "feature" -> "columnMapping", "logRetentionPeriodKey" -> "delta.logRetentionDuration", @@ -168,7 +167,7 @@ class 
DropColumnMappingFeatureSuite extends RemoveColumnMappingSuiteUtils { } checkError( e, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> "columnMapping", "logRetentionPeriodKey" -> "delta.logRetentionDuration", diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/commands/backfill/RowTrackingBackfillConflictsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/commands/backfill/RowTrackingBackfillConflictsSuite.scala index 11aaf8e213f..fa991595c50 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/commands/backfill/RowTrackingBackfillConflictsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/commands/backfill/RowTrackingBackfillConflictsSuite.scala @@ -418,7 +418,7 @@ class RowTrackingBackfillConflictsSuite extends RowTrackingBackfillConflictsTest backfillObserver.setNextObserver( NoOpTransactionExecutionObserver, autoAdvance = true) unblockCommit(backfillObserver) - backfillObserver.phases.backfillPhase.exitBarrier.unblock() + backfillObserver.phases.postCommitPhase.exitBarrier.unblock() waitForCommit(backfillObserver) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionObserverSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionObserverSuite.scala index ce7c80a48e5..f6c8070ce73 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionObserverSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionObserverSuite.scala @@ -61,11 +61,13 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio assert(!observer.phases.preparePhase.hasEntered) assert(!observer.phases.commitPhase.hasEntered) assert(!observer.phases.backfillPhase.hasEntered) + assert(!observer.phases.postCommitPhase.hasEntered) // allow things to progress observer.phases.preparePhase.entryBarrier.unblock() observer.phases.commitPhase.entryBarrier.unblock() observer.phases.backfillPhase.entryBarrier.unblock() + observer.phases.postCommitPhase.entryBarrier.unblock() val removedFiles = txn.snapshot.allFiles.collect().map(_.remove).toSeq txn.commit(removedFiles, DeltaOperations.ManualUpdate) @@ -75,6 +77,8 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio assert(observer.phases.commitPhase.hasLeft) assert(observer.phases.backfillPhase.hasEntered) assert(observer.phases.backfillPhase.hasLeft) + assert(observer.phases.postCommitPhase.hasEntered) + assert(observer.phases.postCommitPhase.hasLeft) } } val res = spark.read.format("delta").load(tempPath).collect() @@ -126,6 +130,10 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio observer.phases.backfillPhase.entryBarrier.unblock() busyWaitFor(observer.phases.backfillPhase.hasEntered, timeout) busyWaitFor(observer.phases.backfillPhase.hasLeft, timeout) + + observer.phases.postCommitPhase.entryBarrier.unblock() + busyWaitFor(observer.phases.postCommitPhase.hasEntered, timeout) + busyWaitFor(observer.phases.postCommitPhase.hasLeft, timeout) testThread.join(timeout.toMillis) assert(!testThread.isAlive) // should have passed the barrier and completed @@ -155,6 +163,7 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio observer.phases.preparePhase.entryBarrier.unblock() observer.phases.commitPhase.entryBarrier.unblock() observer.phases.backfillPhase.entryBarrier.unblock() + 
observer.phases.postCommitPhase.entryBarrier.unblock() val removedFiles = txn.snapshot.allFiles.collect().map(_.remove).toSeq txn.commit(removedFiles, DeltaOperations.ManualUpdate) } @@ -165,6 +174,7 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio observer.phases.preparePhase.entryBarrier.unblock() observer.phases.commitPhase.entryBarrier.unblock() observer.phases.backfillPhase.entryBarrier.unblock() + observer.phases.postCommitPhase.entryBarrier.unblock() val removedFiles = txn.snapshot.allFiles.collect().map(_.remove).toSeq txn.commit(removedFiles, DeltaOperations.ManualUpdate) } @@ -221,6 +231,7 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio busyWaitFor(observer.phases.preparePhase.hasLeft, timeout) assert(!observer.phases.commitPhase.hasEntered) assert(!observer.phases.backfillPhase.hasEntered) + assert(!observer.phases.postCommitPhase.hasEntered) assertOperationNotVisible() @@ -228,6 +239,8 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio busyWaitFor(observer.phases.commitPhase.hasLeft, timeout) observer.phases.backfillPhase.entryBarrier.unblock() busyWaitFor(observer.phases.backfillPhase.hasLeft, timeout) + observer.phases.postCommitPhase.entryBarrier.unblock() + busyWaitFor(observer.phases.postCommitPhase.hasLeft, timeout) testThread.join(timeout.toMillis) assert(!testThread.isAlive) // should have passed the barrier and completed @@ -256,7 +269,7 @@ class TransactionExecutionObserverSuite extends QueryTest with SharedSparkSessio TransactionExecutionObserver.withObserver(observer) { deltaLog.withNewTransaction { txn => - observer.phases.backfillPhase.exitBarrier.unblock() + observer.phases.postCommitPhase.exitBarrier.unblock() val removedFiles = txn.snapshot.allFiles.collect().map(_.remove).toSeq txn.commit(removedFiles, DeltaOperations.ManualUpdate) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionTestMixin.scala b/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionTestMixin.scala index 8833d7464e8..059314aa072 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionTestMixin.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/concurrency/TransactionExecutionTestMixin.scala @@ -124,6 +124,7 @@ trait TransactionExecutionTestMixin { def unblockCommit(observer: TransactionObserver): Unit = { observer.phases.commitPhase.entryBarrier.unblock() observer.phases.backfillPhase.entryBarrier.unblock() + observer.phases.postCommitPhase.entryBarrier.unblock() } /** Unblocks all phases for [[TransactionObserver]] so that corresponding query can finish. 
*/ @@ -132,6 +133,7 @@ trait TransactionExecutionTestMixin { observer.phases.preparePhase.entryBarrier.unblock() observer.phases.commitPhase.entryBarrier.unblock() observer.phases.backfillPhase.entryBarrier.unblock() + observer.phases.postCommitPhase.entryBarrier.unblock() } def waitForPrecommit(observer: TransactionObserver): Unit = @@ -140,6 +142,7 @@ trait TransactionExecutionTestMixin { def waitForCommit(observer: TransactionObserver): Unit = { busyWaitFor(observer.phases.commitPhase.hasLeft, timeout) busyWaitFor(observer.phases.backfillPhase.hasLeft, timeout) + busyWaitFor(observer.phases.postCommitPhase.hasLeft, timeout) } /** @@ -153,7 +156,7 @@ trait TransactionExecutionTestMixin { waitForPrecommit(observer) unblockCommit(observer) if (hasNextObserver) { - observer.phases.backfillPhase.leave() + observer.phases.postCommitPhase.leave() } waitForCommit(observer) } @@ -222,13 +225,15 @@ trait TransactionExecutionTestMixin { // B starts and commits unblockAllPhases(observerB) - busyWaitFor(observerB.phases.backfillPhase.hasLeft, timeout) + busyWaitFor(observerB.phases.postCommitPhase.hasLeft, timeout) // A commits observerA.phases.commitPhase.entryBarrier.unblock() busyWaitFor(observerA.phases.commitPhase.hasLeft, timeout) observerA.phases.backfillPhase.entryBarrier.unblock() busyWaitFor(observerA.phases.backfillPhase.hasLeft, timeout) + observerA.phases.postCommitPhase.entryBarrier.unblock() + busyWaitFor(observerA.phases.postCommitPhase.hasLeft, timeout) } (futureA, futureB) } @@ -260,17 +265,19 @@ trait TransactionExecutionTestMixin { // B starts and commits unblockAllPhases(observerB) - busyWaitFor(observerB.phases.backfillPhase.hasLeft, timeout) + busyWaitFor(observerB.phases.postCommitPhase.hasLeft, timeout) // C starts and commits unblockAllPhases(observerC) - busyWaitFor(observerC.phases.backfillPhase.hasLeft, timeout) + busyWaitFor(observerC.phases.postCommitPhase.hasLeft, timeout) // A commits observerA.phases.commitPhase.entryBarrier.unblock() busyWaitFor(observerA.phases.commitPhase.hasLeft, timeout) observerA.phases.backfillPhase.entryBarrier.unblock() busyWaitFor(observerA.phases.backfillPhase.hasLeft, timeout) + observerA.phases.postCommitPhase.entryBarrier.unblock() + busyWaitFor(observerA.phases.postCommitPhase.hasLeft, timeout) } (futureA, futureB, futureC) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientImplSuiteBase.scala b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientImplSuiteBase.scala index 2a2e252573b..ba7642771e9 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientImplSuiteBase.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientImplSuiteBase.scala @@ -27,6 +27,7 @@ import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.delta.actions.{CommitInfo, Metadata, Protocol} import org.apache.spark.sql.delta.storage.{LogStore, LogStoreProvider} import org.apache.spark.sql.delta.test.{DeltaSQLCommandTest, DeltaSQLTestUtils} +import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.FileNames import org.apache.spark.sql.delta.util.threads.DeltaThreadPool import io.delta.dynamodbcommitcoordinator.DynamoDBCommitCoordinatorClient diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientSuite.scala 
b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientSuite.scala index 6ba36dd0d3e..553f6bec850 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CommitCoordinatorClientSuite.scala @@ -16,6 +16,8 @@ package org.apache.spark.sql.delta.coordinatedcommits +import java.util.Optional + import scala.collection.JavaConverters._ import scala.reflect.runtime.universe._ @@ -24,7 +26,7 @@ import org.apache.spark.sql.delta.actions._ import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.spark.sql.delta.test.DeltaSQLTestUtils import io.delta.storage.LogStore -import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse => JGetCommitsResponse, UpdatedActions} +import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse => JGetCommitsResponse, TableDescriptor, TableIdentifier, UpdatedActions} import io.delta.storage.commit.actions.{AbstractMetadata, AbstractProtocol} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -39,8 +41,7 @@ class CommitCoordinatorClientSuite extends QueryTest with DeltaSQLTestUtils with override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { @@ -48,8 +49,7 @@ class CommitCoordinatorClientSuite extends QueryTest with DeltaSQLTestUtils with } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, startVersion: java.lang.Long, endVersion: java.lang.Long): JGetCommitsResponse = new JGetCommitsResponse(Seq.empty.asJava, -1) @@ -57,13 +57,13 @@ class CommitCoordinatorClientSuite extends QueryTest with DeltaSQLTestUtils with override def backfillToVersion( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersion: java.lang.Long): Unit = {} override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): java.util.Map[String, String] = diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsEnablementSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsEnablementSuite.scala index 9efd838c44d..1461b60fb36 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsEnablementSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsEnablementSuite.scala @@ -92,7 +92,8 @@ class CoordinatedCommitsEnablementSuite val log = DeltaLog.forTable(spark, tablePath) validateCoordinatedCommitsCompleteEnablement(log.snapshot, expectEnabled = false) sql(s"ALTER TABLE delta.`$tablePath` SET TBLPROPERTIES " + // Enable CC - s"('${DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory')") + s"('${DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory', " + + s"'${DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '{}')") 
Seq(1).toDF().write.format("delta").mode("overwrite").save(tablePath) // commit 3 validateCoordinatedCommitsCompleteEnablement(log.update(), expectEnabled = true) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsSuite.scala index 25750474b9f..ae2643d2050 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.delta.coordinatedcommits import java.io.File +import java.lang.{Long => JLong} +import java.util.{Iterator => JIterator, Optional} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer @@ -26,7 +28,7 @@ import com.databricks.spark.util.UsageRecord import org.apache.spark.sql.delta.{CommitStats, CoordinatedCommitsStats, CoordinatedCommitsTableFeature, DeltaOperations, DeltaUnsupportedOperationException, V2CheckpointTableFeature} import org.apache.spark.sql.delta.{CommitCoordinatorGetCommitsFailedException, DeltaIllegalArgumentException} import org.apache.spark.sql.delta.CoordinatedCommitType._ -import org.apache.spark.sql.delta.DeltaConfigs.{CHECKPOINT_INTERVAL, COORDINATED_COMMITS_COORDINATOR_CONF, COORDINATED_COMMITS_COORDINATOR_NAME, COORDINATED_COMMITS_TABLE_CONF} +import org.apache.spark.sql.delta.DeltaConfigs.{CHECKPOINT_INTERVAL, COORDINATED_COMMITS_COORDINATOR_CONF, COORDINATED_COMMITS_COORDINATOR_NAME, COORDINATED_COMMITS_TABLE_CONF, IN_COMMIT_TIMESTAMPS_ENABLED} import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.delta.DeltaTestUtils.createTestAddFile import org.apache.spark.sql.delta.InitialSnapshot @@ -34,17 +36,17 @@ import org.apache.spark.sql.delta.LogSegment import org.apache.spark.sql.delta.Snapshot import org.apache.spark.sql.delta.actions._ import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.test.DeltaExceptionTestUtils import org.apache.spark.sql.delta.test.DeltaSQLCommandTest import org.apache.spark.sql.delta.test.DeltaSQLTestUtils import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.{FileNames, JsonUtils} import org.apache.spark.sql.delta.util.FileNames.{CompactedDeltaFile, DeltaFile, UnbackfilledDeltaFile} import io.delta.storage.LogStore -import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse => JGetCommitsResponse, UpdatedActions} +import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse => JGetCommitsResponse, TableDescriptor, TableIdentifier, UpdatedActions} import io.delta.storage.commit.actions.{AbstractMetadata, AbstractProtocol} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} -import org.scalatest.Tag import org.apache.spark.SparkConf import org.apache.spark.sql.{QueryTest, Row, SparkSession} @@ -56,7 +58,8 @@ class CoordinatedCommitsSuite with DeltaSQLTestUtils with SharedSparkSession with DeltaSQLCommandTest - with CoordinatedCommitsTestUtils { + with CoordinatedCommitsTestUtils + with DeltaExceptionTestUtils { import testImplicits._ @@ -111,10 +114,9 @@ class CoordinatedCommitsSuite override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, 
commitVersion: Long, - actions: java.util.Iterator[String], + actions: JIterator[String], updatedActions: UpdatedActions): CommitResponse = { throw new IllegalStateException("Fail commit request") } @@ -402,15 +404,14 @@ class CoordinatedCommitsSuite var failAttempts = Set[Int]() override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], - startVersion: java.lang.Long, - endVersion: java.lang.Long): JGetCommitsResponse = { + tableDesc: TableDescriptor, + startVersion: java.lang.Long, + endVersion: java.lang.Long): JGetCommitsResponse = { if (failAttempts.contains(numGetCommitsCalled.get + 1)) { numGetCommitsCalled.incrementAndGet() throw new IllegalStateException("Injected failure") } - super.getCommits(logPath, coordinatedCommitsTableConf, startVersion, endVersion) + super.getCommits(tableDesc, startVersion, endVersion) } } case class TrackingInMemoryCommitCoordinatorClientBuilder( @@ -754,48 +755,45 @@ class CoordinatedCommitsSuite new InMemoryCommitCoordinator(batchSize = 10) { override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): java.util.Map[String, String] = { - super.registerTable(logPath, currentVersion, currentMetadata, currentProtocol) + super.registerTable( + logPath, tableIdentifier, currentVersion, currentMetadata, currentProtocol) tableConf } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, startVersion: java.lang.Long, endVersion: java.lang.Long): JGetCommitsResponse = { - assert(coordinatedCommitsTableConf === tableConf) - super.getCommits(logPath, coordinatedCommitsTableConf, startVersion, endVersion) + assert(tableDesc.getTableConf === tableConf) + super.getCommits(tableDesc, startVersion, endVersion) } override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = { - assert(coordinatedCommitsTableConf === tableConf) - super.commit(logStore, hadoopConf, logPath, coordinatedCommitsTableConf, - commitVersion, actions, updatedActions) + assert(tableDesc.getTableConf === tableConf) + super.commit(logStore, hadoopConf, tableDesc, commitVersion, actions, updatedActions) } override def backfillToVersion( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersionOpt: java.lang.Long): Unit = { - assert(coordinatedCommitsTableConf === tableConf) + assert(tableDesc.getTableConf === tableConf) super.backfillToVersion( logStore, hadoopConf, - logPath, - coordinatedCommitsTableConf, + tableDesc, version, lastKnownBackfilledVersionOpt) } @@ -1127,12 +1125,11 @@ class CoordinatedCommitsSuite val neverBackfillingCommitCoordinator = new TrackingCommitCoordinatorClient(new InMemoryCommitCoordinator(batchSize) { override def backfillToVersion( - logStore: LogStore, - hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], - version: Long, - lastKnownBackfilledVersionOpt: java.lang.Long): Unit = { } + logStore: LogStore, + hadoopConf: Configuration, + tableDesc: TableDescriptor, + version: Long, + lastKnownBackfilledVersionOpt: 
JLong): Unit = { } }) CommitCoordinatorProvider.clearNonDefaultBuilders() val builder = @@ -1325,7 +1322,7 @@ class CoordinatedCommitsSuite tableMutationFn() } checkError(e, - errorClass = "DELTA_UNSUPPORTED_WRITES_WITHOUT_COORDINATOR", + "DELTA_UNSUPPORTED_WRITES_WITHOUT_COORDINATOR", sqlState = "0AKDC", parameters = Map("coordinatorName" -> "tracking-in-memory") ) @@ -1345,7 +1342,7 @@ class CoordinatedCommitsSuite } ///////////////////////////////////////////////////////////////////////////////////////////// - // Test coordinated-commits with DeltaLog.getChangeLogFile API starts // + // Test coordinated-commits with DeltaLog.getChangeLogFile API starts // ///////////////////////////////////////////////////////////////////////////////////////////// /** @@ -1571,227 +1568,156 @@ class CoordinatedCommitsSuite } ///////////////////////////////////////////////////////////////////////////////////////////// - // Test coordinated-commits with DeltaLog.getChangeLogFile API ENDS // + // Test coordinated-commits with DeltaLog.getChangeLogFile API ENDS // ///////////////////////////////////////////////////////////////////////////////////////////// - ///////////////////////////////////////////////////////////////////////////////////////////// - // Test CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl STARTS // - ///////////////////////////////////////////////////////////////////////////////////////////// + test("During ALTER, overriding Coordinated Commits configurations throws an exception.") { + CommitCoordinatorProvider.registerBuilder(TrackingInMemoryCommitCoordinatorBuilder(1)) + CommitCoordinatorProvider.registerBuilder(InMemoryCommitCoordinatorBuilder(1)) - def gridTest[A](testNamePrefix: String, testTags: Tag*)(params: Seq[A])( - testFun: A => Unit): Unit = { - for (param <- params) { - test(testNamePrefix + s" ($param)", testTags: _*)(testFun(param)) + withTempDir { tempDir => + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '${JsonUtils.toJson(Map())}')") + val e = interceptWithUnwrapping[DeltaIllegalArgumentException] { + sql(s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` SET TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'in-memory', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '${JsonUtils.toJson(Map())}')") + } + checkError( + e, + "DELTA_CANNOT_OVERRIDE_COORDINATED_COMMITS_CONFS", + sqlState = "42616", + parameters = Map("Command" -> "ALTER")) } } - private val cNameKey = COORDINATED_COMMITS_COORDINATOR_NAME.key - private val cConfKey = COORDINATED_COMMITS_COORDINATOR_CONF.key - private val tableConfKey = COORDINATED_COMMITS_TABLE_CONF.key - private val cName = cNameKey -> "some-cc-name" - private val cConf = cConfKey -> "some-cc-conf" - private val tableConf = tableConfKey -> "some-table-conf" - - private val cNameDefaultKey = COORDINATED_COMMITS_COORDINATOR_NAME.defaultTablePropertyKey - private val cConfDefaultKey = COORDINATED_COMMITS_COORDINATOR_CONF.defaultTablePropertyKey - private val tableConfDefaultKey = COORDINATED_COMMITS_TABLE_CONF.defaultTablePropertyKey - private val cNameDefault = cNameDefaultKey -> "some-cc-name" - private val cConfDefault = cConfDefaultKey -> "some-cc-conf" - private val tableConfDefault = tableConfDefaultKey -> "some-table-conf" - - private val command = "CLONE" - - private val errCannotOverride = new 
DeltaIllegalArgumentException( - "DELTA_CANNOT_OVERRIDE_COORDINATED_COMMITS_CONFS", Array(command)) - - private def errMissingConfInCommand(key: String) = new DeltaIllegalArgumentException( - "DELTA_MUST_SET_ALL_COORDINATED_COMMITS_CONFS_IN_COMMAND", Array(command, key)) + test("During ALTER, unsetting Coordinated Commits configurations throws an exception.") { + CommitCoordinatorProvider.registerBuilder(TrackingInMemoryCommitCoordinatorBuilder(1)) - private def errMissingConfInSession(key: String) = new DeltaIllegalArgumentException( - "DELTA_MUST_SET_ALL_COORDINATED_COMMITS_CONFS_IN_SESSION", Array(command, key)) + withTempDir { tempDir => + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '${JsonUtils.toJson(Map())}')") + val e = interceptWithUnwrapping[DeltaIllegalArgumentException] { + sql(s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` UNSET TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}')") + } + checkError( + e, + "DELTA_CANNOT_UNSET_COORDINATED_COMMITS_CONFS", + sqlState = "42616", + parameters = Map[String, String]()) + } + } - private def errTableConfInCommand = new DeltaIllegalArgumentException( - "DELTA_CONF_OVERRIDE_NOT_SUPPORTED_IN_COMMAND", Array(command, tableConfKey)) + test("During ALTER, overriding ICT configurations on (potential) Coordinated Commits tables " + + "throws an exception.") { + CommitCoordinatorProvider.registerBuilder(TrackingInMemoryCommitCoordinatorBuilder(1)) - private def errTableConfInSession = new DeltaIllegalArgumentException( - "DELTA_CONF_OVERRIDE_NOT_SUPPORTED_IN_SESSION", - Array(command, tableConfDefaultKey, tableConfDefaultKey)) + // For a table that had Coordinated Commits enabled before the ALTER command. + withTempDir { tempDir => + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '${JsonUtils.toJson(Map())}')") + val e = interceptWithUnwrapping[DeltaIllegalArgumentException] { + sql(s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` SET TBLPROPERTIES " + + s"('${IN_COMMIT_TIMESTAMPS_ENABLED.key}' = 'false')") + } + checkError( + e, + "DELTA_CANNOT_MODIFY_COORDINATED_COMMITS_DEPENDENCIES", + sqlState = "42616", + parameters = Map("Command" -> "ALTER")) + } - private def testValidation( - tableExists: Boolean, - propertyOverrides: Map[String, String], - defaultConfs: Seq[(String, String)], - errorOpt: Option[DeltaIllegalArgumentException]): Unit = { + // For a table that is about to enable Coordinated Commits during the same ALTER command. 
withoutCoordinatedCommitsDefaultTableProperties { - withSQLConf(defaultConfs: _*) { - if (errorOpt.isDefined) { - val e = intercept[DeltaIllegalArgumentException] { - CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl( - spark, propertyOverrides, tableExists, command) - } - assert(e.getMessage.contains(errorOpt.get.getMessage)) - } else { - CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl( - spark, propertyOverrides, tableExists, command) + withTempDir { tempDir => + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta") + val e = interceptWithUnwrapping[DeltaIllegalArgumentException] { + sql(s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` SET TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '${JsonUtils.toJson(Map())}', " + + s"'${IN_COMMIT_TIMESTAMPS_ENABLED.key}' = 'false')") } + checkError( + e, + "DELTA_CANNOT_SET_COORDINATED_COMMITS_DEPENDENCIES", + sqlState = "42616", + parameters = Map("Command" -> "ALTER")) } } } - // tableExists: True - // | False - // - // propertyOverrides: Map.empty - // | Map(cName) - // | Map(cName, cConf) - // | Map(cName, cConf, tableConf) - // | Map(tableConf) - // - // defaultConf: Seq.empty - // | Seq(cNameDefault) - // | Seq(cNameDefault, cConfDefault) - // | Seq(cNameDefault, cConfDefault, tableConfDefault) - // | Seq(tableConfDefault) - // - // errorOpt: None - // | Some(errCannotOverride) - // | Some(errMissingConfInCommand(cConfKey)) - // | Some(errMissingConfInSession(cConfKey)) - // | Some(errTableConfInCommand) - // | Some(errTableConfInSession) - - gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + - "passes for existing target tables with no explicit Coordinated Commits Configurations.") ( - Seq( - Seq.empty, - // Not having any explicit Coordinated Commits configurations, but having an illegal - // combination of Coordinated Commits configurations in default: pass. - // This is because we don't consider default configurations when the table exists. 
- Seq(cNameDefault), - Seq(cNameDefault, cConfDefault), - Seq(cNameDefault, cConfDefault, tableConfDefault), - Seq(tableConfDefault) - ) - ) { defaultConfs: Seq[(String, String)] => - testValidation( - tableExists = true, - propertyOverrides = Map.empty, - defaultConfs, - errorOpt = None) - } + test("During ALTER, unsetting ICT configurations on Coordinated Commits tables throws an " + + "exception.") { + CommitCoordinatorProvider.registerBuilder(TrackingInMemoryCommitCoordinatorBuilder(1)) - gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + - "fails for existing target tables with any explicit Coordinated Commits Configurations.") ( - Seq( - (Map(cName), Seq.empty), - (Map(cName), Seq(cNameDefault)), - (Map(cName), Seq(cNameDefault, cConfDefault)), - (Map(cName), Seq(cNameDefault, cConfDefault, tableConfDefault)), - (Map(cName), Seq(tableConfDefault)), - - (Map(cName, cConf), Seq.empty), - (Map(cName, cConf), Seq(cNameDefault)), - (Map(cName, cConf), Seq(cNameDefault, cConfDefault)), - (Map(cName, cConf), Seq(cNameDefault, cConfDefault, tableConfDefault)), - (Map(cName, cConf), Seq(tableConfDefault)), - - (Map(cName, cConf, tableConf), Seq.empty), - (Map(cName, cConf, tableConf), Seq(cNameDefault)), - (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault)), - (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault)), - (Map(cName, cConf, tableConf), Seq(tableConfDefault)), - - (Map(tableConf), Seq.empty), - (Map(tableConf), Seq(cNameDefault)), - (Map(tableConf), Seq(cNameDefault, cConfDefault)), - (Map(tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault)), - (Map(tableConf), Seq(tableConfDefault)) - ) - ) { case ( - propertyOverrides: Map[String, String], - defaultConfs: Seq[(String, String)]) => - testValidation( - tableExists = true, - propertyOverrides, - defaultConfs, - errorOpt = Some(errCannotOverride)) + withTempDir { tempDir => + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '${JsonUtils.toJson(Map())}')") + val e = interceptWithUnwrapping[DeltaIllegalArgumentException] { + sql(s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` UNSET TBLPROPERTIES " + + s"('${IN_COMMIT_TIMESTAMPS_ENABLED.key}')") + } + checkError( + e, + "DELTA_CANNOT_MODIFY_COORDINATED_COMMITS_DEPENDENCIES", + sqlState = "42616", + parameters = Map("Command" -> "ALTER")) + } } - gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + - "works correctly for new target tables with default Coordinated Commits Configurations.") ( - Seq( - (Seq.empty, None), - (Seq(cNameDefault), Some(errMissingConfInSession(cConfDefaultKey))), - (Seq(cNameDefault, cConfDefault), None), - (Seq(cNameDefault, cConfDefault, tableConfDefault), Some(errTableConfInSession)), - (Seq(tableConfDefault), Some(errTableConfInSession)) - ) - ) { case ( - defaultConfs: Seq[(String, String)], - errorOpt: Option[DeltaIllegalArgumentException]) => - testValidation( - tableExists = false, - propertyOverrides = Map.empty, - defaultConfs, - errorOpt) - } + test("During REPLACE, for non-CC tables, default CC configurations are ignored, but default " + + "ICT confs are retained, and existing ICT confs are discarded") { + // Non-CC table, REPLACE with default CC and ICT confs => Non-CC, but with ICT confs. 
+ withTempDir { tempDir => + withoutCoordinatedCommitsDefaultTableProperties { + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta") + } + withSQLConf(IN_COMMIT_TIMESTAMPS_ENABLED.defaultTablePropertyKey -> "true") { + sql(s"REPLACE TABLE delta.`${tempDir.getAbsolutePath}` (id STRING) USING delta") + } + assert(DeltaLog.forTable(spark, tempDir).snapshot.tableCommitCoordinatorClientOpt.isEmpty) + assert(DeltaLog.forTable(spark, tempDir).snapshot.metadata.configuration.contains( + IN_COMMIT_TIMESTAMPS_ENABLED.key)) + } - gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + - "fails for new target tables with any illegal explicit Coordinated Commits Configurations.") ( - Seq( - (Map(cName), Seq.empty, Some(errMissingConfInCommand(cConfKey))), - (Map(cName), Seq(cNameDefault), Some(errMissingConfInCommand(cConfKey))), - (Map(cName), Seq(cNameDefault, cConfDefault), Some(errMissingConfInCommand(cConfKey))), - (Map(cName), Seq(cNameDefault, cConfDefault, tableConfDefault), - Some(errMissingConfInCommand(cConfKey))), - (Map(cName), Seq(tableConfDefault), Some(errMissingConfInCommand(cConfKey))), - - (Map(cName, cConf, tableConf), Seq.empty, Some(errTableConfInCommand)), - (Map(cName, cConf, tableConf), Seq(cNameDefault), Some(errTableConfInCommand)), - (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault), Some(errTableConfInCommand)), - (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault), - Some(errTableConfInCommand)), - (Map(cName, cConf, tableConf), Seq(tableConfDefault), Some(errTableConfInCommand)), - - (Map(tableConf), Seq.empty, Some(errTableConfInCommand)), - (Map(tableConf), Seq(cNameDefault), Some(errTableConfInCommand)), - (Map(tableConf), Seq(cNameDefault, cConfDefault), Some(errTableConfInCommand)), - (Map(tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault), - Some(errTableConfInCommand)), - (Map(tableConf), Seq(tableConfDefault), Some(errTableConfInCommand)) - ) - ) { case ( - propertyOverrides: Map[String, String], - defaultConfs: Seq[(String, String)], - errorOpt: Option[DeltaIllegalArgumentException]) => - testValidation( - tableExists = false, - propertyOverrides, - defaultConfs, - errorOpt) + // Non-CC table with ICT confs, REPLACE with only default CC confs => Non-CC, also no ICT confs. + withTempDir { tempDir => + withoutCoordinatedCommitsDefaultTableProperties { + withSQLConf(IN_COMMIT_TIMESTAMPS_ENABLED.defaultTablePropertyKey -> "true") { + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta") + } + } + sql(s"REPLACE TABLE delta.`${tempDir.getAbsolutePath}` (id STRING) USING delta") + assert(DeltaLog.forTable(spark, tempDir).snapshot.tableCommitCoordinatorClientOpt.isEmpty) + assert(!DeltaLog.forTable(spark, tempDir).snapshot.metadata.configuration.contains( + IN_COMMIT_TIMESTAMPS_ENABLED.key)) + } } - gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + - "passes for new target tables with legal explicit Coordinated Commits Configurations.") ( - Seq( - // Having exactly Coordinator Name and Coordinator Conf explicitly, but having an illegal - // combination of Coordinated Commits configurations in default: pass. - // This is because we don't consider default configurations when explicit ones are provided. 
- Seq.empty, - Seq(cNameDefault), - Seq(cNameDefault, cConfDefault), - Seq(cNameDefault, cConfDefault, tableConfDefault), - Seq(tableConfDefault) - ) - ) { defaultConfs: Seq[(String, String)] => - testValidation( - tableExists = false, - propertyOverrides = Map(cName, cConf), - defaultConfs, - errorOpt = None) - } + test("During REPLACE, for CC tables, existing CC and ICT configurations are both retained.") { + CommitCoordinatorProvider.registerBuilder(TrackingInMemoryCommitCoordinatorBuilder(1)) - ///////////////////////////////////////////////////////////////////////////////////////////// - // Test CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl ENDS // - ///////////////////////////////////////////////////////////////////////////////////////////// + withTempDir { tempDir => + withoutCoordinatedCommitsDefaultTableProperties { + sql(s"CREATE TABLE delta.`${tempDir.getAbsolutePath}` (id LONG) USING delta") + sql(s"INSERT INTO delta.`${tempDir.getAbsolutePath}` VALUES (0)") + sql(s"ALTER TABLE delta.`${tempDir.getAbsolutePath}` SET TBLPROPERTIES " + + s"('${COORDINATED_COMMITS_COORDINATOR_NAME.key}' = 'tracking-in-memory', " + + s"'${COORDINATED_COMMITS_COORDINATOR_CONF.key}' = '${JsonUtils.toJson(Map())}')") + // All three ICT configurations should be set because Coordinated Commits is enabled later. + // REPLACE with default CC confs => CC, and all ICT confs. + sql(s"REPLACE TABLE delta.`${tempDir.getAbsolutePath}` (id STRING) USING delta") + assert(DeltaLog.forTable(spark, tempDir).snapshot.tableCommitCoordinatorClientOpt.nonEmpty) + CoordinatedCommitsUtils.ICT_TABLE_PROPERTY_KEYS.foreach { key => + assert(DeltaLog.forTable(spark, tempDir).snapshot.metadata.configuration.contains(key)) + } + } + } + } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsTestUtils.scala b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsTestUtils.scala index 8e1cebb7200..05c08d9d33a 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsTestUtils.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsTestUtils.scala @@ -16,14 +16,17 @@ package org.apache.spark.sql.delta.coordinatedcommits +import java.util.Optional import java.util.concurrent.atomic.AtomicInteger +import scala.util.control.NonFatal + import org.apache.spark.sql.delta.{DeltaConfigs, DeltaLog, DeltaTestUtilsBase} import org.apache.spark.sql.delta.DeltaConfigs.COORDINATED_COMMITS_COORDINATOR_NAME import org.apache.spark.sql.delta.actions.{Action, CommitInfo, Metadata, Protocol} import org.apache.spark.sql.delta.util.JsonUtils import io.delta.storage.LogStore -import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse => JGetCommitsResponse, UpdatedActions} +import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, GetCommitsResponse => JGetCommitsResponse, TableDescriptor, TableIdentifier, UpdatedActions} import io.delta.storage.commit.actions.{AbstractMetadata, AbstractProtocol} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -247,42 +250,36 @@ class TrackingCommitCoordinatorClient( override def commit( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, commitVersion: Long, actions: java.util.Iterator[String], updatedActions: UpdatedActions): CommitResponse = 
recordOperation("commit") { delegatingCommitCoordinatorClient.commit( logStore, hadoopConf, - logPath, - coordinatedCommitsTableConf, + tableDesc, commitVersion, actions, updatedActions) } override def getCommits( - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, startVersion: java.lang.Long, endVersion: java.lang.Long): JGetCommitsResponse = recordOperation("getCommits") { - delegatingCommitCoordinatorClient.getCommits( - logPath, coordinatedCommitsTableConf, startVersion, endVersion) + delegatingCommitCoordinatorClient.getCommits(tableDesc, startVersion, endVersion) } override def backfillToVersion( logStore: LogStore, hadoopConf: Configuration, - logPath: Path, - coordinatedCommitsTableConf: java.util.Map[String, String], + tableDesc: TableDescriptor, version: Long, lastKnownBackfilledVersion: java.lang.Long): Unit = recordOperation("backfillToVersion") { delegatingCommitCoordinatorClient.backfillToVersion( logStore, hadoopConf, - logPath, - coordinatedCommitsTableConf, + tableDesc, version, lastKnownBackfilledVersion) } @@ -305,12 +302,13 @@ class TrackingCommitCoordinatorClient( override def registerTable( logPath: Path, + tableIdentifier: Optional[TableIdentifier], currentVersion: Long, currentMetadata: AbstractMetadata, currentProtocol: AbstractProtocol): java.util.Map[String, String] = recordOperation("registerTable") { delegatingCommitCoordinatorClient.registerTable( - logPath, currentVersion, currentMetadata, currentProtocol) + logPath, tableIdentifier, currentVersion, currentMetadata, currentProtocol) } } @@ -328,6 +326,32 @@ trait CoordinatedCommitsBaseSuite final def coordinatedCommitsEnabledInTests: Boolean = coordinatedCommitsBackfillBatchSize.nonEmpty + // In case some tests reuse the table path/name with DROP table, this method can be used to + // clean the table data in the commit coordinator. Note that we should call this before + // the table actually gets DROP. + def deleteTableFromCommitCoordinator(tableName: String): Unit = { + val cc = CommitCoordinatorProvider.getCommitCoordinatorClient( + defaultCommitsCoordinatorName, defaultCommitsCoordinatorConf, spark) + assert( + cc.isInstanceOf[TrackingCommitCoordinatorClient], + s"Please implement delete/drop method for coordinator: ${cc.getClass.getName}") + val location = try { + spark.sql(s"describe detail $tableName") + .select("location") + .first + .getAs[String](0) + } catch { + case NonFatal(_) => + // Ignore if the table does not exist/broken. + return + } + val logPath = location + "/_delta_log" + cc.asInstanceOf[TrackingCommitCoordinatorClient] + .delegatingCommitCoordinatorClient + .asInstanceOf[InMemoryCommitCoordinator] + .dropTable(new Path(logPath)) + } + override protected def sparkConf: SparkConf = { if (coordinatedCommitsBackfillBatchSize.nonEmpty) { val coordinatedCommitsCoordinatorJson = JsonUtils.toJson(defaultCommitsCoordinatorConf) diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsUtilsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsUtilsSuite.scala new file mode 100644 index 00000000000..158fe6d5e6d --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/CoordinatedCommitsUtilsSuite.scala @@ -0,0 +1,387 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta.coordinatedcommits + +import scala.jdk.CollectionConverters._ + +import org.apache.spark.sql.delta.DeltaConfigs.{COORDINATED_COMMITS_COORDINATOR_CONF, COORDINATED_COMMITS_COORDINATOR_NAME, COORDINATED_COMMITS_TABLE_CONF} +import org.apache.spark.sql.delta.DeltaIllegalArgumentException +import org.scalatest.Tag + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.test.SharedSparkSession + +class CoordinatedCommitsUtilsSuite extends QueryTest + with SharedSparkSession + with CoordinatedCommitsTestUtils { + + ///////////////////////////////////////////////////////////////////////////////////////////// + // Test CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl STARTS // + ///////////////////////////////////////////////////////////////////////////////////////////// + + def gridTest[A](testNamePrefix: String, testTags: Tag*)(params: Seq[A])( + testFun: A => Unit): Unit = { + for (param <- params) { + test(testNamePrefix + s" ($param)", testTags: _*)(testFun(param)) + } + } + + private val cNameKey = COORDINATED_COMMITS_COORDINATOR_NAME.key + private val cConfKey = COORDINATED_COMMITS_COORDINATOR_CONF.key + private val tableConfKey = COORDINATED_COMMITS_TABLE_CONF.key + private val cName = cNameKey -> "some-cc-name" + private val cConf = cConfKey -> "some-cc-conf" + private val tableConf = tableConfKey -> "some-table-conf" + + private val cNameDefaultKey = COORDINATED_COMMITS_COORDINATOR_NAME.defaultTablePropertyKey + private val cConfDefaultKey = COORDINATED_COMMITS_COORDINATOR_CONF.defaultTablePropertyKey + private val tableConfDefaultKey = COORDINATED_COMMITS_TABLE_CONF.defaultTablePropertyKey + private val cNameDefault = cNameDefaultKey -> "some-cc-name" + private val cConfDefault = cConfDefaultKey -> "some-cc-conf" + private val tableConfDefault = tableConfDefaultKey -> "some-table-conf" + + private val command = "CLONE" + + private def errCannotOverride = new DeltaIllegalArgumentException( + "DELTA_CANNOT_OVERRIDE_COORDINATED_COMMITS_CONFS", Array(command)) + + private def errMissingConfInCommand(key: String) = new DeltaIllegalArgumentException( + "DELTA_MUST_SET_ALL_COORDINATED_COMMITS_CONFS_IN_COMMAND", Array(command, key)) + + private def errMissingConfInSession(key: String) = new DeltaIllegalArgumentException( + "DELTA_MUST_SET_ALL_COORDINATED_COMMITS_CONFS_IN_SESSION", Array(command, key)) + + private def errTableConfInCommand = new DeltaIllegalArgumentException( + "DELTA_CONF_OVERRIDE_NOT_SUPPORTED_IN_COMMAND", Array(command, tableConfKey)) + + private def errTableConfInSession = new DeltaIllegalArgumentException( + "DELTA_CONF_OVERRIDE_NOT_SUPPORTED_IN_SESSION", + Array(command, tableConfDefaultKey, tableConfDefaultKey)) + + private def testValidationForCreateDeltaTableCommand( + tableExists: Boolean, + propertyOverrides: Map[String, String], + defaultConfs: Seq[(String, String)], + errorOpt: Option[DeltaIllegalArgumentException]): Unit = { + withoutCoordinatedCommitsDefaultTableProperties { + withSQLConf(defaultConfs: _*) { + if (errorOpt.isDefined) { + val e = 
intercept[DeltaIllegalArgumentException] { + CoordinatedCommitsUtils.validateConfigurationsForCreateDeltaTableCommandImpl( + spark, propertyOverrides, tableExists, command) + } + checkError( + e, + errorOpt.get.getErrorClass, + sqlState = errorOpt.get.getSqlState, + parameters = errorOpt.get.getMessageParameters.asScala.toMap) + } else { + CoordinatedCommitsUtils.validateConfigurationsForCreateDeltaTableCommandImpl( + spark, propertyOverrides, tableExists, command) + } + } + } + } + + // tableExists: True + // | False + // + // propertyOverrides: Map.empty + // | Map(cName) + // | Map(cName, cConf) + // | Map(cName, cConf, tableConf) + // | Map(tableConf) + // + // defaultConf: Seq.empty + // | Seq(cNameDefault) + // | Seq(cNameDefault, cConfDefault) + // | Seq(cNameDefault, cConfDefault, tableConfDefault) + // | Seq(tableConfDefault) + // + // errorOpt: None + // | Some(errCannotOverride) + // | Some(errMissingConfInCommand(cConfKey)) + // | Some(errMissingConfInSession(cConfKey)) + // | Some(errTableConfInCommand) + // | Some(errTableConfInSession) + + gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + + "passes for existing target tables with no explicit Coordinated Commits Configurations.") ( + Seq( + Seq.empty, + // Not having any explicit Coordinated Commits configurations, but having an illegal + // combination of Coordinated Commits configurations in default: pass. + // This is because we don't consider default configurations when the table exists. + Seq(cNameDefault), + Seq(cNameDefault, cConfDefault), + Seq(cNameDefault, cConfDefault, tableConfDefault), + Seq(tableConfDefault) + ) + ) { defaultConfs: Seq[(String, String)] => + testValidationForCreateDeltaTableCommand( + tableExists = true, + propertyOverrides = Map.empty, + defaultConfs, + errorOpt = None) + } + + gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + + "fails for existing target tables with any explicit Coordinated Commits Configurations.") ( + Seq( + (Map(cName), Seq.empty), + (Map(cName), Seq(cNameDefault)), + (Map(cName), Seq(cNameDefault, cConfDefault)), + (Map(cName), Seq(cNameDefault, cConfDefault, tableConfDefault)), + (Map(cName), Seq(tableConfDefault)), + + (Map(cName, cConf), Seq.empty), + (Map(cName, cConf), Seq(cNameDefault)), + (Map(cName, cConf), Seq(cNameDefault, cConfDefault)), + (Map(cName, cConf), Seq(cNameDefault, cConfDefault, tableConfDefault)), + (Map(cName, cConf), Seq(tableConfDefault)), + + (Map(cName, cConf, tableConf), Seq.empty), + (Map(cName, cConf, tableConf), Seq(cNameDefault)), + (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault)), + (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault)), + (Map(cName, cConf, tableConf), Seq(tableConfDefault)), + + (Map(tableConf), Seq.empty), + (Map(tableConf), Seq(cNameDefault)), + (Map(tableConf), Seq(cNameDefault, cConfDefault)), + (Map(tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault)), + (Map(tableConf), Seq(tableConfDefault)) + ) + ) { case ( + propertyOverrides: Map[String, String], + defaultConfs: Seq[(String, String)]) => + testValidationForCreateDeltaTableCommand( + tableExists = true, + propertyOverrides, + defaultConfs, + errorOpt = Some(errCannotOverride)) + } + + gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + + "works correctly for new target tables with default Coordinated Commits Configurations.") ( + Seq( + (Seq.empty, None), + 
(Seq(cNameDefault), Some(errMissingConfInSession(cConfDefaultKey))), + (Seq(cNameDefault, cConfDefault), None), + (Seq(cNameDefault, cConfDefault, tableConfDefault), Some(errTableConfInSession)), + (Seq(tableConfDefault), Some(errTableConfInSession)) + ) + ) { case ( + defaultConfs: Seq[(String, String)], + errorOpt: Option[DeltaIllegalArgumentException]) => + testValidationForCreateDeltaTableCommand( + tableExists = false, + propertyOverrides = Map.empty, + defaultConfs, + errorOpt) + } + + gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + + "fails for new target tables with any illegal explicit Coordinated Commits Configurations.") ( + Seq( + (Map(cName), Seq.empty, Some(errMissingConfInCommand(cConfKey))), + (Map(cName), Seq(cNameDefault), Some(errMissingConfInCommand(cConfKey))), + (Map(cName), Seq(cNameDefault, cConfDefault), Some(errMissingConfInCommand(cConfKey))), + (Map(cName), Seq(cNameDefault, cConfDefault, tableConfDefault), + Some(errMissingConfInCommand(cConfKey))), + (Map(cName), Seq(tableConfDefault), Some(errMissingConfInCommand(cConfKey))), + + (Map(cName, cConf, tableConf), Seq.empty, Some(errTableConfInCommand)), + (Map(cName, cConf, tableConf), Seq(cNameDefault), Some(errTableConfInCommand)), + (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault), Some(errTableConfInCommand)), + (Map(cName, cConf, tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault), + Some(errTableConfInCommand)), + (Map(cName, cConf, tableConf), Seq(tableConfDefault), Some(errTableConfInCommand)), + + (Map(tableConf), Seq.empty, Some(errTableConfInCommand)), + (Map(tableConf), Seq(cNameDefault), Some(errTableConfInCommand)), + (Map(tableConf), Seq(cNameDefault, cConfDefault), Some(errTableConfInCommand)), + (Map(tableConf), Seq(cNameDefault, cConfDefault, tableConfDefault), + Some(errTableConfInCommand)), + (Map(tableConf), Seq(tableConfDefault), Some(errTableConfInCommand)) + ) + ) { case ( + propertyOverrides: Map[String, String], + defaultConfs: Seq[(String, String)], + errorOpt: Option[DeltaIllegalArgumentException]) => + testValidationForCreateDeltaTableCommand( + tableExists = false, + propertyOverrides, + defaultConfs, + errorOpt) + } + + gridTest("During CLONE, CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl " + + "passes for new target tables with legal explicit Coordinated Commits Configurations.") ( + Seq( + // Having exactly Coordinator Name and Coordinator Conf explicitly, but having an illegal + // combination of Coordinated Commits configurations in default: pass. + // This is because we don't consider default configurations when explicit ones are provided. 
+ Seq.empty, + Seq(cNameDefault), + Seq(cNameDefault, cConfDefault), + Seq(cNameDefault, cConfDefault, tableConfDefault), + Seq(tableConfDefault) + ) + ) { defaultConfs: Seq[(String, String)] => + testValidationForCreateDeltaTableCommand( + tableExists = false, + propertyOverrides = Map(cName, cConf), + defaultConfs, + errorOpt = None) + } + + private def testValidateConfigurationsForAlterTableSetPropertiesDeltaCommand( + existingConfs: Map[String, String], + propertyOverrides: Map[String, String], + errorOpt: Option[DeltaIllegalArgumentException]): Unit = { + if (errorOpt.isDefined) { + val e = intercept[DeltaIllegalArgumentException] { + CoordinatedCommitsUtils.validateConfigurationsForAlterTableSetPropertiesDeltaCommand( + existingConfs, propertyOverrides) + } + checkError( + e, + errorOpt.get.getErrorClass, + sqlState = errorOpt.get.getSqlState, + parameters = errorOpt.get.getMessageParameters.asScala.toMap) + } else { + CoordinatedCommitsUtils.validateConfigurationsForAlterTableSetPropertiesDeltaCommand( + existingConfs, propertyOverrides) + } + } + + gridTest("During ALTER, `validateConfigurationsForAlterTableSetPropertiesDeltaCommand` " + + "works correctly for tables without Coordinated Commits configurations.") { + Seq( + (Map.empty, None), + (Map(cName), Some(new DeltaIllegalArgumentException( + "DELTA_MUST_SET_ALL_COORDINATED_COMMITS_CONFS_IN_COMMAND", Array("ALTER", cConfKey)))), + (Map(cName, cConf), None), + (Map(cName, cConf, tableConf), Some(new DeltaIllegalArgumentException( + "DELTA_CONF_OVERRIDE_NOT_SUPPORTED_IN_COMMAND", Array("ALTER", tableConfKey)))), + (Map(tableConf), Some(new DeltaIllegalArgumentException( + "DELTA_CONF_OVERRIDE_NOT_SUPPORTED_IN_COMMAND", Array("ALTER", tableConfKey)))) + ) + } { case ( + propertyOverrides: Map[String, String], + errorOpt: Option[DeltaIllegalArgumentException]) => + testValidateConfigurationsForAlterTableSetPropertiesDeltaCommand( + existingConfs = Map.empty, + propertyOverrides, + errorOpt) + } + + test("During ALTER, `validateConfigurationsForAlterTableSetPropertiesDeltaCommand` " + + "passes with no overrides for tables with Coordinated Commits configurations.") { + testValidateConfigurationsForAlterTableSetPropertiesDeltaCommand( + existingConfs = Map(cName, cConf, tableConf), + propertyOverrides = Map.empty, + errorOpt = None) + } + + gridTest("During ALTER, `validateConfigurationsForAlterTableSetPropertiesDeltaCommand` " + + "fails with overrides for tables with Coordinated Commits configurations.") ( + Seq( + Map(cName), + Map(cName, cConf), + Map(cName, cConf, tableConf), + Map(tableConf) + ) + ) { propertyOverrides: Map[String, String] => + testValidateConfigurationsForAlterTableSetPropertiesDeltaCommand( + existingConfs = Map(cName, cConf, tableConf), + propertyOverrides, + errorOpt = Some(new DeltaIllegalArgumentException( + "DELTA_CANNOT_OVERRIDE_COORDINATED_COMMITS_CONFS", Array("ALTER")))) + } + + private def errCannotUnset = new DeltaIllegalArgumentException( + "DELTA_CANNOT_UNSET_COORDINATED_COMMITS_CONFS", Array.empty) + + private def testValidateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + existingConfs: Map[String, String], + propKeysToUnset: Seq[String], + errorOpt: Option[DeltaIllegalArgumentException]): Unit = { + if (errorOpt.isDefined) { + val e = intercept[DeltaIllegalArgumentException] { + CoordinatedCommitsUtils.validateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + existingConfs, propKeysToUnset) + } + checkError( + e, + errorOpt.get.getErrorClass, + sqlState = 
errorOpt.get.getSqlState, + parameters = errorOpt.get.getMessageParameters.asScala.toMap) + } else { + CoordinatedCommitsUtils.validateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + existingConfs, propKeysToUnset) + } + } + + gridTest("During ALTER, `validateConfigurationsForAlterTableUnsetPropertiesDeltaCommand` " + + "fails with overrides for tables with Coordinated Commits configurations.") { + Seq( + Seq(cNameKey), + Seq(cNameKey, cConfKey), + Seq(cNameKey, cConfKey, tableConfKey), + Seq(tableConfKey) + ) + } { propKeysToUnset: Seq[String] => + testValidateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + existingConfs = Map(cName, cConf, tableConf), + propKeysToUnset, + errorOpt = Some(errCannotUnset)) + } + + gridTest("During ALTER, `validateConfigurationsForAlterTableUnsetPropertiesDeltaCommand` " + + "passes with no overrides for tables with or without Coordinated Commits configurations.") { + Seq( + Map.empty, + Map(cName, cConf, tableConf) + ) + } { case existingConfs: Map[String, String] => + testValidateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + existingConfs, + propKeysToUnset = Seq.empty, + errorOpt = None) + } + + gridTest("During ALTER, `validateConfigurationsForAlterTableUnsetPropertiesDeltaCommand` " + + "passes with overrides for tables without Coordinated Commits configurations.") { + Seq( + Seq(cNameKey), + Seq(cNameKey, cConfKey), + Seq(cNameKey, cConfKey, tableConfKey), + Seq(tableConfKey) + ) + } { propKeysToUnset: Seq[String] => + testValidateConfigurationsForAlterTableUnsetPropertiesDeltaCommand( + existingConfs = Map.empty, + propKeysToUnset, + errorOpt = None) + } + + ///////////////////////////////////////////////////////////////////////////////////////////// + // Test CoordinatedCommitsUtils.validateCoordinatedCommitsConfigurationsImpl ENDS // + ///////////////////////////////////////////////////////////////////////////////////////////// +} diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/DynamoDBCommitCoordinatorClientSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/DynamoDBCommitCoordinatorClientSuite.scala index 8a8b09fd7f3..1022bd211c2 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/DynamoDBCommitCoordinatorClientSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/DynamoDBCommitCoordinatorClientSuite.scala @@ -16,6 +16,7 @@ package org.apache.spark.sql.delta.coordinatedcommits +import java.util.Optional import java.util.concurrent.locks.ReentrantReadWriteLock import scala.collection.JavaConverters._ @@ -26,6 +27,7 @@ import com.amazonaws.services.dynamodbv2.model.{AttributeValue, ConditionalCheck import org.apache.spark.sql.delta.{DeltaConfigs, DeltaLog} import org.apache.spark.sql.delta.actions.{Metadata, Protocol} import org.apache.spark.sql.delta.sources.DeltaSQLConf +import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import org.apache.spark.sql.delta.util.{FileNames, JsonUtils} import io.delta.dynamodbcommitcoordinator.{DynamoDBCommitCoordinatorClient, DynamoDBCommitCoordinatorClientBuilder} import io.delta.storage.commit.{CommitCoordinatorClient, CommitFailedException => JCommitFailedException, GetCommitsResponse => JGetCommitsResponse} @@ -156,10 +158,10 @@ abstract class DynamoDBCommitCoordinatorClientSuite(batchSize: Long) extends CommitCoordinatorClientImplSuiteBase { override protected def createTableCommitCoordinatorClient( - deltaLog: DeltaLog) - : TableCommitCoordinatorClient 
= { + deltaLog: DeltaLog): TableCommitCoordinatorClient = { val cs = TestDynamoDBCommitCoordinatorBuilder(batchSize = batchSize).build(spark, Map.empty) - val tableConf = cs.registerTable(deltaLog.logPath, -1L, Metadata(), Protocol(1, 1)) + val tableConf = cs.registerTable( + deltaLog.logPath, Optional.empty(), -1L, Metadata(), Protocol(1, 1)) TableCommitCoordinatorClient(cs, deltaLog, tableConf.asScala.toMap) } @@ -213,10 +215,7 @@ abstract class DynamoDBCommitCoordinatorClientSuite(batchSize: Long) 1, // writeCapacityUnits skipPathCheck) val tableConf = commitCoordinator.registerTable( - logPath, - -1L, - Metadata(), - Protocol(1, 1)) + logPath, Optional.empty(), -1L, Metadata(), Protocol(1, 1)) val wrongTablePath = new Path(logPath.getParent, "wrongTable") val wrongLogPath = new Path(wrongTablePath, logPath.getName) val fs = wrongLogPath.getFileSystem(log.newDeltaHadoopConf()) diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinatorSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinatorSuite.scala index 647ac025c9b..828f90313ef 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinatorSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/coordinatedcommits/InMemoryCommitCoordinatorSuite.scala @@ -16,10 +16,13 @@ package org.apache.spark.sql.delta.coordinatedcommits +import java.util.Optional + import scala.collection.JavaConverters._ import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.delta.actions.Protocol +import org.apache.spark.sql.delta.test.DeltaTestImplicits._ import io.delta.storage.commit.{GetCommitsResponse => JGetCommitsResponse} import org.apache.hadoop.fs.Path @@ -29,7 +32,8 @@ abstract class InMemoryCommitCoordinatorSuite(batchSize: Int) override protected def createTableCommitCoordinatorClient( deltaLog: DeltaLog): TableCommitCoordinatorClient = { val cs = InMemoryCommitCoordinatorBuilder(batchSize).build(spark, Map.empty) - val conf = cs.registerTable(deltaLog.logPath, -1L, initMetadata, Protocol(1, 1)) + val conf = cs.registerTable( + deltaLog.logPath, Optional.empty(), -1L, initMetadata, Protocol(1, 1)) TableCommitCoordinatorClient(cs, deltaLog, conf.asScala.toMap) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/rowid/RowTrackingBackfillSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/rowid/RowTrackingBackfillSuite.scala index 8eaaeef98a5..1a2a76e9cdb 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/rowid/RowTrackingBackfillSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/rowid/RowTrackingBackfillSuite.scala @@ -460,7 +460,8 @@ class RowTrackingBackfillSuite assert( afterProtocol.minWriterVersion === TableFeatureProtocolUtils.TABLE_FEATURES_MIN_WRITER_VERSION) - assert(afterProtocol.readerFeatures === None) + assert(afterProtocol.readerFeatures === Some(Set( + ColumnMappingTableFeature.name))) assert( afterProtocol.writerFeatures === Some(( prevProtocol.implicitlyAndExplicitlySupportedFeatures ++ diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/MaterializedColumnSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/MaterializedColumnSuite.scala index 551dd9f3702..6b4802f2c05 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/MaterializedColumnSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/MaterializedColumnSuite.scala @@ -89,7 +89,7 @@ class 
MaterializedColumnSuite extends RowIdTestUtils sql(s"ALTER TABLE $testTableName " + s"RENAME COLUMN $testDataColumnName TO `$materializedColumnName`") } - checkError(error, errorClass = "DELTA_ADDING_COLUMN_WITH_INTERNAL_NAME_FAILED", + checkError(error, "DELTA_ADDING_COLUMN_WITH_INTERNAL_NAME_FAILED", parameters = Map("colName" -> materializedColumnName)) } } @@ -111,7 +111,7 @@ class MaterializedColumnSuite extends RowIdTestUtils val error = intercept[DeltaRuntimeException] { sql(s"CREATE OR REPLACE TABLE $targetName SHALLOW CLONE $sourceName") } - checkError(error, errorClass = "DELTA_ADDING_COLUMN_WITH_INTERNAL_NAME_FAILED", + checkError(error, "DELTA_ADDING_COLUMN_WITH_INTERNAL_NAME_FAILED", parameters = Map("colName" -> materializedColumnName)) } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/RowTrackingReadWriteSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/RowTrackingReadWriteSuite.scala index 100a5cfa281..80fc7556251 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/RowTrackingReadWriteSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/rowtracking/RowTrackingReadWriteSuite.scala @@ -220,7 +220,7 @@ class RowTrackingReadWriteSuite extends RowIdTestUtils val errorRowIds = intercept[AnalysisException](sql(insertStmt1 + " VALUES(1, 2)")) checkError( errorRowIds, - errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "UNRESOLVED_COLUMN.WITH_SUGGESTION", parameters = errorRowIds.messageParameters, queryContext = Array(ExpectedContext(insertStmt1, 0, insertStmt1.length - 1))) @@ -229,7 +229,7 @@ class RowTrackingReadWriteSuite extends RowIdTestUtils val errorRowCommitVersions = intercept[AnalysisException](sql(insertStmt2 + " VALUES(1, 2)")) checkError( errorRowCommitVersions, - errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "UNRESOLVED_COLUMN.WITH_SUGGESTION", parameters = errorRowCommitVersions.messageParameters, queryContext = Array(ExpectedContext(insertStmt2, 0, insertStmt2.length - 1))) } @@ -285,7 +285,7 @@ class RowTrackingReadWriteSuite extends RowIdTestUtils } checkError( error, - errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "UNRESOLVED_COLUMN.WITH_SUGGESTION", parameters = error.messageParameters) } @@ -303,7 +303,7 @@ class RowTrackingReadWriteSuite extends RowIdTestUtils } checkError( error, - errorClass = "UNRESOLVED_COLUMN.WITH_SUGGESTION", + "UNRESOLVED_COLUMN.WITH_SUGGESTION", parameters = error.messageParameters) } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/schema/CheckConstraintsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/schema/CheckConstraintsSuite.scala index 7f14b19e3b2..f86fe164e8a 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/schema/CheckConstraintsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/schema/CheckConstraintsSuite.scala @@ -80,7 +80,7 @@ class CheckConstraintsSuite extends QueryTest exception = intercept[AnalysisException] { sql(s"ALTER TABLE $table ADD CONSTRAINT integerVal CHECK (3)") }, - errorClass = "DELTA_NON_BOOLEAN_CHECK_CONSTRAINT", + "DELTA_NON_BOOLEAN_CHECK_CONSTRAINT", parameters = Map( "name" -> "integerVal", "expr" -> "3" @@ -92,10 +92,10 @@ class CheckConstraintsSuite extends QueryTest test("can't add constraint referencing non-existent columns") { withTestTable { table => checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"ALTER TABLE $table ADD CONSTRAINT c CHECK (does_not_exist)") }, - errorClass = 
"UNRESOLVED_COLUMN.WITH_SUGGESTION", + "UNRESOLVED_COLUMN.WITH_SUGGESTION", parameters = Map( "objectName" -> "`does_not_exist`", "proposal" -> "`text`, `num`" @@ -451,7 +451,7 @@ class CheckConstraintsSuite extends QueryTest } checkError( exception, - errorClass = "DELTA_EXCEED_CHAR_VARCHAR_LIMIT", + "DELTA_EXCEED_CHAR_VARCHAR_LIMIT", parameters = Map( "value" -> "a very long string", "expr" -> "((value IS NULL) OR (length(value) <= 12))" @@ -474,7 +474,7 @@ class CheckConstraintsSuite extends QueryTest } checkError( error1, - errorClass = "DELTA_CANNOT_DROP_CHECK_CONSTRAINT_FEATURE", + "DELTA_CANNOT_DROP_CHECK_CONSTRAINT_FEATURE", parameters = Map("constraints" -> "`c1`, `c2`") ) val deltaLog = DeltaLog.forTable(spark, TableIdentifier("table")) @@ -488,7 +488,7 @@ class CheckConstraintsSuite extends QueryTest } checkError( error2, - errorClass = "DELTA_CANNOT_DROP_CHECK_CONSTRAINT_FEATURE", + "DELTA_CANNOT_DROP_CHECK_CONSTRAINT_FEATURE", parameters = Map("constraints" -> "`c2`") ) val featureNames2 = diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/schema/InvariantEnforcementSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/schema/InvariantEnforcementSuite.scala index 157f444b47e..7fe7e1288f7 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/schema/InvariantEnforcementSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/schema/InvariantEnforcementSuite.scala @@ -408,8 +408,9 @@ class InvariantEnforcementSuite extends QueryTest configuration = txn.metadata.configuration + ("delta.constraints.mychk" -> "valueA < valueB")) txn.commit(Seq(newMetadata), DeltaOperations.ManualUpdate) - assert(table.deltaLog.update().protocol.minWriterVersion === - CheckConstraintsTableFeature.minWriterVersion) + val protocol = table.deltaLog.update().protocol + assert(protocol.implicitlyAndExplicitlySupportedFeatures + .contains(CheckConstraintsTableFeature)) spark.sql("INSERT INTO constraint VALUES (50, 100, null)") val e = intercept[InvariantViolationException] { spark.sql("INSERT INTO constraint VALUES (100, 50, null)") diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/schema/SchemaUtilsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/schema/SchemaUtilsSuite.scala index 8dfb54c11f8..a8fbfd51ff8 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/schema/SchemaUtilsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/schema/SchemaUtilsSuite.scala @@ -87,8 +87,8 @@ class SchemaUtilsSuite extends QueryTest val err = getError(e) assert(err.isDefined, "exception with the error class not found") checkError( - exception = err.get, - errorClass = errorClass, + err.get, + errorClass, parameters = params, matchPVals = true) } @@ -1258,6 +1258,35 @@ class SchemaUtilsSuite extends QueryTest } } + test("addColumn - top level array") { + val a = StructField("a", IntegerType) + val b = StructField("b", StringType) + val schema = ArrayType(new StructType().add(a).add(b)) + + val x = StructField("x", LongType) + assert(SchemaUtils.addColumn(schema, x, Seq(0, 1)) === + ArrayType(new StructType().add(a).add(x).add(b))) + } + + test("addColumn - top level map") { + val k = StructField("k", IntegerType) + val v = StructField("v", StringType) + val schema = MapType( + keyType = new StructType().add(k), + valueType = new StructType().add(v)) + + val x = StructField("x", LongType) + assert(SchemaUtils.addColumn(schema, x, Seq(0, 1)) === + MapType( + keyType = new StructType().add(k).add(x), + valueType = new StructType().add(v))) + + 
assert(SchemaUtils.addColumn(schema, x, Seq(1, 1)) === + MapType( + keyType = new StructType().add(k), + valueType = new StructType().add(v).add(x))) + } + //////////////////////////// // dropColumn //////////////////////////// @@ -1511,6 +1540,29 @@ class SchemaUtilsSuite extends QueryTest } } + test("dropColumn - top level array") { + val schema = ArrayType(new StructType().add("a", IntegerType).add("b", StringType)) + + assert(SchemaUtils.dropColumn(schema, Seq(0, 0))._1 === + ArrayType(new StructType().add("b", StringType))) + } + + test("dropColumn - top level map") { + val schema = MapType( + keyType = new StructType().add("k", IntegerType).add("k2", StringType), + valueType = new StructType().add("v", StringType).add("v2", StringType)) + + assert(SchemaUtils.dropColumn(schema, Seq(0, 0))._1 === + MapType( + keyType = new StructType().add("k2", StringType), + valueType = new StructType().add("v", StringType).add("v2", StringType))) + + assert(SchemaUtils.dropColumn(schema, Seq(1, 0))._1 === + MapType( + keyType = new StructType().add("k", IntegerType).add("k2", StringType), + valueType = new StructType().add("v2", StringType))) + } + ///////////////////////////////// // normalizeColumnNamesInDataType ///////////////////////////////// @@ -1680,8 +1732,8 @@ class SchemaUtilsSuite extends QueryTest Seq("x", "Y"), new StructType()) } checkError( - exception = exception, - errorClass = "DELTA_CANNOT_RESOLVE_COLUMN", + exception, + "DELTA_CANNOT_RESOLVE_COLUMN", sqlState = "42703", parameters = Map("columnName" -> "x.Y.bb", "schema" -> "root\n") ) @@ -1948,8 +2000,8 @@ class SchemaUtilsSuite extends QueryTest ) } checkError( - exception = exception, - errorClass = "DELTA_CANNOT_RESOLVE_COLUMN", + exception, + "DELTA_CANNOT_RESOLVE_COLUMN", sqlState = "42703", parameters = Map("columnName" -> "two", "schema" -> tableSchema.treeString) ) @@ -1974,8 +2026,8 @@ class SchemaUtilsSuite extends QueryTest ) } checkError( - exception = exception, - errorClass = "DELTA_CANNOT_RESOLVE_COLUMN", + exception, + "DELTA_CANNOT_RESOLVE_COLUMN", sqlState = "42703", parameters = Map("columnName" -> "s.two", "schema" -> tableSchema.treeString) ) @@ -2348,8 +2400,8 @@ class SchemaUtilsSuite extends QueryTest mergeSchemas(longType, sourceType) } checkError( - exception = e.getCause.asInstanceOf[AnalysisException], - errorClass = "DELTA_MERGE_INCOMPATIBLE_DATATYPE", + e.getCause.asInstanceOf[AnalysisException], + "DELTA_MERGE_INCOMPATIBLE_DATATYPE", parameters = Map("currentDataType" -> "LongType", "updateDataType" -> sourceType.head.dataType.toString)) } @@ -2584,6 +2636,45 @@ class SchemaUtilsSuite extends QueryTest assert(update === res3) } + test("transform top level array type") { + val at = ArrayType( + new StructType() + .add("s1", IntegerType) + ) + + var visitedFields = 0 + val updated = SchemaMergingUtils.transformColumns(at) { + case (_, field, _) => + visitedFields += 1 + field.copy(name = "s1_1", dataType = StringType) + } + + assert(visitedFields === 1) + assert(updated === ArrayType(new StructType().add("s1_1", StringType))) + } + + test("transform top level map type") { + val mt = MapType( + new StructType() + .add("k1", IntegerType), + new StructType() + .add("v1", IntegerType) + ) + + var visitedFields = 0 + val updated = SchemaMergingUtils.transformColumns(mt) { + case (_, field, _) => + visitedFields += 1 + field.copy(name = field.name + "_1", dataType = StringType) + } + + assert(visitedFields === 2) + assert(updated === MapType( + new StructType().add("k1_1", StringType), + new 
StructType().add("v1_1", StringType) + )) + } + //////////////////////////// // pruneEmptyStructs //////////////////////////// @@ -2637,10 +2728,10 @@ class SchemaUtilsSuite extends QueryTest badCharacters.foreach { char => Seq(s"a${char}b", s"${char}ab", s"ab${char}", char.toString).foreach { name => checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { SchemaUtils.checkFieldNames(Seq(name)) }, - errorClass = "DELTA_INVALID_CHARACTERS_IN_COLUMN_NAME", + "DELTA_INVALID_CHARACTERS_IN_COLUMN_NAME", parameters = Map("columnName" -> s"$name") ) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/skipping/ClusteredTableTestUtils.scala b/spark/src/test/scala/org/apache/spark/sql/delta/skipping/ClusteredTableTestUtils.scala index 1ab4f0ce10e..3757793e5d9 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/skipping/ClusteredTableTestUtils.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/skipping/ClusteredTableTestUtils.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.delta.skipping.clustering.temp.ClusterBySpec import org.apache.spark.sql.delta.{DeltaLog, Snapshot} import org.apache.spark.sql.delta.DeltaOperations.{CLUSTERING_PARAMETER_KEY, ZORDER_PARAMETER_KEY} import org.apache.spark.sql.delta.commands.optimize.OptimizeMetrics +import org.apache.spark.sql.delta.coordinatedcommits.CoordinatedCommitsBaseSuite import org.apache.spark.sql.delta.hooks.UpdateCatalog import org.apache.spark.sql.delta.sources.DeltaSQLConf import org.apache.spark.sql.delta.util.JsonUtils @@ -32,7 +33,10 @@ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.util.Utils -trait ClusteredTableTestUtilsBase extends SparkFunSuite with SharedSparkSession { +trait ClusteredTableTestUtilsBase + extends SparkFunSuite + with SharedSparkSession + with CoordinatedCommitsBaseSuite { import testImplicits._ /** @@ -161,6 +165,23 @@ trait ClusteredTableTestUtilsBase extends SparkFunSuite with SharedSparkSession } } + protected def deleteTableFromCommitCoordinatorIfNeeded(table: String): Unit = { + if (coordinatedCommitsEnabledInTests) { + // Clean up the table data in commit coordinator because DROP/REPLACE TABLE does not bother + // commit coordinator. 
+ deleteTableFromCommitCoordinator(table) + } + } + + override def withTable(tableNames: String*)(f: => Unit): Unit = { + Utils.tryWithSafeFinally(f) { + tableNames.foreach { name => + deleteTableFromCommitCoordinatorIfNeeded(name) + spark.sql(s"DROP TABLE IF EXISTS $name") + } + } + } + def withClusteredTable[T]( table: String, schema: String, @@ -170,6 +191,7 @@ trait ClusteredTableTestUtilsBase extends SparkFunSuite with SharedSparkSession createOrReplaceClusteredTable("CREATE", table, schema, clusterBy, tableProperties, location) Utils.tryWithSafeFinally(f) { + deleteTableFromCommitCoordinatorIfNeeded(table) spark.sql(s"DROP TABLE IF EXISTS $table") } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/skipping/clustering/ClusteredTableDDLSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/skipping/clustering/ClusteredTableDDLSuite.scala index b522aa16e58..a2fe9f5bd8c 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/skipping/clustering/ClusteredTableDDLSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/skipping/clustering/ClusteredTableDDLSuite.scala @@ -267,8 +267,8 @@ trait ClusteredTableCreateOrReplaceDDLSuiteBase extends QueryTest assert(dataTypeOpt.nonEmpty, s"Can't find column $colName " + s"in schema ${tableSchema.treeString}") checkError( - exception = e, - errorClass = "DELTA_CLUSTERING_COLUMNS_DATATYPE_NOT_SUPPORTED", + e, + "DELTA_CLUSTERING_COLUMNS_DATATYPE_NOT_SUPPORTED", parameters = Map("columnsWithDataTypes" -> s"$colName : ${dataTypeOpt.get.sql}") ) } @@ -287,8 +287,8 @@ trait ClusteredTableCreateOrReplaceDDLSuiteBase extends QueryTest "CREATE", testTable, "a INT, b INT, c INT, d INT, e INT", "a, b, c, d, e") } checkError( - exception = e, - errorClass = "DELTA_CLUSTER_BY_INVALID_NUM_COLUMNS", + e, + "DELTA_CLUSTER_BY_INVALID_NUM_COLUMNS", parameters = Map("numColumnsLimit" -> "4", "actualNumColumns" -> "5") ) } @@ -305,8 +305,8 @@ trait ClusteredTableCreateOrReplaceDDLSuiteBase extends QueryTest "CREATE", testTable, sourceTable, "a, b, c, d, e", location = location) } checkError( - exception = e, - errorClass = "DELTA_CLUSTER_BY_INVALID_NUM_COLUMNS", + e, + "DELTA_CLUSTER_BY_INVALID_NUM_COLUMNS", parameters = Map("numColumnsLimit" -> "4", "actualNumColumns" -> "5") ) } @@ -354,8 +354,8 @@ trait ClusteredTableCreateOrReplaceDDLSuiteBase extends QueryTest indexedColumns, Some(tableSchema))) checkError( - exception = e, - errorClass = "DELTA_CLUSTERING_COLUMN_MISSING_STATS", + e, + "DELTA_CLUSTERING_COLUMN_MISSING_STATS", parameters = Map( "columns" -> "col1.col12, col2", "schema" -> """root @@ -411,8 +411,8 @@ trait ClusteredTableCreateOrReplaceDDLSuiteBase extends QueryTest None, location = Some(dir.getPath))) checkError( - exception = e, - errorClass = "DELTA_CLUSTERING_COLUMN_MISSING_STATS", + e, + "DELTA_CLUSTERING_COLUMN_MISSING_STATS", parameters = Map( "columns" -> "col1.col12, col2", "schema" -> """root @@ -456,8 +456,8 @@ trait ClusteredTableCreateOrReplaceDDLSuiteBase extends QueryTest indexedColumns, Some(nonEligibleTableSchema))) checkError( - exception = e, - errorClass = "DELTA_CLUSTERING_COLUMNS_DATATYPE_NOT_SUPPORTED", + e, + "DELTA_CLUSTERING_COLUMNS_DATATYPE_NOT_SUPPORTED", parameters = Map("columnsWithDataTypes" -> "col1.col11 : ARRAY") ) } @@ -553,8 +553,8 @@ trait ClusteredTableDDLWithColumnMapping sql(s"ALTER TABLE $testTable DROP COLUMNS (col1)") } checkError( - exception = e, - errorClass = "DELTA_UNSUPPORTED_DROP_CLUSTERING_COLUMN", + e, + "DELTA_UNSUPPORTED_DROP_CLUSTERING_COLUMN", parameters = 
Map("columnList" -> "col1") ) // Drop non-clustering columns are allowed. @@ -568,8 +568,8 @@ trait ClusteredTableDDLWithColumnMapping sql(s"ALTER TABLE $testTable DROP COLUMNS (col1, col2)") } checkError( - exception = e, - errorClass = "DELTA_UNSUPPORTED_DROP_CLUSTERING_COLUMN", + e, + "DELTA_UNSUPPORTED_DROP_CLUSTERING_COLUMN", parameters = Map("columnList" -> "col1,col2") ) } @@ -582,8 +582,8 @@ trait ClusteredTableDDLWithColumnMapping sql(s"ALTER TABLE $testTable DROP COLUMNS (col1, col3)") } checkError( - exception = e, - errorClass = "DELTA_UNSUPPORTED_DROP_CLUSTERING_COLUMN", + e, + "DELTA_UNSUPPORTED_DROP_CLUSTERING_COLUMN", parameters = Map("columnList" -> "col1") ) } @@ -659,7 +659,7 @@ trait ClusteredTableDDLSuiteBase } checkError( e, - errorClass = "DELTA_CLUSTER_BY_INVALID_NUM_COLUMNS", + "DELTA_CLUSTER_BY_INVALID_NUM_COLUMNS", parameters = Map( "numColumnsLimit" -> "4", "actualNumColumns" -> "5") @@ -782,8 +782,8 @@ trait ClusteredTableDDLSuiteBase sql(s"OPTIMIZE $testTable ZORDER BY (a)") } checkError( - exception = e2, - errorClass = "DELTA_CLUSTERING_WITH_ZORDER_BY", + e2, + "DELTA_CLUSTERING_WITH_ZORDER_BY", parameters = Map("zOrderBy" -> "a") ) } @@ -911,7 +911,7 @@ trait ClusteredTableDDLSuiteBase } checkError( e, - errorClass = "DELTA_CANNOT_MODIFY_TABLE_PROPERTY", + "DELTA_CANNOT_MODIFY_TABLE_PROPERTY", parameters = Map("prop" -> "clusteringColumns")) } } @@ -1169,7 +1169,7 @@ trait ClusteredTableDDLDataSourceV2SuiteBase } checkError( e, - errorClass = "DELTA_CREATE_TABLE_WITH_DIFFERENT_CLUSTERING", + "DELTA_CREATE_TABLE_WITH_DIFFERENT_CLUSTERING", parameters = Map( "path" -> dir.toURI.toString.stripSuffix("/"), "specifiedColumns" -> "", @@ -1194,7 +1194,7 @@ trait ClusteredTableDDLDataSourceV2SuiteBase } checkError( e, - errorClass = "DELTA_CREATE_TABLE_WITH_DIFFERENT_CLUSTERING", + "DELTA_CREATE_TABLE_WITH_DIFFERENT_CLUSTERING", parameters = Map( "path" -> dir.toURI.toString.stripSuffix("/"), "specifiedColumns" -> "col2", @@ -1235,7 +1235,7 @@ trait ClusteredTableDDLDataSourceV2SuiteBase } checkError( e, - errorClass = "DELTA_CREATE_TABLE_WITH_DIFFERENT_CLUSTERING", + "DELTA_CREATE_TABLE_WITH_DIFFERENT_CLUSTERING", parameters = Map( "path" -> dir.toURI.toString.stripSuffix("/"), "specifiedColumns" -> "col1", diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/test/DeltaTestImplicits.scala b/spark/src/test/scala/org/apache/spark/sql/delta/test/DeltaTestImplicits.scala index 4363ec6368e..ac0206e4321 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/test/DeltaTestImplicits.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/test/DeltaTestImplicits.scala @@ -23,8 +23,10 @@ import org.apache.spark.sql.delta.DeltaOperations.{ManualUpdate, Operation, Writ import org.apache.spark.sql.delta.actions.{Action, AddFile, Metadata, Protocol} import org.apache.spark.sql.delta.catalog.DeltaTableV2 import org.apache.spark.sql.delta.commands.optimize.OptimizeMetrics +import org.apache.spark.sql.delta.coordinatedcommits.TableCommitCoordinatorClient import org.apache.spark.sql.delta.hooks.AutoCompact import org.apache.spark.sql.delta.stats.StatisticsCollection +import io.delta.storage.commit.{CommitResponse, GetCommitsResponse, UpdatedActions} import org.apache.hadoop.fs.Path import org.apache.spark.sql.{SaveMode, SparkSession} @@ -91,6 +93,40 @@ object DeltaTestImplicits { } } + /** Helper class for working with [[TableCommitCoordinatorClient]] */ + implicit class TableCommitCoordinatorClientTestHelper( + tableCommitCoordinatorClient: 
TableCommitCoordinatorClient) { + + def commit( + commitVersion: Long, + actions: Iterator[String], + updatedActions: UpdatedActions): CommitResponse = { + tableCommitCoordinatorClient.commit( + commitVersion, actions, updatedActions, tableIdentifierOpt = None) + } + + def getCommits( + startVersion: Option[Long] = None, + endVersion: Option[Long] = None): GetCommitsResponse = { + tableCommitCoordinatorClient.getCommits(tableIdentifierOpt = None, startVersion, endVersion) + } + + def backfillToVersion( + version: Long, + lastKnownBackfilledVersion: Option[Long] = None): Unit = { + tableCommitCoordinatorClient.backfillToVersion( + tableIdentifierOpt = None, version, lastKnownBackfilledVersion) + } + } + + + /** Helper class for working with [[Snapshot]] */ + implicit class SnapshotTestHelper(snapshot: Snapshot) { + def ensureCommitFilesBackfilled(): Unit = { + snapshot.ensureCommitFilesBackfilled(tableIdentifierOpt = None) + } + } + /** * Helper class for working with the most recent snapshot in the deltaLog */ diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableNestedSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableNestedSuite.scala index 15c0594c8b4..476bfecbdec 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableNestedSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableNestedSuite.scala @@ -57,8 +57,8 @@ trait TypeWideningAlterTableNestedTests { // Running ALTER TABLE CHANGE COLUMN on non-leaf fields is invalid. var alterTableSql = s"ALTER TABLE delta.`$tempPath` CHANGE COLUMN s TYPE struct" checkError( - exception = intercept[AnalysisException] { sql(alterTableSql) }, - errorClass = "CANNOT_UPDATE_FIELD.STRUCT_TYPE", + intercept[AnalysisException] { sql(alterTableSql) }, + "CANNOT_UPDATE_FIELD.STRUCT_TYPE", parameters = Map( "table" -> s"`spark_catalog`.`delta`.`$tempPath`", "fieldName" -> "`s`" @@ -71,8 +71,8 @@ trait TypeWideningAlterTableNestedTests { alterTableSql = s"ALTER TABLE delta.`$tempPath` CHANGE COLUMN m TYPE map" checkError( - exception = intercept[AnalysisException] { sql(alterTableSql) }, - errorClass = "CANNOT_UPDATE_FIELD.MAP_TYPE", + intercept[AnalysisException] { sql(alterTableSql) }, + "CANNOT_UPDATE_FIELD.MAP_TYPE", parameters = Map( "table" -> s"`spark_catalog`.`delta`.`$tempPath`", "fieldName" -> "`m`" @@ -85,8 +85,8 @@ trait TypeWideningAlterTableNestedTests { alterTableSql = s"ALTER TABLE delta.`$tempPath` CHANGE COLUMN a TYPE array" checkError( - exception = intercept[AnalysisException] { sql(alterTableSql) }, - errorClass = "CANNOT_UPDATE_FIELD.ARRAY_TYPE", + intercept[AnalysisException] { sql(alterTableSql) }, + "CANNOT_UPDATE_FIELD.ARRAY_TYPE", parameters = Map( "table" -> s"`spark_catalog`.`delta`.`$tempPath`", "fieldName" -> "`a`" diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableSuite.scala index 70f19ed4154..f772c9ca2aa 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningAlterTableSuite.scala @@ -99,10 +99,10 @@ trait TypeWideningAlterTableTests // are rejected in Delta when the ALTER TABLE command is executed. 
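The `TableCommitCoordinatorClientTestHelper` implicit added to `DeltaTestImplicits` above fills in `tableIdentifierOpt = None`, so existing test call sites keep their pre-refactor shape. A rough usage sketch; the `tableCommitCoordinatorClient` value is assumed to come from a suite's `createTableCommitCoordinatorClient(deltaLog)`:

  import org.apache.spark.sql.delta.test.DeltaTestImplicits._

  // Named arguments match the helper signatures shown above; no table identifier is needed.
  val response = tableCommitCoordinatorClient.getCommits(startVersion = Some(0L))
  tableCommitCoordinatorClient.backfillToVersion(version = 1L)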
if (Cast.canUpCast(testCase.fromType, testCase.toType)) { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql(alterTableSql) }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", sqlState = None, parameters = Map( "fieldPath" -> "value", @@ -111,10 +111,10 @@ trait TypeWideningAlterTableTests ) } else { checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(alterTableSql) }, - errorClass = "NOT_SUPPORTED_CHANGE_COLUMN", + "NOT_SUPPORTED_CHANGE_COLUMN", sqlState = None, parameters = Map( "table" -> s"`spark_catalog`.`delta`.`$tempPath`", @@ -176,10 +176,10 @@ trait TypeWideningAlterTableTests .mkString(", ") checkError( - exception = intercept[DeltaTableFeatureException] { + intercept[DeltaTableFeatureException] { sql(s"ALTER TABLE delta.`$tempPath` CHANGE COLUMN a TYPE TIMESTAMP_NTZ") }, - errorClass = "DELTA_FEATURES_REQUIRE_MANUAL_ENABLEMENT", + "DELTA_FEATURES_REQUIRE_MANUAL_ENABLEMENT", parameters = Map( "unsupportedFeatures" -> "timestampNtz", "supportedFeatures" -> currentFeatures diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningConstraintsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningConstraintsSuite.scala index 1c6f8b05e77..17e75bbb2d8 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningConstraintsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningConstraintsSuite.scala @@ -57,10 +57,10 @@ trait TypeWideningConstraintsTests { self: QueryTest with TypeWideningTestMixin // Changing the type of a column that a CHECK constraint depends on is not allowed. checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("ALTER TABLE t CHANGE COLUMN a TYPE SMALLINT") }, - errorClass = "DELTA_CONSTRAINT_DEPENDENT_COLUMN_CHANGE", + "DELTA_CONSTRAINT_DEPENDENT_COLUMN_CHANGE", parameters = Map( "columnName" -> "a", "constraints" -> "delta.constraints.ck -> hash ( a ) > 0" @@ -81,10 +81,10 @@ trait TypeWideningConstraintsTests { self: QueryTest with TypeWideningTestMixin checkAnswer(sql("SELECT hash(a.x) FROM t"), Row(1765031574)) checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("ALTER TABLE t CHANGE COLUMN a.x TYPE SMALLINT") }, - errorClass = "DELTA_CONSTRAINT_DEPENDENT_COLUMN_CHANGE", + "DELTA_CONSTRAINT_DEPENDENT_COLUMN_CHANGE", parameters = Map( "columnName" -> "a.x", "constraints" -> "delta.constraints.ck -> hash ( a . 
x ) > 0" @@ -105,10 +105,10 @@ trait TypeWideningConstraintsTests { self: QueryTest with TypeWideningTestMixin withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true") { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("INSERT INTO t VALUES (200)") }, - errorClass = "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", + "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", parameters = Map( "columnName" -> "a", "columnType" -> "TINYINT", @@ -128,30 +128,63 @@ trait TypeWideningConstraintsTests { self: QueryTest with TypeWideningTestMixin withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true") { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("INSERT INTO t (a) VALUES (named_struct('x', 200, 'y', CAST(5 AS byte)))") }, - errorClass = "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", + "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", parameters = Map( - "columnName" -> "a", - "columnType" -> "STRUCT", - "dataType" -> "STRUCT", + "columnName" -> "a.x", + "columnType" -> "TINYINT", + "dataType" -> "INT", "constraints" -> "delta.constraints.ck -> hash ( a . x ) > 0" - )) + ) + ) + + // changing the type of struct field `a.y` when it's not + // the field referenced by the CHECK constraint is allowed. + sql("INSERT INTO t (a) VALUES (named_struct('x', CAST(2 AS byte), 'y', 500))") + checkAnswer(sql("SELECT hash(a.x) FROM t"), Seq(Row(1765031574), Row(1765031574))) + } + } + } - // We're currently too strict and reject changing the type of struct field a.y even though - // it's not the field referenced by the CHECK constraint. + test("check constraint on nested field with complex type evolution") { + withTable("t") { + sql("CREATE TABLE t (a struct, y: byte>) USING DELTA") + sql("ALTER TABLE t ADD CONSTRAINT ck CHECK (hash(a.x.z) > 0)") + sql("INSERT INTO t (a) VALUES (named_struct('x', named_struct('z', 2, 'h', 3), 'y', 4))") + checkAnswer(sql("SELECT hash(a.x.z) FROM t"), Row(1765031574)) + + withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true") { checkError( - exception = intercept[DeltaAnalysisException] { - sql("INSERT INTO t (a) VALUES (named_struct('x', CAST(2 AS byte), 'y', 500))") + intercept[DeltaAnalysisException] { + sql( + s""" + | INSERT INTO t (a) VALUES ( + | named_struct('x', named_struct('z', 200, 'h', 3), 'y', 4) + | ) + |""".stripMargin + ) }, - errorClass = "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", + "DELTA_CONSTRAINT_DATA_TYPE_MISMATCH", parameters = Map( - "columnName" -> "a", - "columnType" -> "STRUCT", - "dataType" -> "STRUCT", - "constraints" -> "delta.constraints.ck -> hash ( a . x ) > 0" - )) + "columnName" -> "a.x.z", + "columnType" -> "TINYINT", + "dataType" -> "INT", + "constraints" -> "delta.constraints.ck -> hash ( a . x . z ) > 0" + ) + ) + + // changing the type of struct field `a.y` and `a.x.h` when it's not + // the field referenced by the CHECK constraint is allowed. 
+ sql( + """ + | INSERT INTO t (a) VALUES ( + | named_struct('x', named_struct('z', CAST(2 AS BYTE), 'h', 2002), 'y', 1030) + | ) + |""".stripMargin + ) + checkAnswer(sql("SELECT hash(a.x.z) FROM t"), Seq(Row(1765031574), Row(1765031574))) } } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningFeatureCompatibilitySuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningFeatureCompatibilitySuite.scala index 67250fe0bb6..d524d2fcd62 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningFeatureCompatibilitySuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningFeatureCompatibilitySuite.scala @@ -53,10 +53,10 @@ trait TypeWideningCompatibilityTests { .drop(CDCReader.CDC_COMMIT_VERSION) checkErrorMatchPVals( - exception = intercept[DeltaUnsupportedOperationException] { + intercept[DeltaUnsupportedOperationException] { readCDF(start = 1, end = 1).collect() }, - errorClass = "DELTA_CHANGE_DATA_FEED_INCOMPATIBLE_DATA_SCHEMA", + "DELTA_CHANGE_DATA_FEED_INCOMPATIBLE_DATA_SCHEMA", parameters = Map( "start" -> "1", "end" -> "1", @@ -92,10 +92,10 @@ trait TypeWideningCompatibilityTests { checkAnswer(readCDF(start = 1, end = 1), Seq(Row(1, "insert"), Row(2, "insert"))) checkErrorMatchPVals( - exception = intercept[DeltaUnsupportedOperationException] { + intercept[DeltaUnsupportedOperationException] { readCDF(start = 1, end = 3) }, - errorClass = "DELTA_CHANGE_DATA_FEED_INCOMPATIBLE_SCHEMA_CHANGE", + "DELTA_CHANGE_DATA_FEED_INCOMPATIBLE_SCHEMA_CHANGE", parameters = Map( "start" -> "1", "end" -> "3", diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningGeneratedColumnsSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningGeneratedColumnsSuite.scala index 032cd1d8a99..7f8ebc2033d 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningGeneratedColumnsSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningGeneratedColumnsSuite.scala @@ -48,10 +48,10 @@ trait TypeWideningGeneratedColumnTests extends GeneratedColumnTest { // Changing the type of a column that a generated column depends on is not allowed. 
checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("ALTER TABLE t CHANGE COLUMN a TYPE SMALLINT") }, - errorClass = "DELTA_GENERATED_COLUMNS_DEPENDENT_COLUMN_CHANGE", + "DELTA_GENERATED_COLUMNS_DEPENDENT_COLUMN_CHANGE", parameters = Map( "columnName" -> "a", "generatedColumns" -> "gen -> hash(a)" @@ -77,10 +77,10 @@ trait TypeWideningGeneratedColumnTests extends GeneratedColumnTest { checkAnswer(sql("SELECT hash(a.x) FROM t"), Row(1765031574)) checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("ALTER TABLE t CHANGE COLUMN a.x TYPE SMALLINT") }, - errorClass = "DELTA_GENERATED_COLUMNS_DEPENDENT_COLUMN_CHANGE", + "DELTA_GENERATED_COLUMNS_DEPENDENT_COLUMN_CHANGE", parameters = Map( "columnName" -> "a.x", "generatedColumns" -> "gen -> hash(a.x)" @@ -106,10 +106,10 @@ trait TypeWideningGeneratedColumnTests extends GeneratedColumnTest { withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true") { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("INSERT INTO t (a) VALUES (200)") }, - errorClass = "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", + "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", parameters = Map( "columnName" -> "a", "columnType" -> "TINYINT", @@ -130,34 +130,73 @@ trait TypeWideningGeneratedColumnTests extends GeneratedColumnTest { partitionColumns = Seq.empty ) sql("INSERT INTO t (a) VALUES (named_struct('x', 2, 'y', 3))") - checkAnswer(sql("SELECT hash(a.x) FROM t"), Row(1765031574)) + checkAnswer(sql("SELECT gen FROM t"), Row(1765031574)) withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true") { checkError( - exception = intercept[DeltaAnalysisException] { + intercept[DeltaAnalysisException] { sql("INSERT INTO t (a) VALUES (named_struct('x', 200, 'y', CAST(5 AS byte)))") }, - errorClass = "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", + "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", parameters = Map( - "columnName" -> "a", - "columnType" -> "STRUCT", - "dataType" -> "STRUCT", + "columnName" -> "a.x", + "columnType" -> "TINYINT", + "dataType" -> "INT", "generatedColumns" -> "gen -> hash(a.x)" - )) + ) + ) - // We're currently too strict and reject changing the type of struct field a.y even though - // it's not the field referenced by the generated column. + // changing the type of struct field `a.y` when it's not + // the field referenced by the generated column is allowed. 
+ sql("INSERT INTO t (a) VALUES (named_struct('x', CAST(2 AS byte), 'y', 200))") + checkAnswer(sql("SELECT gen FROM t"), Seq(Row(1765031574), Row(1765031574))) + } + } + } + + test("generated column on nested field with complex type evolution") { + withTable("t") { + createTable( + tableName = "t", + path = None, + schemaString = "a struct, y: byte>, gen int", + generatedColumns = Map("gen" -> "hash(a.x.z)"), + partitionColumns = Seq.empty + ) + + sql("INSERT INTO t (a) VALUES (named_struct('x', named_struct('z', 2, 'h', 3), 'y', 4))") + checkAnswer(sql("SELECT gen FROM t"), Row(1765031574)) + + withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "true") { checkError( - exception = intercept[DeltaAnalysisException] { - sql("INSERT INTO t (a) VALUES (named_struct('x', CAST(2 AS byte), 'y', 200))") + intercept[DeltaAnalysisException] { + sql( + s""" + | INSERT INTO t (a) VALUES ( + | named_struct('x', named_struct('z', 200, 'h', 3), 'y', 4) + | ) + |""".stripMargin + ) }, - errorClass = "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", + "DELTA_GENERATED_COLUMNS_DATA_TYPE_MISMATCH", parameters = Map( - "columnName" -> "a", - "columnType" -> "STRUCT", - "dataType" -> "STRUCT", - "generatedColumns" -> "gen -> hash(a.x)" - )) + "columnName" -> "a.x.z", + "columnType" -> "TINYINT", + "dataType" -> "INT", + "generatedColumns" -> "gen -> hash(a.x.z)" + ) + ) + + // changing the type of struct field `a.y` when it's not + // the field referenced by the generated column is allowed. + sql( + """ + | INSERT INTO t (a) VALUES ( + | named_struct('x', named_struct('z', CAST(2 AS BYTE), 'h', 2002), 'y', 1030) + | ) + |""".stripMargin + ) + checkAnswer(sql("SELECT gen FROM t"), Seq(Row(1765031574), Row(1765031574))) } } } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningInsertSchemaEvolutionSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningInsertSchemaEvolutionSuite.scala index e97bf55f1af..a1b5028b6db 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningInsertSchemaEvolutionSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningInsertSchemaEvolutionSuite.scala @@ -25,7 +25,6 @@ import org.apache.spark.SparkConf import org.apache.spark.sql.{DataFrame, Dataset, QueryTest, Row, SaveMode} import org.apache.spark.sql.catalyst.plans.logical.{AppendData, LogicalPlan} import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation -import org.apache.spark.sql.functions.{col, lit} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.StoreAssignmentPolicy import org.apache.spark.sql.types._ @@ -51,7 +50,9 @@ class TypeWideningInsertSchemaEvolutionSuite /** * Tests covering type widening during schema evolution in INSERT. */ -trait TypeWideningInsertSchemaEvolutionTests extends TypeWideningTestCases { +trait TypeWideningInsertSchemaEvolutionTests + extends DeltaInsertIntoTest + with TypeWideningTestCases { self: QueryTest with TypeWideningTestMixin with DeltaDMLTestUtils => import testImplicits._ @@ -160,256 +161,57 @@ trait TypeWideningInsertSchemaEvolutionTests extends TypeWideningTestCases { checkAnswer(readDeltaTable(tempPath), Row(1)) } - - /** - * There are **many** different ways to run an insert: - * - Using SQL or the dataframe v1 and v2 APIs. - * - Append vs. Overwrite / Partition overwrite. - * - Position-based vs. name-based resolution. - * - * Each take a unique path through analysis. 
The abstractions below captures these different - * inserts to allow more easily running tests with all or a subset of them. - * - * @param mode Append or Overwrite. This dictates in particular what the expected result after the - * insert should be. - * @param name A human-readable name for the insert type displayed in the test names. - */ - trait Insert { - val mode: SaveMode - val name: String - - /** - * The method that tests will call to run the insert. Each type of insert must implement its - * sepcific way to run insert. - */ - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit - - /** SQL keyword for this type of insert. */ - def intoOrOverwrite: String = if (mode == SaveMode.Append) "INTO" else "OVERWRITE" - - /** The expected content of the table after the insert. */ - def expectedResult(initialDF: DataFrame, insertedDF: DataFrame): DataFrame = - if (mode == SaveMode.Overwrite) insertedDF - else initialDF.unionByName(insertedDF, allowMissingColumns = true) - } - - /** INSERT INTO/OVERWRITE */ - case class SQLInsertByPosition(mode: SaveMode) extends Insert { - val name: String = s"INSERT $intoOrOverwrite" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = - sql(s"INSERT $intoOrOverwrite target SELECT * FROM source") - } - - /** INSERT INTO/OVERWRITE (a, b) */ - case class SQLInsertColList(mode: SaveMode) extends Insert { - val name: String = s"INSERT $intoOrOverwrite (columns) - $mode" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { - val colList = columns.mkString(", ") - sql(s"INSERT $intoOrOverwrite target ($colList) SELECT $colList FROM source") - } - } - - /** INSERT INTO/OVERWRITE BY NAME */ - case class SQLInsertByName(mode: SaveMode) extends Insert { - val name: String = s"INSERT $intoOrOverwrite BY NAME - $mode" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = - sql(s"INSERT $intoOrOverwrite target SELECT ${columns.mkString(", ")} FROM source") - } - - /** INSERT INTO REPLACE WHERE */ - object SQLInsertOverwriteReplaceWhere extends Insert { - val mode: SaveMode = SaveMode.Overwrite - val name: String = s"INSERT INTO REPLACE WHERE" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = - sql(s"INSERT INTO target REPLACE WHERE $whereCol = $whereValue " + - s"SELECT ${columns.mkString(", ")} FROM source") - } - - /** INSERT OVERWRITE PARTITION (part = 1) */ - object SQLInsertOverwritePartitionByPosition extends Insert { - val mode: SaveMode = SaveMode.Overwrite - val name: String = s"INSERT OVERWRITE PARTITION (partition)" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { - val assignments = columns.filterNot(_ == whereCol).mkString(", ") - sql(s"INSERT OVERWRITE target PARTITION ($whereCol = $whereValue) " + - s"SELECT $assignments FROM source") - } - } - - /** INSERT OVERWRITE PARTITION (part = 1) (a, b) */ - object SQLInsertOverwritePartitionColList extends Insert { - val mode: SaveMode = SaveMode.Overwrite - val name: String = s"INSERT OVERWRITE PARTITION (partition) (columns)" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { - val assignments = columns.filterNot(_ == whereCol).mkString(", ") - sql(s"INSERT OVERWRITE target PARTITION ($whereCol = $whereValue) ($assignments) " + - s"SELECT $assignments FROM source") - } - } - - /** df.write.mode(mode).insertInto() */ - case class DFv1InsertInto(mode: SaveMode) extends Insert { - val name: String = s"DFv1 
insertInto() - $mode" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = - spark.read.table("source").write.mode(mode).insertInto("target") - } - - /** df.write.mode(mode).saveAsTable() */ - case class DFv1SaveAsTable(mode: SaveMode) extends Insert { - val name: String = s"DFv1 saveAsTable() - $mode" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { - spark.read.table("source").write.mode(mode).format("delta").saveAsTable("target") - } - } - - /** df.writeTo.append() */ - object DFv2Append extends Insert { self: Insert => - val mode: SaveMode = SaveMode.Append - val name: String = "DFv2 append()" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { - spark.read.table("source").writeTo("target").append() - } - } - - /** df.writeTo.overwrite() */ - object DFv2Overwrite extends Insert { self: Insert => - val mode: SaveMode = SaveMode.Overwrite - val name: String = s"DFv2 overwrite()" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { - spark.read.table("source").writeTo("target").overwrite(col(whereCol) === lit(whereValue)) - } - } - - /** df.writeTo.overwritePartitions() */ - object DFv2OverwritePartition extends Insert { self: Insert => - override val mode: SaveMode = SaveMode.Overwrite - val name: String = s"DFv2 overwritePartitions()" - def runInsert(columns: Seq[String], whereCol: String, whereValue: Int): Unit = { - spark.read.table("source").writeTo("target").overwritePartitions() - } - } - - /** Collects all the types of insert previously defined. */ - protected lazy val allInsertTypes: Seq[Insert] = Seq( - SQLInsertOverwriteReplaceWhere, - SQLInsertOverwritePartitionByPosition, - SQLInsertOverwritePartitionColList, - DFv2Append, - DFv2Overwrite, - DFv2OverwritePartition - ) ++ (for { - mode: SaveMode <- Seq(SaveMode.Append, SaveMode.Overwrite) - insert: Insert <- Seq( - SQLInsertByPosition(mode), - SQLInsertColList(mode), - SQLInsertByName(mode), - DFv1InsertInto(mode), - DFv1SaveAsTable(mode) - ) - } yield insert) - - /** - * Test runner for type evolution in INSERT. - * @param name Test name - * @param initialSchemaDDL Initial schema of the table to be inserted into (as a DDL string). - * @param initialJsonData Initial data present in the table to be inserted into (as a JSON - * string). - * @param partitionBy Partition columns for the initial table. - * @param insertSchemaDDL Schema of the data to be inserted (as a DDL string). - * @param insertJsonData Data to be inserted (as a JSON string) - * @param overwriteWhere Where clause for overwrite PARTITION / REPLACE WHERE (as - * colName -> value) - * @param expectedSchema Expected schema of the table after the insert. - * @param includeInserts List of insert types to run the test with. Defaults to all inserts. - * @param excludeInserts List of insert types to exclude when running the test. Defaults to no - * inserts excluded. 
- */ - def testInsertTypeEvolution(name: String)( - initialSchemaDDL: String, - initialJsonData: Seq[String], - partitionBy: Seq[String] = Seq.empty, - insertSchemaDDL: String, - insertJsonData: Seq[String], - overwriteWhere: (String, Int), - expectedSchema: StructType, - includeInserts: Seq[Insert] = allInsertTypes, - excludeInserts: Seq[Insert] = Seq.empty): Unit = { - for (insert <- includeInserts.filterNot(excludeInserts.toSet)) { - test(s"${insert.name} - $name") { - withTable("source", "target") { - val initialDF = readFromJSON(initialJsonData, StructType.fromDDL(initialSchemaDDL)) - val writer = initialDF.write.format("delta") - if (partitionBy.nonEmpty) { - writer.partitionBy(partitionBy: _*) - } - writer.saveAsTable("target") - // Write the data to insert to a table so that we can use it in both SQL and dataframe - // writer inserts. - val insertDF = readFromJSON(insertJsonData, StructType.fromDDL(insertSchemaDDL)) - insertDF.write.format("delta").saveAsTable("source") - - insert.runInsert( - columns = insertDF.schema.map(_.name), - whereCol = overwriteWhere._1, - whereValue = overwriteWhere._2 - ) - - val target = spark.read.table("target") - assert(target.schema === expectedSchema) - checkAnswer(target, insert.expectedResult(initialDF, insertDF)) - } - } - } - } - - testInsertTypeEvolution("top-level type evolution")( + testInserts("top-level type evolution")( initialSchemaDDL = "a int, b short", initialJsonData = Seq("""{ "a": 1, "b": 2 }"""), partitionBy = Seq("a"), overwriteWhere = "a" -> 1, insertSchemaDDL = "a int, b int", insertJsonData = Seq("""{ "a": 1, "b": 4 }"""), - expectedSchema = StructType(new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("a", IntegerType) .add("b", IntegerType, nullable = true, - metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType))) + metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType))), + excludeInserts = Seq(StreamingInsert) ) - testInsertTypeEvolution("top-level type evolution with column upcast")( + testInserts("top-level type evolution with column upcast")( initialSchemaDDL = "a int, b short, c int", initialJsonData = Seq("""{ "a": 1, "b": 2, "c": 3 }"""), partitionBy = Seq("a"), overwriteWhere = "a" -> 1, insertSchemaDDL = "a int, b int, c short", insertJsonData = Seq("""{ "a": 1, "b": 5, "c": 6 }"""), - expectedSchema = new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("a", IntegerType) .add("b", IntegerType, nullable = true, metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType)) - .add("c", IntegerType) + .add("c", IntegerType)), + excludeInserts = Seq(StreamingInsert) ) - testInsertTypeEvolution("top-level type evolution with schema evolution")( + testInserts("top-level type evolution with schema evolution")( initialSchemaDDL = "a int, b short", initialJsonData = Seq("""{ "a": 1, "b": 2 }"""), partitionBy = Seq("a"), overwriteWhere = "a" -> 1, insertSchemaDDL = "a int, b int, c int", insertJsonData = Seq("""{ "a": 1, "b": 4, "c": 5 }"""), - expectedSchema = new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("a", IntegerType) .add("b", IntegerType, nullable = true, metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType)) - .add("c", IntegerType), + .add("c", IntegerType)), // INSERT INTO/OVERWRITE (a, b) VALUES doesn't support schema evolution. 
excludeInserts = Seq( SQLInsertColList(SaveMode.Append), SQLInsertColList(SaveMode.Overwrite), - SQLInsertOverwritePartitionColList) + SQLInsertOverwritePartitionColList, + StreamingInsert) ) - testInsertTypeEvolution("nested type evolution by position")( + testInserts("nested type evolution by position")( initialSchemaDDL = "key int, s struct, m map, a array", initialJsonData = Seq("""{ "key": 1, "s": { "x": 1, "y": 2 }, "m": { "p": 3 }, "a": [4] }"""), @@ -417,7 +219,7 @@ trait TypeWideningInsertSchemaEvolutionTests extends TypeWideningTestCases { overwriteWhere = "key" -> 1, insertSchemaDDL = "key int, s struct, m map, a array", insertJsonData = Seq("""{ "key": 1, "s": { "x": 4, "y": 5 }, "m": { "p": 6 }, "a": [7] }"""), - expectedSchema = new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("key", IntegerType) .add("s", new StructType() .add("x", ShortType) @@ -434,11 +236,12 @@ trait TypeWideningInsertSchemaEvolutionTests extends TypeWideningTestCases { version = 1, from = ShortType, to = IntegerType, - path = Seq("element"))) + path = Seq("element")))), + excludeInserts = Seq(StreamingInsert) ) - testInsertTypeEvolution("nested type evolution with struct evolution by position")( + testInserts("nested type evolution with struct evolution by position")( initialSchemaDDL = "key int, s struct, m map, a array", initialJsonData = Seq("""{ "key": 1, "s": { "x": 1, "y": 2 }, "m": { "p": 3 }, "a": [4] }"""), @@ -448,7 +251,7 @@ trait TypeWideningInsertSchemaEvolutionTests extends TypeWideningTestCases { "key int, s struct, m map, a array", insertJsonData = Seq("""{ "key": 1, "s": { "x": 4, "y": 5, "z": 8 }, "m": { "p": 6 }, "a": [7] }"""), - expectedSchema = new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("key", IntegerType) .add("s", new StructType() .add("x", ShortType) @@ -466,75 +269,81 @@ trait TypeWideningInsertSchemaEvolutionTests extends TypeWideningTestCases { version = 1, from = ShortType, to = IntegerType, - path = Seq("element"))) + path = Seq("element")))), + excludeInserts = Seq(StreamingInsert) ) - testInsertTypeEvolution("nested struct type evolution with field upcast")( + testInserts("nested struct type evolution with field upcast")( initialSchemaDDL = "key int, s struct", initialJsonData = Seq("""{ "key": 1, "s": { "x": 1, "y": 2 } }"""), partitionBy = Seq("key"), overwriteWhere = "key" -> 1, insertSchemaDDL = "key int, s struct", insertJsonData = Seq("""{ "key": 1, "s": { "x": 4, "y": 5 } }"""), - expectedSchema = new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("key", IntegerType) .add("s", new StructType() .add("x", IntegerType) .add("y", IntegerType, nullable = true, - metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType))) + metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType)))), + excludeInserts = Seq(StreamingInsert) ) // Interestingly, we introduced a special case to handle schema evolution / casting for structs // directly nested into an array. This doesn't always work with maps or with elements that // aren't a struct (see other tests). 
- testInsertTypeEvolution("nested struct type evolution with field upcast in array")( + testInserts("nested struct type evolution with field upcast in array")( initialSchemaDDL = "key int, a array>", initialJsonData = Seq("""{ "key": 1, "a": [ { "x": 1, "y": 2 } ] }"""), partitionBy = Seq("key"), overwriteWhere = "key" -> 1, insertSchemaDDL = "key int, a array>", insertJsonData = Seq("""{ "key": 1, "a": [ { "x": 3, "y": 4 } ] }"""), - expectedSchema = new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("key", IntegerType) .add("a", ArrayType(new StructType() .add("x", IntegerType) .add("y", IntegerType, nullable = true, - metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType)))) + metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType))))), + excludeInserts = Seq(StreamingInsert) ) // The next two tests document inconsistencies when handling maps. Using SQL doesn't allow type // evolution but using the dataframe API does. - testInsertTypeEvolution("nested struct type evolution with field upcast in map")( + testInserts("nested struct type evolution with field upcast in map")( initialSchemaDDL = "key int, m map>", initialJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 1, "y": 2 } } }"""), partitionBy = Seq("key"), overwriteWhere = "key" -> 1, insertSchemaDDL = "key int, m map>", insertJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 3, "y": 4 } } }"""), - expectedSchema = new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("key", IntegerType) // Type evolution wasn't applied in the map. .add("m", MapType(StringType, new StructType() .add("x", IntegerType) - .add("y", ShortType))), + .add("y", ShortType)))), excludeInserts = Seq( DFv1SaveAsTable(SaveMode.Append), DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), DFv2Append, DFv2Overwrite, - DFv2OverwritePartition + DFv2OverwritePartition, + StreamingInsert ) ) - testInsertTypeEvolution("nested struct type evolution with field upcast in map")( + testInserts("nested struct type evolution with field upcast in map")( initialSchemaDDL = "key int, m map>", initialJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 1, "y": 2 } } }"""), partitionBy = Seq("key"), overwriteWhere = "key" -> 1, insertSchemaDDL = "key int, m map>", insertJsonData = Seq("""{ "key": 1, "m": { "a": { "x": 3, "y": 4 } } }"""), - expectedSchema = StructType(new StructType() + expectedResult = ExpectedResult.Success(expectedSchema = new StructType() .add("key", IntegerType) // Type evolution was applied in the map. .add("m", MapType(StringType, new StructType() @@ -544,6 +353,8 @@ trait TypeWideningInsertSchemaEvolutionTests extends TypeWideningTestCases { includeInserts = Seq( DFv1SaveAsTable(SaveMode.Append), DFv1SaveAsTable(SaveMode.Overwrite), + DFv1Save(SaveMode.Append), + DFv1Save(SaveMode.Overwrite), DFv2Append, DFv2Overwrite, DFv2OverwritePartition diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningStreamingSinkSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningStreamingSinkSuite.scala new file mode 100644 index 00000000000..de5fd6c36ec --- /dev/null +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningStreamingSinkSuite.scala @@ -0,0 +1,229 @@ +/* + * Copyright (2021) The Delta Lake Project Authors. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.delta.typewidening + +import org.apache.spark.sql.delta._ +import org.apache.spark.sql.delta.sources.{DeltaSink, DeltaSQLConf} + +import org.apache.spark.sql.Row +import org.apache.spark.sql.execution.streaming.StreamExecution +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.streaming.OutputMode +import org.apache.spark.sql.types._ + +/** + * Suite covering automatic type widening in the Delta streaming sink. + */ +class TypeWideningStreamingSinkSuite + extends DeltaSinkImplicitCastSuiteBase + with TypeWideningTestMixin { + + import testImplicits._ + + override def beforeAll(): Unit = { + super.beforeAll() + // Set by default confs to enable automatic type widening in all tests. Negative tests should + // explicitly disable these. + spark.conf.set(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS.key, "true") + spark.conf.set(DeltaConfigs.ENABLE_TYPE_WIDENING.defaultTablePropertyKey, "true") + spark.conf.set(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key, "true") + // Ensure we don't silently cast test inputs to null on overflow. + spark.conf.set(SQLConf.ANSI_ENABLED.key, "true") + } + + test("type isn't widened if schema evolution is disabled") { + withDeltaStream[Int] { stream => + stream.write(17)("CAST(value AS SHORT)") + assert(stream.currentSchema("value").dataType === ShortType) + checkAnswer(stream.read(), Row(17)) + + withSQLConf(DeltaSQLConf.DELTA_SCHEMA_AUTO_MIGRATE.key -> "false") { + stream.write(53)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === ShortType) + checkAnswer(stream.read(), Row(17) :: Row(53) :: Nil) + } + } + } + + test("type isn't widened if type widening is disabled") { + withDeltaStream[Int] { stream => + withSQLConf(DeltaConfigs.ENABLE_TYPE_WIDENING.defaultTablePropertyKey -> "false") { + stream.write(17)("CAST(value AS SHORT)") + assert(stream.currentSchema("value").dataType === ShortType) + checkAnswer(stream.read(), Row(17)) + + stream.write(53)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === ShortType) + checkAnswer(stream.read(), Row(17) :: Row(53) :: Nil) + } + } + } + + test("type is widened if type widening and schema evolution are enabled") { + withDeltaStream[Int] { stream => + stream.write(17)("CAST(value AS SHORT)") + assert(stream.currentSchema("value").dataType === ShortType) + checkAnswer(stream.read(), Row(17)) + + stream.write(Int.MaxValue)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17) :: Row(Int.MaxValue) :: Nil) + } + } + + test("type can be widened even if type casting is disabled in the sink") { + withDeltaStream[Int] { stream => + stream.write(17)("CAST(value AS SHORT)") + assert(stream.currentSchema("value").dataType === ShortType) + checkAnswer(stream.read(), Row(17)) + + withSQLConf(DeltaSQLConf.DELTA_STREAMING_SINK_ALLOW_IMPLICIT_CASTS.key -> "false") { + 
stream.write(Int.MaxValue)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17) :: Row(Int.MaxValue) :: Nil) + } + } + } + + test("type isn't changed if it's not a wider type") { + withDeltaStream[Int] { stream => + stream.write(Int.MaxValue)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(Int.MaxValue)) + + stream.write(17)("CAST(value AS SHORT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(Int.MaxValue) :: Row(17) :: Nil) + } + } + + test("type isn't changed if it's not eligible for automatic widening: int -> decimal") { + withDeltaStream[Int] { stream => + stream.write(17)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17)) + + stream.write(567)("CAST(value AS DECIMAL(20, 0))") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17) :: Row(567) :: Nil) + } + } + + test("type isn't changed if it's not eligible for automatic widening: int -> double") { + withDeltaStream[Int] { stream => + stream.write(17)("CAST(value AS INT)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17)) + + stream.write(567)("CAST(value AS DOUBLE)") + assert(stream.currentSchema("value").dataType === IntegerType) + checkAnswer(stream.read(), Row(17) :: Row(567) :: Nil) + } + } + + test("widen type and add a new column with schema evolution") { + withDeltaStream[(Int, Int)] { stream => + stream.write((17, -1))("CAST(_1 AS SHORT) AS a") + assert(stream.currentSchema === new StructType().add("a", ShortType)) + checkAnswer(stream.read(), Row(17)) + + stream.write((12, 3456))("CAST(_1 AS INT) AS a", "CAST(_2 AS DECIMAL(10, 2)) AS b") + assert(stream.currentSchema === new StructType() + .add("a", IntegerType, nullable = true, + metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType)) + .add("b", DecimalType(10, 2))) + checkAnswer(stream.read(), Row(17, null) :: Row(12, 3456) :: Nil) + } + } + + test("widen type during write with missing column") { + withDeltaStream[(Int, Int)] { stream => + stream.write((17, 45))("CAST(_1 AS SHORT) AS a", "CAST(_2 AS LONG) AS b") + assert(stream.currentSchema === new StructType() + .add("a", ShortType) + .add("b", LongType)) + checkAnswer(stream.read(), Row(17, 45)) + + stream.write((12, -1))("CAST(_1 AS INT) AS a") + assert(stream.currentSchema === new StructType() + .add("a", IntegerType, nullable = true, + metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType)) + .add("b", LongType)) + checkAnswer(stream.read(), Row(17, 45) :: Row(12, null) :: Nil) + } + } + + test("widen type after column rename and drop") { + withDeltaStream[(Int, Int)] { stream => + stream.write((17, 45))("CAST(_1 AS SHORT) AS a", "CAST(_2 AS DECIMAL(10, 0)) AS b") + assert(stream.currentSchema === new StructType() + .add("a", ShortType) + .add("b", DecimalType(10, 0))) + checkAnswer(stream.read(), Row(17, 45)) + + sql( + s""" + |ALTER TABLE delta.`${stream.deltaLog.dataPath}` SET TBLPROPERTIES ( + | 'delta.columnMapping.mode' = 'name', + | 'delta.minReaderVersion' = '2', + | 'delta.minWriterVersion' = '5' + |) + """.stripMargin) + sql(s"ALTER TABLE delta.`${stream.deltaLog.dataPath}` DROP COLUMN b") + sql(s"ALTER TABLE delta.`${stream.deltaLog.dataPath}` RENAME COLUMN a to c") + 
assert(stream.currentSchema === new StructType().add("c", ShortType)) + + stream.write((12, -1))("CAST(_1 AS INT) AS c") + assert(stream.currentSchema === new StructType().add("c", IntegerType, nullable = true, + metadata = typeWideningMetadata(version = 4, from = ShortType, to = IntegerType))) + checkAnswer(stream.read(), Row(17) :: Row(12) :: Nil) + } + } + + test("type widening in addBatch") { + withTempDir { tempDir => + val tablePath = tempDir.getAbsolutePath + val deltaLog = DeltaLog.forTable(spark, tablePath) + sqlContext.sparkContext.setLocalProperty(StreamExecution.QUERY_ID_KEY, "streaming_query") + val sink = DeltaSink( + sqlContext, + path = deltaLog.dataPath, + partitionColumns = Seq.empty, + outputMode = OutputMode.Append(), + options = new DeltaOptions(options = Map.empty, conf = spark.sessionState.conf) + ) + + val schema = new StructType().add("value", ShortType) + + { + val data = Seq(0, 1).toDF("value").selectExpr("CAST(value AS SHORT)") + sink.addBatch(0, data) + val df = spark.read.format("delta").load(tablePath) + assert(df.schema === schema) + checkAnswer(df, Row(0) :: Row(1) :: Nil) + } + { + val data = Seq(2, 3).toDF("value").selectExpr("CAST(value AS INT)") + sink.addBatch(1, data) + val df = spark.read.format("delta").load(tablePath) + assert(df.schema === new StructType().add("value", IntegerType, nullable = true, + metadata = typeWideningMetadata(version = 1, from = ShortType, to = IntegerType))) + checkAnswer(df, Row(0) :: Row(1) :: Row(2) :: Row(3) :: Nil) + } + } + } +} diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTableFeatureSuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTableFeatureSuite.scala index b4ede10f3dd..64805d26027 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTableFeatureSuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTableFeatureSuite.scala @@ -110,11 +110,11 @@ trait TypeWideningTableFeatureTests extends RowTrackingTestUtils with TypeWideni sql(s"CREATE TABLE delta.`$tempPath` (a int) USING DELTA " + s"TBLPROPERTIES ('${DeltaConfigs.ENABLE_TYPE_WIDENING.key}' = 'false')") checkError( - exception = intercept[SparkException] { + intercept[SparkException] { sql(s"ALTER TABLE delta.`$tempPath` " + s"SET TBLPROPERTIES ('${DeltaConfigs.ENABLE_TYPE_WIDENING.key}' = 'bla')") }, - errorClass = "_LEGACY_ERROR_TEMP_2045", + "_LEGACY_ERROR_TEMP_2045", parameters = Map( "message" -> "For input string: \"bla\"" ) @@ -128,10 +128,10 @@ trait TypeWideningTableFeatureTests extends RowTrackingTestUtils with TypeWideni s"TBLPROPERTIES ('${DeltaConfigs.ENABLE_TYPE_WIDENING.key}' = 'false')") checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"ALTER TABLE delta.`$tempPath` CHANGE COLUMN a TYPE SMALLINT") }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "a", "oldField" -> "TINYINT", @@ -147,10 +147,10 @@ trait TypeWideningTableFeatureTests extends RowTrackingTestUtils with TypeWideni s"SET TBLPROPERTIES ('${DeltaConfigs.ENABLE_TYPE_WIDENING.key}' = 'false')") checkError( - exception = intercept[AnalysisException] { + intercept[AnalysisException] { sql(s"ALTER TABLE delta.`$tempPath` CHANGE COLUMN a TYPE INT") }, - errorClass = "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", + "DELTA_UNSUPPORTED_ALTER_TABLE_CHANGE_COL_OP", parameters = Map( "fieldPath" -> "a", 
"oldField" -> "SMALLINT", @@ -192,12 +192,12 @@ trait TypeWideningTableFeatureTests extends RowTrackingTestUtils with TypeWideni val deltaLog = DeltaLog.forTable(spark, TableIdentifier(tableName, Some(databaseName))) checkError( - exception = intercept[DeltaTableFeatureException] { + intercept[DeltaTableFeatureException] { sql(s"ALTER TABLE $databaseName.$tableName " + s"DROP FEATURE '${TypeWideningPreviewTableFeature.name}'" ).collect() }, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> TypeWideningPreviewTableFeature.name, "logRetentionPeriodKey" -> DeltaConfigs.LOG_RETENTION.key, @@ -442,10 +442,10 @@ trait TypeWideningTableFeatureTests extends RowTrackingTestUtils with TypeWideni } checkError( - exception = intercept[DeltaIllegalStateException] { + intercept[DeltaIllegalStateException] { readDeltaTable(tempPath).collect() }, - errorClass = "DELTA_UNSUPPORTED_TYPE_CHANGE_IN_SCHEMA", + "DELTA_UNSUPPORTED_TYPE_CHANGE_IN_SCHEMA", parameters = Map( "fieldName" -> "a.element", "fromType" -> "INT", diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTestMixin.scala b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTestMixin.scala index b33bb3ad077..71ce4072162 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTestMixin.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/typewidening/TypeWideningTestMixin.scala @@ -150,8 +150,8 @@ trait TypeWideningDropFeatureTestMixin dropFeature.run(spark) case ExpectedOutcome.FAIL_CURRENT_VERSION_USES_FEATURE => checkError( - exception = intercept[DeltaTableFeatureException] { dropFeature.run(spark) }, - errorClass = "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", + intercept[DeltaTableFeatureException] { dropFeature.run(spark) }, + "DELTA_FEATURE_DROP_WAIT_FOR_RETENTION_PERIOD", parameters = Map( "feature" -> feature.name, "logRetentionPeriodKey" -> DeltaConfigs.LOG_RETENTION.key, @@ -163,8 +163,8 @@ trait TypeWideningDropFeatureTestMixin ) case ExpectedOutcome.FAIL_HISTORICAL_VERSION_USES_FEATURE => checkError( - exception = intercept[DeltaTableFeatureException] { dropFeature.run(spark) }, - errorClass = "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", + intercept[DeltaTableFeatureException] { dropFeature.run(spark) }, + "DELTA_FEATURE_DROP_HISTORICAL_VERSIONS_EXIST", parameters = Map( "feature" -> feature.name, "logRetentionPeriodKey" -> DeltaConfigs.LOG_RETENTION.key, @@ -176,8 +176,8 @@ trait TypeWideningDropFeatureTestMixin ) case ExpectedOutcome.FAIL_FEATURE_NOT_PRESENT => checkError( - exception = intercept[DeltaTableFeatureException] { dropFeature.run(spark) }, - errorClass = "DELTA_FEATURE_DROP_FEATURE_NOT_PRESENT", + intercept[DeltaTableFeatureException] { dropFeature.run(spark) }, + "DELTA_FEATURE_DROP_FEATURE_NOT_PRESENT", parameters = Map("feature" -> feature.name) ) } diff --git a/spark/src/test/scala/org/apache/spark/sql/delta/uniform/UniFormE2ESuite.scala b/spark/src/test/scala/org/apache/spark/sql/delta/uniform/UniFormE2ESuite.scala index b08c7192592..cd9fd6da3ff 100644 --- a/spark/src/test/scala/org/apache/spark/sql/delta/uniform/UniFormE2ESuite.scala +++ b/spark/src/test/scala/org/apache/spark/sql/delta/uniform/UniFormE2ESuite.scala @@ -37,6 +37,77 @@ abstract class UniFormE2EIcebergSuiteBase extends UniFormE2ETest { } } + test("Insert Partitioned Table") { + val partitionColumns = Array( + "str STRING", + "i INTEGER", + "l LONG", + "s SHORT", 
+ "b BYTE", + "dt DATE", + "bin BINARY", + "bool BOOLEAN", + "ts_ntz TIMESTAMP_NTZ", + "ts TIMESTAMP") + + val partitionValues: Array[Any] = Array( + "'some_value'", + 1, + 1234567L, + 1000, + 119, + "to_date('2016-12-31', 'yyyy-MM-dd')", + "'asdf'", + true, + "TIMESTAMP_NTZ'2021-12-06 00:00:00'", + "TIMESTAMP'2023-08-18 05:00:00UTC-7'" + ) + + partitionColumns zip partitionValues map { + partitionColumnsAndValues => + val partitionColumnName = + partitionColumnsAndValues._1.split(" ")(0) + val tableName = testTableName + "_" + partitionColumnName + withTable(tableName) { + write( + s"""CREATE TABLE $tableName (${partitionColumnsAndValues._1}, col1 INT) + | USING DELTA + | PARTITIONED BY ($partitionColumnName) + | TBLPROPERTIES ( + | 'delta.columnMapping.mode' = 'name', + | 'delta.enableIcebergCompatV2' = 'true', + | 'delta.universalFormat.enabledFormats' = 'iceberg' + |)""".stripMargin) + write(s"INSERT INTO $tableName VALUES (${partitionColumnsAndValues._2}, 123)") + val verificationQuery = s"SELECT col1 FROM $tableName " + + s"where ${partitionColumnName}=${partitionColumnsAndValues._2}" + // Verify against Delta read and Iceberg read + checkAnswer(spark.sql(verificationQuery), Seq(Row(123))) + checkAnswer(createReaderSparkSession.sql(verificationQuery), Seq(Row(123))) + } + } + } + + test("Insert Partitioned Table - Multiple Partitions") { + withTable(testTableName) { + write( + s"""CREATE TABLE $testTableName (id int, ts timestamp, col1 INT) + | USING DELTA + | PARTITIONED BY (id, ts) + | TBLPROPERTIES ( + | 'delta.columnMapping.mode' = 'name', + | 'delta.enableIcebergCompatV2' = 'true', + | 'delta.universalFormat.enabledFormats' = 'iceberg' + |)""".stripMargin) + write(s"INSERT INTO $testTableName VALUES (1, TIMESTAMP'2023-08-18 05:00:00UTC-7', 123)") + val verificationQuery = s"SELECT col1 FROM $testTableName " + + s"where id=1 and ts=TIMESTAMP'2023-08-18 05:00:00UTC-7'" + // Verify against Delta read and Iceberg read + checkAnswer(spark.sql(verificationQuery), Seq(Row(123))) + checkAnswer(createReaderSparkSession.sql(verificationQuery), Seq(Row(123))) + } + } + test("CIUD") { withTable(testTableName) { write( diff --git a/storage/src/main/java/io/delta/storage/S3SingleDriverLogStore.java b/storage/src/main/java/io/delta/storage/S3SingleDriverLogStore.java index a1c4a296d7b..36d5cd2877e 100644 --- a/storage/src/main/java/io/delta/storage/S3SingleDriverLogStore.java +++ b/storage/src/main/java/io/delta/storage/S3SingleDriverLogStore.java @@ -23,14 +23,8 @@ import java.net.URISyntaxException; import java.nio.charset.StandardCharsets; import java.util.*; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import com.google.common.cache.Cache; -import com.google.common.cache.CacheBuilder; import com.google.common.io.CountingOutputStream; -import io.delta.storage.internal.FileNameUtils; import io.delta.storage.internal.PathLock; import io.delta.storage.internal.S3LogStoreUtil; import org.apache.hadoop.conf.Configuration; @@ -41,12 +35,12 @@ import org.apache.hadoop.fs.RawLocalFileSystem; /** - * Single Spark-driver/JVM LogStore implementation for S3. + * Single JVM LogStore implementation for S3. *

 * We assume the following from S3's {@link FileSystem} implementations:
 * <ul>
 *   <li>File writing on S3 is all-or-nothing, whether overwrite or not.</li>
- *   <li>List-after-write can be inconsistent.</li>
+ *   <li>List-after-write is strongly consistent.</li>
 * </ul>
 * <p>
 * Regarding file creation, this implementation:
@@ -55,12 +49,6 @@
 *   <li>Failures during stream write may leak resources, but may never result in partial
 *       writes.</li>
 * </ul>
- * <p>
- * Regarding directory listing, this implementation:
- * <ul>
- *   <li>returns a list by merging the files listed from S3 and recently-written files from the
- *       cache.</li>
- * </ul>
    */ public class S3SingleDriverLogStore extends HadoopFileSystemLogStore { @@ -85,16 +73,6 @@ public class S3SingleDriverLogStore extends HadoopFileSystemLogStore { */ private static final PathLock pathLock = new PathLock(); - /** - * A global cache that records the metadata of the files recently written. - * As list-after-write may be inconsistent on S3, we can use the files in the cache - * to fix the inconsistent file listing. - */ - private static final Cache writtenPathCache = - CacheBuilder.newBuilder() - .expireAfterAccess(120, TimeUnit.MINUTES) - .build(); - ///////////////////////////////////////////// // Constructor and Instance Helper Methods // ///////////////////////////////////////////// @@ -103,13 +81,6 @@ public S3SingleDriverLogStore(Configuration hadoopConf) { super(hadoopConf); } - /** - * Check if the path is an initial version of a Delta log. - */ - private boolean isInitialVersion(Path path) { - return FileNameUtils.isDeltaFile(path) && FileNameUtils.deltaVersion(path) == 0L; - } - private Path resolvePath(FileSystem fs, Path path) { return stripUserInfo(fs.makeQualified(path)); } @@ -137,57 +108,6 @@ private Path stripUserInfo(Path path) { } } - /** - * Merge two lists of {@link FileStatus} into a single list ordered by file path name. - * In case both lists have {@link FileStatus}'s for the same file path, keep the one from - * `listWithPrecedence` and discard the other from `list`. - */ - private Iterator mergeFileLists( - List list, - List listWithPrecedence) { - final Map fileStatusMap = new HashMap<>(); - - // insert all elements from `listWithPrecedence` (highest priority) - // and then insert elements from `list` if and only if that key doesn't already exist - Stream.concat(listWithPrecedence.stream(), list.stream()) - .forEach(fs -> fileStatusMap.putIfAbsent(fs.getPath(), fs)); - - return fileStatusMap - .values() - .stream() - .sorted(Comparator.comparing(a -> a.getPath().getName())) - .iterator(); - } - - /** - * List files starting from `resolvedPath` (inclusive) in the same directory. - */ - private List listFromCache( - FileSystem fs, - Path resolvedPath) { - final Path pathKey = stripUserInfo(resolvedPath); - - return writtenPathCache - .asMap() - .entrySet() - .stream() - .filter(e -> { - final Path path = e.getKey(); - return path.getParent().equals(pathKey.getParent()) && - path.getName().compareTo(pathKey.getName()) >= 0; - }).map(e -> { - final Path path = e.getKey(); - final FileMetadata fileMetadata = e.getValue(); - return new FileStatus( - fileMetadata.length, - false, // isDir - 1, // block_replication - fs.getDefaultBlockSize(path), - fileMetadata.modificationTime, - path); - }).collect(Collectors.toList()); - } - /** * List files starting from `resolvedPath` (inclusive) in the same directory, which merges * the file system list and the cache list when `useCache` is on, otherwise @@ -195,8 +115,7 @@ private List listFromCache( */ private Iterator listFromInternal( FileSystem fs, - Path resolvedPath, - boolean useCache) throws IOException { + Path resolvedPath) throws IOException { final Path parentPath = resolvedPath.getParent(); if (!fs.exists(parentPath)) { throw new FileNotFoundException( @@ -214,30 +133,11 @@ private Iterator listFromInternal( statuses = S3LogStoreUtil.s3ListFromArray(fs, resolvedPath, parentPath); } - final List listedFromFs = Arrays + return Arrays .stream(statuses) .filter(s -> s.getPath().getName().compareTo(resolvedPath.getName()) >= 0) - .collect(Collectors.toList()); - - final List listedFromCache = useCache ? 
- listFromCache(fs, resolvedPath) : Collections.emptyList(); - - // File statuses listed from file system take precedence - return mergeFileLists(listedFromCache, listedFromFs); - } - - /** - * Check if a path exists. Normally we check both the file system and the cache, but when the - * path is the first version of a Delta log, we ignore the cache. - */ - private boolean exists( - FileSystem fs, - Path resolvedPath) throws IOException { - final boolean useCache = !isInitialVersion(resolvedPath); - final Iterator iter = listFromInternal(fs, resolvedPath, useCache); - if (!iter.hasNext()) return false; - - return iter.next().getPath().getName().equals(resolvedPath.getName()); + .sorted(Comparator.comparing(a -> a.getPath().getName())) + .iterator(); } //////////////////////// @@ -255,7 +155,7 @@ public void write( try { pathLock.acquire(resolvedPath); try { - if (exists(fs, resolvedPath) && !overwrite) { + if (fs.exists(resolvedPath) && !overwrite) { throw new java.nio.file.FileAlreadyExistsException( resolvedPath.toUri().toString() ); @@ -268,26 +168,6 @@ public void write( stream.write((actions.next() + "\n").getBytes(StandardCharsets.UTF_8)); } stream.close(); - - // When a Delta log starts afresh, all cached files in that Delta log become - // obsolete, so we remove them from the cache. - if (isInitialVersion(resolvedPath)) { - final List obsoleteFiles = writtenPathCache - .asMap() - .keySet() - .stream() - .filter(p -> p.getParent().equals(resolvedPath.getParent())) - .collect(Collectors.toList()); - - writtenPathCache.invalidateAll(obsoleteFiles); - } - - // Cache the information of written files to help fix the inconsistency in future - // listings - writtenPathCache.put( - resolvedPath, - new FileMetadata(stream.getCount(), System.currentTimeMillis()) - ); } catch (org.apache.hadoop.fs.FileAlreadyExistsException e) { // Convert Hadoop's FileAlreadyExistsException to Java's FileAlreadyExistsException throw new java.nio.file.FileAlreadyExistsException(e.getMessage()); @@ -303,28 +183,11 @@ public void write( public Iterator listFrom(Path path, Configuration hadoopConf) throws IOException { final FileSystem fs = path.getFileSystem(hadoopConf); final Path resolvedPath = resolvePath(fs, path); - return listFromInternal(fs, resolvedPath, true); // useCache=true + return listFromInternal(fs, resolvedPath); } @Override public Boolean isPartialWriteVisible(Path path, Configuration hadoopConf) { return false; } - - ////////////////// - // Helper Class // - ////////////////// - - /** - * The file metadata to be stored in the cache. 
- */ - private class FileMetadata { - private long length; - private long modificationTime; - - public FileMetadata(long length, long modificationTime) { - this.length = length; - this.modificationTime = modificationTime; - } - } } diff --git a/storage/src/main/java/io/delta/storage/commit/CommitCoordinatorClient.java b/storage/src/main/java/io/delta/storage/commit/CommitCoordinatorClient.java index 2c8a0223ce3..a7db7e8a23f 100644 --- a/storage/src/main/java/io/delta/storage/commit/CommitCoordinatorClient.java +++ b/storage/src/main/java/io/delta/storage/commit/CommitCoordinatorClient.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.util.Iterator; import java.util.Map; +import java.util.Optional; import io.delta.storage.commit.actions.AbstractMetadata; import io.delta.storage.commit.actions.AbstractProtocol; @@ -51,13 +52,15 @@ public interface CommitCoordinatorClient { * the upgrade commit needs to be a file system commit which will write the backfilled file * directly. * - * @param logPath The path to the delta log of the table that should be converted - * @param currentVersion The currentTableVersion is the version of the table just before - * conversion. currentTableVersion + 1 represents the commit that - * will do the conversion. This must be backfilled atomically. - * currentTableVersion + 2 represents the first commit after conversion. - * This will go through the CommitCoordinatorClient and the client is - * free to choose when it wants to backfill this commit. + * @param logPath The path to the delta log of the table that should be converted + * @param tableIdentifier The optional tableIdentifier for the table. Some commit coordinators may + * choose to make this compulsory and error out if this is not provided. + * @param currentVersion The currentTableVersion is the version of the table just before + * conversion. currentTableVersion + 1 represents the commit that + * will do the conversion. This must be backfilled atomically. + * currentTableVersion + 2 represents the first commit after conversion. + * This will go through the CommitCoordinatorClient and the client is + * free to choose when it wants to backfill this commit. * @param currentMetadata The metadata of the table at currentTableVersion * @param currentProtocol The protocol of the table at currentTableVersion * @return A map of key-value pairs which is issued by the commit coordinator to identify the @@ -67,6 +70,7 @@ public interface CommitCoordinatorClient { */ Map registerTable( Path logPath, + Optional tableIdentifier, long currentVersion, AbstractMetadata currentMetadata, AbstractProtocol currentProtocol); @@ -75,15 +79,13 @@ Map registerTable( * API to commit the given set of actions to the table represented by logPath at the * given commitVersion. * - * @param logStore The log store to use for writing the commit file. - * @param hadoopConf The Hadoop configuration required to access the file system. - * @param logPath The path to the delta log of the table that should be committed to. - * @param tableConf The table configuration that was returned by the commit coordinator - * client during registration. - * @param commitVersion The version of the commit that is being committed. - * @param actions The actions that need to be committed. - * @param updatedActions The commit info and any metadata or protocol changes that are made - * as part of this commit. + * @param logStore The log store to use for writing the commit file. 
+ * @param hadoopConf The Hadoop configuration required to access the file system. + * @param tableDescriptor The descriptor for the table. + * @param commitVersion The version of the commit that is being committed. + * @param actions The actions that need to be committed. + * @param updatedActions The commit info and any metadata or protocol changes that are made + * as part of this commit. * @return CommitResponse which contains the file status of the committed commit file. If the * commit is already backfilled, then the file status could be omitted from the response * and the client could retrieve the information by itself. @@ -92,8 +94,7 @@ Map registerTable( CommitResponse commit( LogStore logStore, Configuration hadoopConf, - Path logPath, - Map tableConf, + TableDescriptor tableDescriptor, long commitVersion, Iterator actions, UpdatedActions updatedActions) throws CommitFailedException; @@ -111,18 +112,14 @@ CommitResponse commit( * coordinator. Note that returning latestTableVersion as -1 is acceptable only if the commit * coordinator never ratified any version, i.e. it never accepted any unbackfilled commit. * - * @param logPath The path to the delta log of the table for which the unbackfilled - * commits should be retrieved. - * @param tableConf The table configuration that was returned by the commit coordinator - * during registration. - * @param startVersion The minimum version of the commit that should be returned. Can be null. - * @param endVersion The maximum version of the commit that should be returned. Can be null. - * @return GetCommitsResponse which has a list of {@link Commit}s and the latestTableVersion which is - * tracked by {@link CommitCoordinatorClient}. + * @param tableDescriptor The descriptor for the table. + * @param startVersion The minimum version of the commit that should be returned. Can be null. + * @param endVersion The maximum version of the commit that should be returned. Can be null. + * @return GetCommitsResponse which has a list of {@link Commit}s and the latestTableVersion which + * is tracked by {@link CommitCoordinatorClient}. */ GetCommitsResponse getCommits( - Path logPath, - Map tableConf, + TableDescriptor tableDescriptor, Long startVersion, Long endVersion); @@ -133,12 +130,11 @@ GetCommitsResponse getCommits( * If this API returns successfully, that means the backfill must have been completed, although * the commit coordinator may not be aware of it yet. * - * @param logStore The log store to use for writing the backfilled commits. - * @param hadoopConf The Hadoop configuration required to access the file system. - * @param logPath The path to the delta log of the table that should be backfilled. - * @param tableConf The table configuration that was returned by the commit coordinator - * during registration. - * @param version The version till which the commit coordinator client should backfill. + * @param logStore The log store to use for writing the backfilled commits. + * @param hadoopConf The Hadoop configuration required to access the file system. + * @param tableDescriptor The descriptor for the table. + * @param version The version till which the commit coordinator client should + * backfill. * @param lastKnownBackfilledVersion The last known version that was backfilled before this API * was called. 
If it is None or invalid, then the commit * coordinator client should backfill from the beginning of @@ -148,8 +144,7 @@ GetCommitsResponse getCommits( void backfillToVersion( LogStore logStore, Configuration hadoopConf, - Path logPath, - Map tableConf, + TableDescriptor tableDescriptor, long version, Long lastKnownBackfilledVersion) throws IOException; diff --git a/storage/src/main/java/io/delta/storage/commit/CommitFailedException.java b/storage/src/main/java/io/delta/storage/commit/CommitFailedException.java index d20653226b5..454f6c9048a 100644 --- a/storage/src/main/java/io/delta/storage/commit/CommitFailedException.java +++ b/storage/src/main/java/io/delta/storage/commit/CommitFailedException.java @@ -25,7 +25,7 @@ /** * Exception raised by - * {@link io.delta.storage.commit.CommitCoordinatorClient#commit(LogStore, Configuration, Path, Map, long, Iterator, UpdatedActions)} + * {@link CommitCoordinatorClient#commit(LogStore, Configuration, TableDescriptor, long, Iterator, UpdatedActions)} * *
      *  | retryable | conflict  | meaning                                                         |
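As a quick illustration of the retryable/conflict matrix documented above, here is a minimal Scala sketch (not part of this change) of how a caller might react to a CommitFailedException. The accessor getRetryable() is an assumed name; only the (retryable, conflict, message) constructor is visible elsewhere in this diff.

    import io.delta.storage.commit.CommitFailedException

    // Sketch only: bounded retry loop driven by the (assumed) getRetryable() accessor.
    def commitWithRetry(attempt: () => Unit, maxRetries: Int = 3): Unit = {
      var retriesLeft = maxRetries
      var done = false
      while (!done) {
        try {
          attempt()
          done = true
        } catch {
          case e: CommitFailedException if e.getRetryable && retriesLeft > 0 =>
            // Marked retryable per the matrix above: try again, up to maxRetries times.
            retriesLeft -= 1
          case e: CommitFailedException =>
            // Non-retryable: surface the failure to the caller.
            throw e
        }
      }
    }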
    diff --git a/storage/src/main/java/io/delta/storage/commit/CommitResponse.java b/storage/src/main/java/io/delta/storage/commit/CommitResponse.java
    index f23b73fd193..cc882d41902 100644
    --- a/storage/src/main/java/io/delta/storage/commit/CommitResponse.java
    +++ b/storage/src/main/java/io/delta/storage/commit/CommitResponse.java
    @@ -25,7 +25,7 @@
     
     /**
      * Response container for
    - * {@link io.delta.storage.commit.CommitCoordinatorClient#commit(LogStore, Configuration, Path, Map, long, Iterator, UpdatedActions)}.
    + * {@link CommitCoordinatorClient#commit(LogStore, Configuration, TableDescriptor, long, Iterator, UpdatedActions)}.
      */
     public class CommitResponse {
     
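A minimal sketch (not part of the diff) of the updated commit() call shape referenced above: the log path and the coordinator-issued table configuration no longer appear as separate parameters but travel inside a TableDescriptor, which is introduced later in this diff. All values passed in are assumed to exist in the caller's scope.

    import io.delta.storage.LogStore
    import io.delta.storage.commit.{CommitCoordinatorClient, CommitResponse, TableDescriptor, UpdatedActions}
    import org.apache.hadoop.conf.Configuration

    // Sketch only: commit `commitVersion` through a commit coordinator.
    def commitOnce(
        client: CommitCoordinatorClient,
        logStore: LogStore,
        hadoopConf: Configuration,
        tableDesc: TableDescriptor,
        commitVersion: Long,
        actions: java.util.Iterator[String],
        updatedActions: UpdatedActions): CommitResponse = {
      // logPath and tableConf are carried by tableDesc instead of being passed separately.
      client.commit(logStore, hadoopConf, tableDesc, commitVersion, actions, updatedActions)
    }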
    diff --git a/storage/src/test/scala/io/delta/storage/commit/CoordinatedCommitsUtils.java b/storage/src/main/java/io/delta/storage/commit/CoordinatedCommitsUtils.java
    similarity index 75%
    rename from storage/src/test/scala/io/delta/storage/commit/CoordinatedCommitsUtils.java
    rename to storage/src/main/java/io/delta/storage/commit/CoordinatedCommitsUtils.java
    index 144b6acec23..cdd03d5ed75 100644
    --- a/storage/src/test/scala/io/delta/storage/commit/CoordinatedCommitsUtils.java
    +++ b/storage/src/main/java/io/delta/storage/commit/CoordinatedCommitsUtils.java
    @@ -33,12 +33,24 @@ public class CoordinatedCommitsUtils {
         private CoordinatedCommitsUtils() {}
     
         /** The subdirectory in which to store the unbackfilled commit files. */
    -    final static String COMMIT_SUBDIR = "_commits";
    +    private static final String COMMIT_SUBDIR = "_commits";
     
    -    /** The configuration key for the coordinated commits owner. */
    -    private static final String COORDINATED_COMMITS_COORDINATOR_CONF_KEY =
    +    /** The configuration key for the coordinated commits owner name. */
    +    private static final String COORDINATED_COMMITS_COORDINATOR_NAME_KEY =
                 "delta.coordinatedCommits.commitCoordinator-preview";
     
    +    /**
    +     * Creates a new unbackfilled delta file path for the given commit version.
    +     * The path is of the form `tablePath/_delta_log/_commits/00000000000000000001.uuid.json`.
    +     */
    +    public static Path generateUnbackfilledDeltaFilePath(
    +            Path logPath,
    +            long version) {
    +        String uuid = UUID.randomUUID().toString();
    +        Path basePath = new Path(logPath, COMMIT_SUBDIR);
    +        return new Path(basePath, String.format("%020d.%s.json", version, uuid));
    +    }
    +
         /**
          * Returns the path to the backfilled delta file for the given commit version.
          * The path is of the form `tablePath/_delta_log/00000000000000000001.json`.
    @@ -56,9 +68,9 @@ public static boolean isCoordinatedCommitsToFSConversion(
                 Long commitVersion,
                 UpdatedActions updatedActions) {
             boolean oldMetadataHasCoordinatedCommits =
    -                !getCoordinator(updatedActions.getOldMetadata()).isEmpty();
    +                getCoordinatorName(updatedActions.getOldMetadata()).isPresent();
             boolean newMetadataHasCoordinatedCommits =
    -                !getCoordinator(updatedActions.getNewMetadata()).isEmpty();
    +                getCoordinatorName(updatedActions.getNewMetadata()).isPresent();
             return oldMetadataHasCoordinatedCommits && !newMetadataHasCoordinatedCommits && commitVersion > 0;
         }
     
    @@ -108,10 +120,17 @@ public static Path commitDirPath(Path logPath) {
             return new Path(logPath, COMMIT_SUBDIR);
         }
     
    -    private static String getCoordinator(AbstractMetadata metadata) {
    +    /**
    +     * Retrieves the coordinator name from the provided abstract metadata.
    +     * If no coordinator is set, an empty optional is returned.
    +     *
    +     * @param metadata The abstract metadata from which to retrieve the coordinator name.
    +     * @return The coordinator name if set, otherwise an empty optional.
    +     */
+    public static Optional<String> getCoordinatorName(AbstractMetadata metadata) {
             String coordinator = metadata
                     .getConfiguration()
    -                .get(COORDINATED_COMMITS_COORDINATOR_CONF_KEY);
    -        return coordinator != null ? coordinator : "";
    +                .get(COORDINATED_COMMITS_COORDINATOR_NAME_KEY);
    +        return Optional.ofNullable(coordinator);
         }
     }
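To make the new helper concrete, a small Scala sketch (illustrative bucket and table names) of the path shape produced by generateUnbackfilledDeltaFilePath:

    import org.apache.hadoop.fs.Path
    import io.delta.storage.commit.CoordinatedCommitsUtils

    object UnbackfilledPathExample {
      val logPath = new Path("s3://bucket/table/_delta_log")
      // Yields something like:
      //   s3://bucket/table/_delta_log/_commits/00000000000000000001.<random-uuid>.json
      val unbackfilled: Path =
        CoordinatedCommitsUtils.generateUnbackfilledDeltaFilePath(logPath, 1L)
    }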
    diff --git a/storage/src/main/java/io/delta/storage/commit/GetCommitsResponse.java b/storage/src/main/java/io/delta/storage/commit/GetCommitsResponse.java
    index 221b6c85ab4..490f52b26cb 100644
    --- a/storage/src/main/java/io/delta/storage/commit/GetCommitsResponse.java
    +++ b/storage/src/main/java/io/delta/storage/commit/GetCommitsResponse.java
    @@ -23,7 +23,7 @@
     
     /**
      * Response container for
    - * {@link io.delta.storage.commit.CommitCoordinatorClient#getCommits(Path, Map, Long, Long)}.
    + * {@link CommitCoordinatorClient#getCommits(TableDescriptor, Long, Long)}.
      */
     public class GetCommitsResponse {
     
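A sketch of the matching getCommits() call, which now takes the TableDescriptor rather than (logPath, tableConf). Per the interface javadoc earlier in this diff, startVersion and endVersion are nullable boxed Longs, where null means no bound; `client` and `tableDesc` are assumed to exist.

    import io.delta.storage.commit.{CommitCoordinatorClient, GetCommitsResponse, TableDescriptor}

    // Sketch only: fetch ratified commits from version 10 with no upper bound.
    def commitsFromTen(
        client: CommitCoordinatorClient,
        tableDesc: TableDescriptor): GetCommitsResponse = {
      client.getCommits(tableDesc, java.lang.Long.valueOf(10L), null)
    }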
    diff --git a/storage/src/main/java/io/delta/storage/commit/TableDescriptor.java b/storage/src/main/java/io/delta/storage/commit/TableDescriptor.java
    new file mode 100644
    index 00000000000..aa496855379
    --- /dev/null
    +++ b/storage/src/main/java/io/delta/storage/commit/TableDescriptor.java
    @@ -0,0 +1,52 @@
    +/*
    + * Copyright (2021) The Delta Lake Project Authors.
    + *
    + * Licensed under the Apache License, Version 2.0 (the "License");
    + * you may not use this file except in compliance with the License.
    + * You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package io.delta.storage.commit;
    +
    +import java.util.Arrays;
    +import java.util.Map;
    +import java.util.Optional;
    +
    +import org.apache.hadoop.fs.Path;
    +
    +/**
    + * Container for all the info to uniquely identify the table
    + */
    +public class TableDescriptor {
    +
    +    private Path logPath;
+    private Optional<TableIdentifier> tableIdentifier;
    +
+    private Map<String, String> tableConf;
    +
+    public TableDescriptor(Path logPath, Optional<TableIdentifier> tableIdentifier, Map<String, String> tableConf) {
    +        this.logPath = logPath;
    +        this.tableIdentifier = tableIdentifier;
    +        this.tableConf = tableConf;
    +    }
    +
+    public Optional<TableIdentifier> getTableIdentifier() {
    +        return tableIdentifier;
    +    }
    +
    +    public Path getLogPath() {
    +        return logPath;
    +    }
    +
+    public Map<String, String> getTableConf() {
    +        return tableConf;
    +    }
    +}
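A minimal sketch of how the pieces of a TableDescriptor are typically assembled: the table's log path, an optional TableIdentifier, and the coordinator-issued configuration returned by registerTable. The catalog/schema/table names are illustrative, and currentVersion = -1 assumes a brand-new table whose first commit will be version 0 (which must still go through the filesystem).

    import java.util.Optional
    import io.delta.storage.commit.{CommitCoordinatorClient, TableDescriptor, TableIdentifier}
    import io.delta.storage.commit.actions.{AbstractMetadata, AbstractProtocol}
    import org.apache.hadoop.fs.Path

    // Sketch only: register a new table and build its descriptor for later calls.
    def registerAndDescribe(
        client: CommitCoordinatorClient,
        logPath: Path,
        metadata: AbstractMetadata,
        protocol: AbstractProtocol): TableDescriptor = {
      val tableId = Optional.of(new TableIdentifier(Array("catalog", "schema"), "events"))
      // -1 assumes a brand-new table: version 0 will be the first (filesystem) commit.
      val tableConf = client.registerTable(logPath, tableId, -1L, metadata, protocol)
      new TableDescriptor(logPath, tableId, tableConf)
    }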
    diff --git a/storage/src/main/java/io/delta/storage/commit/TableIdentifier.java b/storage/src/main/java/io/delta/storage/commit/TableIdentifier.java
    new file mode 100644
    index 00000000000..9e58471165b
    --- /dev/null
    +++ b/storage/src/main/java/io/delta/storage/commit/TableIdentifier.java
    @@ -0,0 +1,62 @@
    +/*
    + * Copyright (2021) The Delta Lake Project Authors.
    + *
    + * Licensed under the Apache License, Version 2.0 (the "License");
    + * you may not use this file except in compliance with the License.
    + * You may obtain a copy of the License at
    + *
    + * http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package io.delta.storage.commit;
    +
    +/**
    + * Identifier for a table.
    + */
    +public class TableIdentifier {
    +
    +    // The name of the table.
    +    private String name;
    +
+    // The namespace of the table. e.g. <catalog> / <schema>
    +    private String[] namespace;
    +
    +    public TableIdentifier(String[] namespace, String name) {
    +        this.namespace = namespace;
    +        this.name = name;
    +    }
    +
    +    public TableIdentifier(String firstName, String... rest) {
    +        String[] namespace = new String[rest.length];
    +        String name;
    +        if (rest.length > 0) {
    +            name = rest[rest.length-1];
    +            namespace[0] = firstName;
    +            System.arraycopy(rest, 0, namespace, 1, rest.length-1);
    +        } else {
    +            name = firstName;
    +        }
    +        this.namespace = namespace;
    +        this.name = name;
    +    }
    +
    +    /**
    +     * Returns the namespace of the table.
    +     */
    +    public String[] getNamespace() {
    +        return namespace;
    +    }
    +
    +    /**
    +     * Returns the name of the table.
    +     */
    +    public String getName() {
    +        return name;
    +    }
    +}
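A small sketch showing that the two constructors above agree for a fully qualified name (values illustrative):

    import io.delta.storage.commit.TableIdentifier

    object TableIdentifierExample {
      // Both forms produce namespace = ["catalog", "schema"] and name = "events".
      val byArray   = new TableIdentifier(Array("catalog", "schema"), "events")
      val byVarargs = new TableIdentifier("catalog", "schema", "events")
      assert(byArray.getName == byVarargs.getName)
      assert(byArray.getNamespace.sameElements(byVarargs.getNamespace))
    }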
    diff --git a/storage/src/test/scala/io/delta/storage/commit/InMemoryCommitCoordinator.scala b/storage/src/test/scala/io/delta/storage/commit/InMemoryCommitCoordinator.scala
    index 294ee926a93..551969a8d4e 100644
    --- a/storage/src/test/scala/io/delta/storage/commit/InMemoryCommitCoordinator.scala
    +++ b/storage/src/test/scala/io/delta/storage/commit/InMemoryCommitCoordinator.scala
    @@ -15,12 +15,14 @@
      */
     package io.delta.storage.commit
     
    +import java.lang.{Long => JLong}
     import java.nio.file.FileAlreadyExistsException
    -import java.{lang, util}
    -import java.util._
    +import java.util.{ArrayList, Collections, Iterator => JIterator, Map => JMap, Optional, TreeMap, UUID}
     import java.util.concurrent.ConcurrentHashMap
     import java.util.concurrent.locks.ReentrantReadWriteLock
    +
     import io.delta.storage.LogStore
    +import io.delta.storage.commit.CoordinatedCommitsUtils
     import io.delta.storage.commit.actions.AbstractMetadata
     import io.delta.storage.commit.actions.AbstractProtocol
     import org.apache.hadoop.conf.Configuration
    @@ -62,7 +64,7 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
         def lastRatifiedCommitVersion: Long = if (!active) -1 else maxCommitVersion
     
         // Map from version to Commit data
    -    val commitsMap: util.TreeMap[Long, Commit] = new util.TreeMap[Long, Commit]
    +    val commitsMap: TreeMap[Long, Commit] = new TreeMap[Long, Commit]
         // We maintain maxCommitVersion explicitly since commitsMap might be empty
         // if all commits for a table have been backfilled.
         val lock: ReentrantReadWriteLock = new ReentrantReadWriteLock()
    @@ -71,10 +73,11 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
       private[commit] val perTableMap = new ConcurrentHashMap[String, PerTableData]()
     
       override def registerTable(
    -    logPath: Path,
    -    currentVersion: Long,
    -    currentMetadata: AbstractMetadata,
    -    currentProtocol: AbstractProtocol): util.Map[String, String] = {
    +      logPath: Path,
    +      tableIdentifier: Optional[TableIdentifier],
    +      currentVersion: Long,
    +      currentMetadata: AbstractMetadata,
    +      currentProtocol: AbstractProtocol): JMap[String, String] = {
         val newPerTableData = new PerTableData(currentVersion + 1)
         perTableMap.compute(logPath.toString, (_, existingData) => {
           if (existingData != null) {
    @@ -97,13 +100,13 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
       }
     
       override def commit(
    -    logStore: LogStore,
    -    hadoopConf: Configuration,
    -    logPath: Path,
    -    coordinatedCommitsTableConf: util.Map[String, String],
    -    commitVersion: Long,
    -    actions: util.Iterator[String],
    -    updatedActions: UpdatedActions): CommitResponse = {
    +      logStore: LogStore,
    +      hadoopConf: Configuration,
    +      tableDesc: TableDescriptor,
    +      commitVersion: Long,
    +      actions: JIterator[String],
    +      updatedActions: UpdatedActions): CommitResponse = {
    +    val logPath = tableDesc.getLogPath
         val tablePath = CoordinatedCommitsUtils.getTablePath(logPath)
         if (commitVersion == 0) {
           throw new CommitFailedException(false, false, "Commit version 0 must go via filesystem.")
    @@ -116,8 +119,7 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
             "Making sure commits are backfilled until {}" + " version for table {}",
             commitVersion - 1,
             tablePath)
    -      backfillToVersion(
    -        logStore, hadoopConf, logPath, coordinatedCommitsTableConf, commitVersion - 1, null)
    +      backfillToVersion(logStore, hadoopConf, tableDesc, commitVersion - 1, null)
         }
         // Write new commit file in _commits directory
         val fileStatus = CoordinatedCommitsUtils.writeCommitFile(
    @@ -142,24 +144,18 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
             "Making sure commits are backfilled till {} version for table {}",
             commitVersion,
             tablePath)
    -      backfillToVersion(
    -        logStore,
    -        hadoopConf,
    -        logPath,
    -        coordinatedCommitsTableConf,
    -        commitVersion,
    -        null)
    +      backfillToVersion(logStore, hadoopConf, tableDesc, commitVersion, null)
         }
         logger.info("Commit {} done successfully on table {}", commitVersion, tablePath)
         commitResponse
       }
     
       override def getCommits(
    -    logPath: Path,
    -    coordinatedCommitsTableConf: util.Map[String, String],
    -    startVersion: lang.Long,
    -    endVersion: lang.Long): GetCommitsResponse = withReadLock[GetCommitsResponse](logPath) {
    -    val tableData = perTableMap.get(logPath.toString)
    +      tableDesc: TableDescriptor,
    +      startVersion: JLong,
    +      endVersion: JLong)
    +      : GetCommitsResponse = withReadLock[GetCommitsResponse](tableDesc.getLogPath) {
    +    val tableData = perTableMap.get(tableDesc.getLogPath.toString)
         val startVersionOpt = Optional.ofNullable(startVersion)
         val endVersionOpt = Optional.ofNullable(endVersion)
         val effectiveStartVersion = startVersionOpt.orElse(0L)
    @@ -170,16 +166,16 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
           else tableData.commitsMap.lastKey)
         val commitsInRange = tableData.commitsMap.subMap(effectiveStartVersion, effectiveEndVersion + 1)
         new GetCommitsResponse(
    -      new util.ArrayList[Commit](commitsInRange.values), tableData.lastRatifiedCommitVersion)
    +      new ArrayList[Commit](commitsInRange.values), tableData.lastRatifiedCommitVersion)
       }
     
       override def backfillToVersion(
    -    logStore: LogStore,
    -    hadoopConf: Configuration,
    -    logPath: Path,
    -    coordinatedCommitsTableConf: util.Map[String, String],
    -    version: Long,
    -    lastKnownBackfilledVersion: lang.Long): Unit = {
    +      logStore: LogStore,
    +      hadoopConf: Configuration,
    +      tableDesc: TableDescriptor,
    +      version: Long,
    +      lastKnownBackfilledVersion: JLong): Unit = {
    +    val logPath = tableDesc.getLogPath
         // Confirm the last backfilled version by checking the backfilled delta file's existence.
         var validLastKnownBackfilledVersion = lastKnownBackfilledVersion
         if (lastKnownBackfilledVersion != null) {
    @@ -188,9 +184,9 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
             validLastKnownBackfilledVersion = null
           }
         }
    -    var startVersion: lang.Long = null
    +    var startVersion: JLong = null
         if (validLastKnownBackfilledVersion != null) startVersion = validLastKnownBackfilledVersion + 1
    -    val commitsResponse = getCommits(logPath, coordinatedCommitsTableConf, startVersion, version)
    +    val commitsResponse = getCommits(tableDesc, startVersion, version)
         commitsResponse.getCommits.forEach((commit: Commit) => {
           backfill(logStore, hadoopConf, logPath, commit.getVersion, commit.getFileStatus)
         })
    @@ -200,11 +196,11 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
     
       /** Backfills a given `fileStatus` to `version`.json */
       protected def backfill(
    -    logStore: LogStore,
    -    hadoopConf: Configuration,
    -    logPath: Path,
    -    version: Long,
    -    fileStatus: FileStatus): Unit = {
    +      logStore: LogStore,
    +      hadoopConf: Configuration,
    +      logPath: Path,
    +      version: Long,
    +      fileStatus: FileStatus): Unit = {
         val targetFile = CoordinatedCommitsUtils.getBackfilledDeltaFilePath(logPath, version)
         logger.info("Backfilling commit " + fileStatus.getPath + " to " + targetFile)
         val commitContentIterator = logStore.read(fileStatus.getPath, hadoopConf)
    @@ -220,10 +216,10 @@ class InMemoryCommitCoordinator(val batchSize: Long) extends CommitCoordinatorCl
       protected def generateUUID(): String = UUID.randomUUID().toString
     
       private def addToMap(
    -    logPath: Path,
    -    commitVersion: Long,
    -    commitFile: FileStatus,
    -    commitTimestamp: Long): CommitResponse = withWriteLock[CommitResponse](logPath) {
    +      logPath: Path,
    +      commitVersion: Long,
    +      commitFile: FileStatus,
    +      commitTimestamp: Long): CommitResponse = withWriteLock[CommitResponse](logPath) {
         val tableData = perTableMap.get(logPath.toString)
         val expectedVersion = tableData.maxCommitVersion + 1
         if (commitVersion != expectedVersion) {