Skip to content

Commit

Permalink
Do not split commits that contain DVs in CDF streaming
Browse files Browse the repository at this point in the history
Cherry-pick of 9f70ee5 for branch-2.4.

- When the add and remove file entries for a commit that contains DVs get split into different batches, we don't recognise that they belong together, and so we fail to produce CDF by calculating the diff of the two DVs.
- This PR prevents splitting these commits across batches, just as we already do for AddCDCFile actions to keep the update_{pre/post}_image information together.

GitOrigin-RevId: 11438c01ecc69f7c55c3a8826fa540f6b984e4c4
  • Loading branch information
larsk-db authored Aug 22, 2023
1 parent de19540 commit 9ef19f1
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1033,12 +1033,12 @@ case class DeltaSource(
var commitProcessedInBatch = false

/**
* This overloaded method checks if all the AddCDCFiles for a commit can be accommodated by
* This overloaded method checks if all the FileActions for a commit can be accommodated by
* the rate limit.
*/
def admit(fileActions: Seq[AddCDCFile]): Boolean = {
def getSize(actions: Seq[AddCDCFile]): Long = {
actions.foldLeft(0L) { (l, r) => l + r.size }
def admit(fileActions: Seq[FileAction]): Boolean = {
def getSize(actions: Seq[FileAction]): Long = {
actions.foldLeft(0L) { (l, r) => l + r.getFileSize }
}
if (fileActions.isEmpty) {
true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,31 @@ trait DeltaSourceCDCSupport { self: DeltaSource =>
// We allow entries with no file actions and index as [[DeltaSourceOffset.BASE_INDEX]]
// that are used primarily to update latest offset when no other
// file action based entries are present.
fileActions.filter(indexedFile => hasAddsOrRemoves(indexedFile) ||
hasNoFileActionAndStartIndex(indexedFile))
.filter(
isValidIndexedFile(_, fromVersion, fromIndex, endOffset)
).takeWhile { indexedFile =>
admissionControl.admit(Option(indexedFile.getFileAction))
}.toIterator
val filteredFiles = fileActions
.filter { indexedFile =>
hasAddsOrRemoves(indexedFile) || hasNoFileActionAndStartIndex(indexedFile)
}
.filter(isValidIndexedFile(_, fromVersion, fromIndex, endOffset))
val filteredFileActions = filteredFiles.flatMap(f => Option(f.getFileAction))
val hasDeletionVectors = filteredFileActions.exists {
case add: AddFile => add.deletionVector != null
case remove: RemoveFile => remove.deletionVector != null
case _ => false
}
if (hasDeletionVectors) {
// We cannot split up add/remove pairs with Deletion Vectors, because we will get the
// wrong result.
// So in this case we behave as above with CDC files and either admit all or none.
if (admissionControl.admit(filteredFileActions)) {
filteredFiles.toIterator
} else {
Iterator()
}
} else {
filteredFiles.takeWhile { indexedFile =>
admissionControl.admit(Option(indexedFile.getFileAction))
}.toIterator
}
}
}
}
Expand Down Expand Up @@ -279,7 +297,6 @@ trait DeltaSourceCDCSupport { self: DeltaSource =>
} else {
filterAndIndexDeltaLogs(fromVersion)
}

// In this case, filterFiles will consume the available capacity. We use takeWhile
// to stop the iteration when we reach the limit which will save us from reading
// unnecessary log files.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1006,6 +1006,40 @@ trait DeltaCDCStreamSuiteBase extends StreamTest with DeltaSQLCommandTest
}
}
}

// LC-1281: Regression test — even when rate limiting forces single-file batches, CDF must
// remain correct for commits whose output depends on comparing multiple file actions from
// the same commit (e.g. persistent deletion vectors, where two DVs must be diffed).
test("double delete-only on the same file") {
  withTempDir { dir =>
    val path = dir.toString
    // numPartitions = 1 keeps all rows in a single file, so both DELETEs below target
    // the same underlying file.
    spark.range(start = 0L, end = 10L, step = 1L, numPartitions = 1).toDF("id")
      .write.format("delta").save(path)

    // Two delete-only commits (table versions 1 and 2) hitting the same file.
    spark.sql(s"DELETE FROM delta.`$path` WHERE id IN (1, 3, 6)")
    spark.sql(s"DELETE FROM delta.`$path` WHERE id IN (2, 4, 7)")

    // Read CDF as a stream with an aggressive limit of one file per trigger, which would
    // split each commit's add/remove pair into separate batches without the no-split
    // safeguard under test.
    val cdfStream = spark.readStream
      .format("delta")
      .option(DeltaOptions.CDC_READ_OPTION, true)
      .option(DeltaOptions.MAX_FILES_PER_TRIGGER_OPTION, 1)
      .option(DeltaOptions.STARTING_VERSION_OPTION, 1)
      .load(path)
      .drop(CDCReader.CDC_COMMIT_TIMESTAMP)

    testStream(cdfStream)(
      ProcessAllAvailable(),
      // Rows are (id, _change_type, _commit_version) for every deleted id.
      CheckAnswer(
        (1L, "delete", 1L),
        (3L, "delete", 1L),
        (6L, "delete", 1L),
        (2L, "delete", 2L),
        (4L, "delete", 2L),
        (7L, "delete", 2L)
      )
    )
  }
}
}

class DeltaCDCStreamDeletionVectorSuite extends DeltaCDCStreamSuite
Expand Down

0 comments on commit 9ef19f1

Please sign in to comment.