49 | 49 | import org.apache.iceberg.exceptions.ValidationException;
50 | 50 | import org.apache.iceberg.hadoop.HiddenPathFilter;
51 | 51 | import org.apache.iceberg.io.BulkDeletionFailureException;
| 52 | +import org.apache.iceberg.io.FileIO;
52 | 53 | import org.apache.iceberg.io.SupportsBulkOperations;
53 | 54 | import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
54 | 55 | import org.apache.iceberg.relocated.com.google.common.base.Preconditions;

60 | 61 | import org.apache.iceberg.spark.JobGroupInfo;
61 | 62 | import org.apache.iceberg.util.Pair;
62 | 63 | import org.apache.iceberg.util.PropertyUtil;
| 64 | +import org.apache.iceberg.util.SerializableConsumer;
63 | 65 | import org.apache.iceberg.util.Tasks;
64 | 66 | import org.apache.spark.api.java.JavaRDD;
65 | 67 | import org.apache.spark.api.java.function.FlatMapFunction;
| 68 | +import org.apache.spark.api.java.function.ForeachPartitionFunction;
66 | 69 | import org.apache.spark.api.java.function.MapPartitionsFunction;
67 | 70 | import org.apache.spark.broadcast.Broadcast;
68 | 71 | import org.apache.spark.sql.Column;
@@ -121,6 +124,9 @@ public class DeleteOrphanFilesSparkAction extends BaseSparkAction<DeleteOrphanFi
|
121 | 124 | private Dataset<Row> compareToFileList;
|
122 | 125 | private Consumer<String> deleteFunc = null;
|
123 | 126 | private ExecutorService deleteExecutorService = null;
|
| 127 | + private SetAccumulator<Pair<String, String>> conflicts = new SetAccumulator<>(); |
| 128 | + private Dataset<String> orphanFileDF = null; |
| 129 | + private boolean includeFilePathsInResults = true; |
124 | 130 |
|
125 | 131 | DeleteOrphanFilesSparkAction(SparkSession spark, Table table) {
|
126 | 132 | super(spark);
|
@@ -185,6 +191,10 @@ public DeleteOrphanFilesSparkAction deleteWith(Consumer<String> newDeleteFunc) {
|
185 | 191 | return this;
|
186 | 192 | }
|
187 | 193 |
|
| 194 | + public void includeFilePathsInResult(boolean include) { |
| 195 | + this.includeFilePathsInResults = include; |
| 196 | + } |
| 197 | + |
188 | 198 | public DeleteOrphanFilesSparkAction compareToFileList(Dataset<Row> files) {
|
189 | 199 | StructType schema = files.schema();
|
190 | 200 |
|
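Note (not part of this change): a minimal caller sketch of how the new `includeFilePathsInResult(boolean)` setter might be used to skip collecting every orphan path on the driver. The `orphanFileLocationsCount()` accessor on the Result is assumed to be added alongside this change.

```java
import org.apache.iceberg.Table;
import org.apache.iceberg.actions.DeleteOrphanFiles;
import org.apache.iceberg.spark.actions.DeleteOrphanFilesSparkAction;
import org.apache.iceberg.spark.actions.SparkActions;

class OrphanCleanupExample {
  // Runs orphan-file cleanup but only reports how many files were removed.
  static long deleteAndCount(Table table) {
    DeleteOrphanFilesSparkAction action = SparkActions.get().deleteOrphanFiles(table);
    action.includeFilePathsInResult(false); // new setter; returns void, so it is not chained
    DeleteOrphanFiles.Result result = action.execute();
    // orphanFileLocationsCount() is assumed to exist on Result as part of this change
    return result.orphanFileLocationsCount();
  }
}
```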
@@ -233,46 +243,87 @@ private String jobDesc() {
|
233 | 243 | return String.format("Deleting orphan files (%s) from %s", optionsAsString, table.name());
|
234 | 244 | }
|
235 | 245 |
|
236 |
| - private void deleteFiles(SupportsBulkOperations io, List<String> paths) { |
237 |
| - try { |
238 |
| - io.deleteFiles(paths); |
239 |
| - LOG.info("Deleted {} files using bulk deletes", paths.size()); |
240 |
| - } catch (BulkDeletionFailureException e) { |
241 |
| - int deletedFilesCount = paths.size() - e.numberFailedObjects(); |
242 |
| - LOG.warn("Deleted only {} of {} files using bulk deletes", deletedFilesCount, paths.size()); |
243 |
| - } |
| 246 | + private SerializableConsumer<Iterator<String>> bulkDeleteFiles(SupportsBulkOperations io) { |
| 247 | + return stringIterator -> { |
| 248 | + List<String> paths = Lists.newArrayList(stringIterator); |
| 249 | + try { |
| 250 | + io.deleteFiles(paths); |
| 251 | + } catch (BulkDeletionFailureException e) { |
| 252 | + int deletedFilesCount = paths.size() - e.numberFailedObjects(); |
| 253 | + LOG.warn("Deleted only {} of {} files using bulk deletes", deletedFilesCount, paths.size()); |
| 254 | + } |
| 255 | + }; |
244 | 256 | }
|
245 | 257 |
|
246 |
| - private DeleteOrphanFiles.Result doExecute() { |
| 258 | + private Result doExecute() { |
247 | 259 | Dataset<FileURI> actualFileIdentDS = actualFileIdentDS();
|
248 | 260 | Dataset<FileURI> validFileIdentDS = validFileIdentDS();
|
| 261 | + Dataset<String> orphanFiles = |
| 262 | + findOrphanFiles(spark(), actualFileIdentDS, validFileIdentDS).coalesce(10); |
249 | 263 |
|
250 |
| - List<String> orphanFiles = |
251 |
| - findOrphanFiles(spark(), actualFileIdentDS, validFileIdentDS, prefixMismatchMode); |
| 264 | + // to materialize the dataframe to populate the accumulator |
| 265 | + orphanFiles.cache(); |
| 266 | + orphanFiles.count(); |
| 267 | + if (prefixMismatchMode == PrefixMismatchMode.ERROR && !conflicts.value().isEmpty()) { |
| 268 | + throw new ValidationException( |
| 269 | + "Unable to determine whether certain files are orphan. " |
| 270 | + + "Metadata references files that match listed/provided files except for authority/scheme. " |
| 271 | + + "Please, inspect the conflicting authorities/schemes and provide which of them are equal " |
| 272 | + + "by further configuring the action via equalSchemes() and equalAuthorities() methods. " |
| 273 | + + "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting " |
| 274 | + + "authorities/schemes or to 'DELETE' iff you are ABSOLUTELY confident that remaining conflicting " |
| 275 | + + "authorities/schemes are different. It will be impossible to recover deleted files. " |
| 276 | + + "Conflicting authorities/schemes: %s.", |
| 277 | + conflicts.value()); |
| 278 | + } |
252 | 279 |
|
253 |
| - if (deleteFunc == null && table.io() instanceof SupportsBulkOperations) { |
254 |
| - deleteFiles((SupportsBulkOperations) table.io(), orphanFiles); |
| 280 | + FileIO io = table.io(); |
| 281 | + if (deleteFunc == null && io instanceof SupportsBulkOperations) { |
| 282 | + Consumer<Iterator<String>> iteratorConsumer = bulkDeleteFiles((SupportsBulkOperations) io); |
| 283 | + orphanFiles.foreachPartition((ForeachPartitionFunction<String>) iteratorConsumer::accept); |
255 | 284 | } else {
|
256 |
| - |
257 |
| - Tasks.Builder<String> deleteTasks = |
258 |
| - Tasks.foreach(orphanFiles) |
259 |
| - .noRetry() |
260 |
| - .executeWith(deleteExecutorService) |
261 |
| - .suppressFailureWhenFinished() |
262 |
| - .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc)); |
263 |
| - |
264 |
| - if (deleteFunc == null) { |
265 |
| - LOG.info( |
266 |
| - "Table IO {} does not support bulk operations. Using non-bulk deletes.", |
267 |
| - table.io().getClass().getName()); |
268 |
| - deleteTasks.run(table.io()::deleteFile); |
| 285 | + Consumer<String> deleteFunction = nonBulkDeleteFiles(); |
| 286 | + if (deleteFunction instanceof SerializableConsumer && deleteExecutorService == null) { |
| 287 | + orphanFiles.foreachPartition( |
| 288 | + (ForeachPartitionFunction<String>) |
| 289 | + iterator -> { |
| 290 | + while (iterator.hasNext()) { |
| 291 | + String next = iterator.next(); |
| 292 | + deleteFunction.accept(next); |
| 293 | + } |
| 294 | + }); |
269 | 295 | } else {
|
270 |
| - LOG.info("Custom delete function provided. Using non-bulk deletes"); |
271 |
| - deleteTasks.run(deleteFunc::accept); |
| 296 | + List<String> filesToDelete = orphanFiles.collectAsList(); |
| 297 | + Tasks.Builder<String> deleteTasks = |
| 298 | + Tasks.foreach(filesToDelete) |
| 299 | + .noRetry() |
| 300 | + .executeWith(deleteExecutorService) |
| 301 | + .suppressFailureWhenFinished() |
| 302 | + .onFailure((file, exc) -> LOG.warn("Failed to delete file: {}", file, exc)); |
| 303 | + deleteTasks.run(deleteFunction::accept); |
272 | 304 | }
|
273 | 305 | }
|
| 306 | + return results(orphanFiles); |
| 307 | + } |
| 308 | + |
| 309 | + private Consumer<String> nonBulkDeleteFiles() { |
| 310 | + FileIO io = table.io(); |
| 311 | + if (deleteFunc == null) { |
| 312 | + LOG.info( |
| 313 | + "Table IO {} does not support bulk operations. Using non-bulk deletes.", |
| 314 | + io.getClass().getName()); |
| 315 | + return (SerializableConsumer<String>) io::deleteFile; |
| 316 | + } else { |
| 317 | + LOG.info("Custom delete function provided. Using non-bulk deletes"); |
| 318 | + return deleteFunc; |
| 319 | + } |
| 320 | + } |
274 | 321 |
|
275 |
| - return ImmutableDeleteOrphanFiles.Result.builder().orphanFileLocations(orphanFiles).build(); |
| 322 | + Result results(Dataset<String> df) { |
| 323 | + return ImmutableDeleteOrphanFiles.Result.builder() |
| 324 | + .orphanFileLocations(includeFilePathsInResults ? df.collectAsList() : Lists.newArrayList()) |
| 325 | + .orphanFileLocationsCount(df.count()) |
| 326 | + .build(); |
276 | 327 | }
|
277 | 328 |
|
278 | 329 | private Dataset<FileURI> validFileIdentDS() {
|
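Aside (not from this diff): both the bulk path and the serializable non-bulk path above push deletion to executors via `Dataset.foreachPartition`, so orphan paths are never collected on the driver. A minimal standalone sketch of that pattern, with a locally defined serializable consumer standing in for `org.apache.iceberg.util.SerializableConsumer`:

```java
import java.io.Serializable;
import java.util.function.Consumer;
import org.apache.spark.api.java.function.ForeachPartitionFunction;
import org.apache.spark.sql.Dataset;

class PartitionedDeleteSketch {
  // Stand-in for Iceberg's SerializableConsumer: the lambda is shipped to executors,
  // so it must be Serializable.
  interface SerializableStringConsumer extends Consumer<String>, Serializable {}

  static void deleteOnExecutors(Dataset<String> paths, SerializableStringConsumer deleteFn) {
    paths.foreachPartition(
        (ForeachPartitionFunction<String>)
            iterator -> {
              while (iterator.hasNext()) {
                deleteFn.accept(iterator.next()); // each partition deletes its own files
              }
            });
  }
}
```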
@@ -387,38 +438,28 @@ private static void listDirRecursively(
|
387 | 438 | }
|
388 | 439 | }
|
389 | 440 |
|
390 |
| - @VisibleForTesting |
391 |
| - static List<String> findOrphanFiles( |
392 |
| - SparkSession spark, |
393 |
| - Dataset<FileURI> actualFileIdentDS, |
394 |
| - Dataset<FileURI> validFileIdentDS, |
395 |
| - PrefixMismatchMode prefixMismatchMode) { |
396 |
| - |
397 |
| - SetAccumulator<Pair<String, String>> conflicts = new SetAccumulator<>(); |
398 |
| - spark.sparkContext().register(conflicts); |
399 |
| - |
| 441 | + Dataset<String> findOrphanFiles( |
| 442 | + SparkSession spark, Dataset<FileURI> actualFileIdentDS, Dataset<FileURI> validFileIdentDS) { |
| 443 | + if (orphanFileDF != null) { |
| 444 | + return orphanFileDF; |
| 445 | + } |
| 446 | + SetAccumulator<Pair<String, String>> conflictsAccumulator = accumulator(); |
| 447 | + spark.sparkContext().register(conflictsAccumulator); |
400 | 448 | Column joinCond = actualFileIdentDS.col("path").equalTo(validFileIdentDS.col("path"));
|
401 | 449 |
|
402 |
| - List<String> orphanFiles = |
| 450 | + orphanFileDF = |
403 | 451 | actualFileIdentDS
|
404 | 452 | .joinWith(validFileIdentDS, joinCond, "leftouter")
|
405 |
| - .mapPartitions(new FindOrphanFiles(prefixMismatchMode, conflicts), Encoders.STRING()) |
406 |
| - .collectAsList(); |
| 453 | + .mapPartitions( |
| 454 | + new FindOrphanFiles(prefixMismatchMode, conflictsAccumulator), Encoders.STRING()); |
| 455 | + return orphanFileDF; |
| 456 | + } |
407 | 457 |
|
408 |
| - if (prefixMismatchMode == PrefixMismatchMode.ERROR && !conflicts.value().isEmpty()) { |
409 |
| - throw new ValidationException( |
410 |
| - "Unable to determine whether certain files are orphan. " |
411 |
| - + "Metadata references files that match listed/provided files except for authority/scheme. " |
412 |
| - + "Please, inspect the conflicting authorities/schemes and provide which of them are equal " |
413 |
| - + "by further configuring the action via equalSchemes() and equalAuthorities() methods. " |
414 |
| - + "Set the prefix mismatch mode to 'NONE' to ignore remaining locations with conflicting " |
415 |
| - + "authorities/schemes or to 'DELETE' iff you are ABSOLUTELY confident that remaining conflicting " |
416 |
| - + "authorities/schemes are different. It will be impossible to recover deleted files. " |
417 |
| - + "Conflicting authorities/schemes: %s.", |
418 |
| - conflicts.value()); |
| 458 | + private SetAccumulator<Pair<String, String>> accumulator() { |
| 459 | + if (conflicts == null) { |
| 460 | + conflicts = new SetAccumulator<>(); |
419 | 461 | }
|
420 |
| - |
421 |
| - return orphanFiles; |
| 462 | + return conflicts; |
422 | 463 | }
|
423 | 464 |
|
424 | 465 | private static Map<String, String> flattenMap(Map<String, String> map) {
|
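Aside (not from this diff): `findOrphanFiles` now returns a lazy Dataset, so the conflicts accumulator is only populated once an action runs; that is why `doExecute()` calls `cache()` and `count()` before checking `conflicts.value()`. A rough sketch of the same pattern using Spark's built-in `CollectionAccumulator`, with purely illustrative names and filter logic:

```java
import java.util.List;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.CollectionAccumulator;

class AccumulatorSketch {
  // Values added inside tasks only become visible on the driver after the Dataset is materialized.
  static List<String> collectFlagged(SparkSession spark, Dataset<String> paths) {
    CollectionAccumulator<String> flagged = spark.sparkContext().collectionAccumulator("flagged");
    Dataset<String> checked =
        paths.map(
            (MapFunction<String, String>)
                path -> {
                  if (path.startsWith("s3a://")) {
                    flagged.add(path); // recorded on executors, merged back on the driver
                  }
                  return path;
                },
            Encoders.STRING());
    checked.count(); // without materializing, the accumulator would still be empty
    return flagged.value();
  }
}
```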