From 8731441e0338e8f4f64bafdcc2032774233ed4bc Mon Sep 17 00:00:00 2001
From: Robert Stupp
Date: Mon, 11 Nov 2024 16:07:06 +0100
Subject: [PATCH] GC: consider statistics files (#9898)

(Untested) implementation to consider referenced statistics files as 'live'.
---
 CHANGELOG.md                                  |  2 ++
 .../gc/iceberg/IcebergContentToFiles.java     | 21 +++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b1ba0df82c..9b8d3413e2b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,8 @@ as necessary. Empty sections will not end in the release notes.
 
 ### Fixes
 
+- GC: Consider referenced statistics (and partition statistics) files as 'live'.
+
 ### Commits
 
 ## [0.99.0] Release (2024-09-26)
diff --git a/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java b/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java
index fd2d08b7d46..7a646590672 100644
--- a/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java
+++ b/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java
@@ -32,7 +32,9 @@
 import org.apache.iceberg.ManifestFile;
 import org.apache.iceberg.ManifestReaderUtil;
 import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.PartitionStatisticsFile;
 import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.StatisticsFile;
 import org.apache.iceberg.TableMetadata;
 import org.apache.iceberg.TableMetadataParser;
 import org.apache.iceberg.io.CloseableIterable;
@@ -137,11 +139,26 @@ private Stream<StorageUri> extractTableFiles(ContentReference contentReferenc
     Snapshot snapshot =
         snapshotId < 0L ? tableMetadata.currentSnapshot() : tableMetadata.snapshot(snapshotId);
 
-    Map<Integer, PartitionSpec> specsById = tableMetadata.specsById();
-
     Stream<StorageUri> allFiles = elementaryUrisFromSnapshot(snapshot, contentReference);
 
     if (snapshot != null) {
+      long effectiveSnapshotId = snapshot.snapshotId();
+      allFiles =
+          Stream.concat(
+              allFiles,
+              tableMetadata.statisticsFiles().stream()
+                  .filter(s -> s.snapshotId() == effectiveSnapshotId)
+                  .map(StatisticsFile::path)
+                  .map(StorageUri::of));
+      allFiles =
+          Stream.concat(
+              allFiles,
+              tableMetadata.partitionStatisticsFiles().stream()
+                  .filter(s -> s.snapshotId() == effectiveSnapshotId)
+                  .map(PartitionStatisticsFile::path)
+                  .map(StorageUri::of));
+
+      Map<Integer, PartitionSpec> specsById = tableMetadata.specsById();
       allFiles =
           Stream.concat(
               allFiles,
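
Note (illustrative, not part of the patch): the second hunk boils down to the logic sketched below, shown as a standalone helper for readability. The class and method names (StatisticsLiveness, collectStatisticsPaths) are hypothetical; the Iceberg calls (TableMetadata#statisticsFiles, TableMetadata#partitionStatisticsFiles, StatisticsFile#path, PartitionStatisticsFile#path, Snapshot#snapshotId) are the ones the patch relies on. The production code additionally wraps each path in Nessie's StorageUri via StorageUri::of and concatenates the result onto the existing allFiles stream so the GC treats these files as 'live'.

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.iceberg.PartitionStatisticsFile;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.StatisticsFile;
import org.apache.iceberg.TableMetadata;

final class StatisticsLiveness {

  // Hypothetical helper distilled from the hunk above: collect the paths of all
  // statistics and partition-statistics files attached to the given snapshot, so
  // a GC run can treat them as 'live' alongside the snapshot's data and manifest files.
  static List<String> collectStatisticsPaths(TableMetadata tableMetadata, Snapshot snapshot) {
    long snapshotId = snapshot.snapshotId();

    // Table-level statistics files referenced from the table metadata, filtered to the
    // snapshot at hand.
    Stream<String> stats =
        tableMetadata.statisticsFiles().stream()
            .filter(s -> s.snapshotId() == snapshotId)
            .map(StatisticsFile::path);

    // Partition statistics files, filtered the same way by owning snapshot id.
    Stream<String> partitionStats =
        tableMetadata.partitionStatisticsFiles().stream()
            .filter(s -> s.snapshotId() == snapshotId)
            .map(PartitionStatisticsFile::path);

    return Stream.concat(stats, partitionStats).collect(Collectors.toList());
  }
}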