From 8731441e0338e8f4f64bafdcc2032774233ed4bc Mon Sep 17 00:00:00 2001
From: Robert Stupp
Date: Mon, 11 Nov 2024 16:07:06 +0100
Subject: [PATCH] GC: consider statistics files (#9898)

(Untested) implementation to consider referenced statistics files as 'live'.
---
 CHANGELOG.md                                  |  2 ++
 .../gc/iceberg/IcebergContentToFiles.java     | 21 +++++++++++++++++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7b1ba0df82c..9b8d3413e2b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,8 @@ as necessary. Empty sections will not end in the release notes.
 
 ### Fixes
 
+- GC: Consider referenced statistics (and partition statistics) files as 'live'.
+
 ### Commits
 
 ## [0.99.0] Release (2024-09-26)
diff --git a/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java b/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java
index fd2d08b7d46..7a646590672 100644
--- a/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java
+++ b/gc/gc-iceberg/src/main/java/org/projectnessie/gc/iceberg/IcebergContentToFiles.java
@@ -32,7 +32,9 @@
 import org.apache.iceberg.ManifestFile;
 import org.apache.iceberg.ManifestReaderUtil;
 import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.PartitionStatisticsFile;
 import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.StatisticsFile;
 import org.apache.iceberg.TableMetadata;
 import org.apache.iceberg.TableMetadataParser;
 import org.apache.iceberg.io.CloseableIterable;
@@ -137,11 +139,26 @@ private Stream<StorageUri> extractTableFiles(ContentReference contentReferenc
     Snapshot snapshot =
         snapshotId < 0L ? tableMetadata.currentSnapshot() : tableMetadata.snapshot(snapshotId);
 
-    Map<Integer, PartitionSpec> specsById = tableMetadata.specsById();
-
     Stream<StorageUri> allFiles = elementaryUrisFromSnapshot(snapshot, contentReference);
 
     if (snapshot != null) {
+      long effectiveSnapshotId = snapshot.snapshotId();
+      allFiles =
+          Stream.concat(
+              allFiles,
+              tableMetadata.statisticsFiles().stream()
+                  .filter(s -> s.snapshotId() == effectiveSnapshotId)
+                  .map(StatisticsFile::path)
+                  .map(StorageUri::of));
+      allFiles =
+          Stream.concat(
+              allFiles,
+              tableMetadata.partitionStatisticsFiles().stream()
+                  .filter(s -> s.snapshotId() == effectiveSnapshotId)
+                  .map(PartitionStatisticsFile::path)
+                  .map(StorageUri::of));
+
+      Map<Integer, PartitionSpec> specsById = tableMetadata.specsById();
       allFiles =
           Stream.concat(
               allFiles,
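
Note (illustrative, not part of the patch): the second hunk boils down to the logic sketched below, shown as a standalone helper for readability. The class and method names (StatisticsLiveness, collectStatisticsPaths) are hypothetical; the Iceberg calls (TableMetadata#statisticsFiles, TableMetadata#partitionStatisticsFiles, StatisticsFile#path, PartitionStatisticsFile#path, Snapshot#snapshotId) are the ones the patch relies on. The production code additionally wraps each path in Nessie's StorageUri via StorageUri::of and concatenates the result onto the existing allFiles stream so the GC treats these files as 'live'.

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.iceberg.PartitionStatisticsFile;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.StatisticsFile;
import org.apache.iceberg.TableMetadata;

final class StatisticsLiveness {

  // Hypothetical helper distilled from the hunk above: collect the paths of all
  // statistics and partition-statistics files attached to the given snapshot, so
  // a GC run can treat them as 'live' alongside the snapshot's data and manifest files.
  static List<String> collectStatisticsPaths(TableMetadata tableMetadata, Snapshot snapshot) {
    long snapshotId = snapshot.snapshotId();

    // Table-level statistics files referenced from the table metadata, filtered to the
    // snapshot at hand.
    Stream<String> stats =
        tableMetadata.statisticsFiles().stream()
            .filter(s -> s.snapshotId() == snapshotId)
            .map(StatisticsFile::path);

    // Partition statistics files, filtered the same way by owning snapshot id.
    Stream<String> partitionStats =
        tableMetadata.partitionStatisticsFiles().stream()
            .filter(s -> s.snapshotId() == snapshotId)
            .map(PartitionStatisticsFile::path);

    return Stream.concat(stats, partitionStats).collect(Collectors.toList());
  }
}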