|
1 | 1 | use crate::construct_version_graph_orchestrator::{
|
2 |
| - ConstructVersionGraphError, ConstructVersionGraphOrchestrator, |
| 2 | + ConstructVersionGraphError, ConstructVersionGraphOrchestrator, VersionGraph, |
3 | 3 | };
|
4 | 4 | use crate::operators::compute_versions_to_delete_from_graph::{
|
5 | 5 | CollectionVersionAction, ComputeVersionsToDeleteError, ComputeVersionsToDeleteInput,
|
@@ -60,6 +60,7 @@ pub struct GarbageCollectorOrchestrator {
|
60 | 60 | file_ref_counts: HashMap<String, u32>,
|
61 | 61 | num_pending_tasks: usize,
|
62 | 62 | min_versions_to_keep: u32,
|
| 63 | + graph: Option<VersionGraph>, |
63 | 64 |
|
64 | 65 | num_files_deleted: u32,
|
65 | 66 | num_versions_deleted: u32,
|
@@ -105,6 +106,7 @@ impl GarbageCollectorOrchestrator {
|
105 | 106 | pending_list_files_at_version_tasks: HashSet::new(),
|
106 | 107 | num_pending_tasks: 0,
|
107 | 108 | min_versions_to_keep,
|
| 109 | + graph: None, |
108 | 110 |
|
109 | 111 | num_files_deleted: 0,
|
110 | 112 | num_versions_deleted: 0,
|
@@ -215,6 +217,7 @@ impl GarbageCollectorOrchestrator {
|
215 | 217 | );
|
216 | 218 | let output = orchestrator.run(self.system.clone()).await?;
|
217 | 219 | self.version_files = output.version_files;
|
| 220 | + self.graph = Some(output.graph.clone()); |
218 | 221 |
|
219 | 222 | let task = wrap(
|
220 | 223 | Box::new(ComputeVersionsToDeleteOperator {}),
|
@@ -375,6 +378,63 @@ impl GarbageCollectorOrchestrator {
|
375 | 378 | output.file_paths,
|
376 | 379 | );
|
377 | 380 |
|
| 381 | + if output.file_paths.is_empty() { |
| 382 | + // We only allow empty file paths if the version is 0 and all ancestors are also at v0. Otherwise, compaction should have flushed new file paths. This check is defensive and should never fail. |
| 383 | + let graph = self |
| 384 | + .graph |
| 385 | + .as_ref() |
| 386 | + .ok_or(GarbageCollectorError::InvariantViolation( |
| 387 | + "Expected graph to be set".to_string(), |
| 388 | + ))?; |
| 389 | + |
| 390 | + let this_node = graph |
| 391 | + .node_indices() |
| 392 | + .find(|&n| { |
| 393 | + let node = graph.node_weight(n).expect("Node should exist"); |
| 394 | + node.collection_id == output.collection_id && node.version == output.version |
| 395 | + }) |
| 396 | + .ok_or(GarbageCollectorError::InvariantViolation(format!( |
| 397 | + "Expected to find node for collection {} at version {}", |
| 398 | + output.collection_id, output.version |
| 399 | + )))?; |
| 400 | + |
| 401 | + let root = graph |
| 402 | + .node_indices() |
| 403 | + .find(|&n| { |
| 404 | + graph |
| 405 | + .neighbors_directed(n, petgraph::Direction::Incoming) |
| 406 | + .next() |
| 407 | + .is_none() |
| 408 | + }) |
| 409 | + .ok_or(GarbageCollectorError::InvariantViolation( |
| 410 | + "Expected to find root node".to_string(), |
| 411 | + ))?; |
| 412 | + |
| 413 | + let versions_from_root_to_this_node = |
| 414 | + petgraph::algo::astar(graph, root, |finish| finish == this_node, |_| 1, |_| 0) |
| 415 | + .ok_or(GarbageCollectorError::InvariantViolation(format!( |
| 416 | + "Expected to find path from root to node for {}@v{}", |
| 417 | + output.collection_id, output.version |
| 418 | + )))? |
| 419 | + .1 |
| 420 | + .into_iter() |
| 421 | + .map(|i| { |
| 422 | + let node = graph.node_weight(i).expect("Node should exist"); |
| 423 | + node.version |
| 424 | + }) |
| 425 | + .collect::<Vec<_>>(); |
| 426 | + let are_all_versions_v0 = versions_from_root_to_this_node |
| 427 | + .iter() |
| 428 | + .all(|&version| version == 0); |
| 429 | + |
| 430 | + if !are_all_versions_v0 { |
| 431 | + return Err(GarbageCollectorError::InvariantViolation(format!( |
| 432 | + "Version {} of collection {} has no file paths, but has non-v0 ancestors. This should never happen.", |
| 433 | + output.version, output.collection_id |
| 434 | + ))); |
| 435 | + } |
| 436 | + } |
| 437 | + |
378 | 438 | // Update the file ref counts. Counts in the map should:
|
379 | 439 | // - be 0 if we know about the file but it is unused
|
380 | 440 | // - be > 0 if we know about the file and it is used
|
@@ -728,3 +788,107 @@ impl Handler<TaskResult<DeleteVersionsAtSysDbOutput, DeleteVersionsAtSysDbError>
|
728 | 788 | }
|
729 | 789 | }
|
730 | 790 | }
|
| 791 | + |
| 792 | +#[cfg(test)] |
| 793 | +mod tests { |
| 794 | + use super::GarbageCollectorOrchestrator; |
| 795 | + use chroma_blockstore::RootManager; |
| 796 | + use chroma_cache::nop::NopCache; |
| 797 | + use chroma_storage::test_storage; |
| 798 | + use chroma_sysdb::TestSysDb; |
| 799 | + use chroma_system::{Dispatcher, Orchestrator, System}; |
| 800 | + use chroma_types::{ |
| 801 | + CollectionUuid, Segment, SegmentFlushInfo, SegmentScope, SegmentType, SegmentUuid, |
| 802 | + }; |
| 803 | + use chrono::DateTime; |
| 804 | + use std::{collections::HashMap, sync::Arc, time::SystemTime}; |
| 805 | + |
| 806 | + #[tokio::test(flavor = "multi_thread")] |
| 807 | + async fn errors_on_empty_file_paths() { |
| 808 | + let storage = test_storage(); |
| 809 | + let mut test_sysdb = TestSysDb::new(); |
| 810 | + test_sysdb.set_storage(Some(storage.clone())); |
| 811 | + let mut sysdb = chroma_sysdb::SysDb::Test(test_sysdb); |
| 812 | + |
| 813 | + let system = System::new(); |
| 814 | + let dispatcher = Dispatcher::new(Default::default()); |
| 815 | + let dispatcher_handle = system.start_component(dispatcher); |
| 816 | + let root_manager = RootManager::new(storage.clone(), Box::new(NopCache)); |
| 817 | + |
| 818 | + let tenant = "test_tenant".to_string(); |
| 819 | + let database = "test_database".to_string(); |
| 820 | + |
| 821 | + let root_collection_id = CollectionUuid::new(); |
| 822 | + let segment_id = SegmentUuid::new(); |
| 823 | + let segment = Segment { |
| 824 | + id: segment_id, |
| 825 | + r#type: SegmentType::BlockfileMetadata, |
| 826 | + scope: SegmentScope::METADATA, |
| 827 | + collection: root_collection_id, |
| 828 | + metadata: None, |
| 829 | + file_path: HashMap::new(), |
| 830 | + }; |
| 831 | + |
| 832 | + sysdb |
| 833 | + .create_collection( |
| 834 | + tenant.clone(), |
| 835 | + database, |
| 836 | + root_collection_id, |
| 837 | + "Root Collection".to_string(), |
| 838 | + vec![segment], |
| 839 | + None, |
| 840 | + None, |
| 841 | + None, |
| 842 | + false, |
| 843 | + ) |
| 844 | + .await |
| 845 | + .unwrap(); |
| 846 | + |
| 847 | + // Flush a compaction whose only segment has an empty file-path map, creating v1 with no file paths |
| 848 | + sysdb |
| 849 | + .flush_compaction( |
| 850 | + tenant, |
| 851 | + root_collection_id, |
| 852 | + 0, |
| 853 | + 0, |
| 854 | + Arc::new([SegmentFlushInfo { |
| 855 | + segment_id, |
| 856 | + file_paths: HashMap::new(), |
| 857 | + }]), |
| 858 | + 0, |
| 859 | + 0, |
| 860 | + ) |
| 861 | + .await |
| 862 | + .unwrap(); |
| 863 | + |
| 864 | + // Running the garbage collector should fail: a non-v0 version with no file paths must be reported as an error |
| 865 | + let mut collections = sysdb |
| 866 | + .get_collections(Some(root_collection_id), None, None, None, None, 0) |
| 867 | + .await |
| 868 | + .unwrap(); |
| 869 | + let root_collection = collections.pop().unwrap(); |
| 870 | + let orchestrator = GarbageCollectorOrchestrator::new( |
| 871 | + root_collection_id, |
| 872 | + root_collection.version_file_path.unwrap(), |
| 873 | + None, |
| 874 | + DateTime::from_timestamp( |
| 875 | + SystemTime::now() |
| 876 | + .duration_since(SystemTime::UNIX_EPOCH) |
| 877 | + .unwrap() |
| 878 | + .as_secs() as i64, |
| 879 | + 0, |
| 880 | + ) |
| 881 | + .unwrap(), |
| 882 | + sysdb, |
| 883 | + dispatcher_handle, |
| 884 | + system.clone(), |
| 885 | + storage, |
| 886 | + root_manager, |
| 887 | + crate::types::CleanupMode::Delete, |
| 888 | + 1, |
| 889 | + ); |
| 890 | + let result = orchestrator.run(system).await; |
| 891 | + assert!(result.is_err()); |
| 892 | + assert!(format!("{:?}", result).contains("no file paths")); |
| 893 | + } |
| 894 | +} |
0 commit comments