Skip to content

Commit d097326

Browse files
authored
Prune checkpoints in Lambda (#4777)
* Prune old file sources on Lambda * Refactor the custom source id serde into a separate module * Improve tests * Fix some bugs * Fix filter on existing sources * Simplify source cleanup rule * Minor adjustments * Go back to a drastic pruning strategy * Use reset_source_checkpoint instead of delete_source
1 parent 2402e67 commit d097326

File tree

5 files changed

+110
-50
lines changed

5 files changed

+110
-50
lines changed

quickwit/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

quickwit/quickwit-lambda/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ path = "src/bin/searcher.rs"
2222
anyhow = { workspace = true }
2323
aws_lambda_events = "0.15.0"
2424
chitchat = { workspace = true }
25+
chrono = { workspace = true }
2526
flate2 = { workspace = true }
2627
lambda_http = "0.10.0"
2728
lambda_runtime = "0.10.0"

quickwit/quickwit-lambda/src/indexer/environment.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,10 @@ pub static DISABLE_MERGE: Lazy<bool> =
3838

3939
pub static DISABLE_JANITOR: Lazy<bool> =
4040
Lazy::new(|| var("QW_LAMBDA_DISABLE_JANITOR").is_ok_and(|v| v.as_str() == "true"));
41+
42+
pub static MAX_CHECKPOINTS: Lazy<usize> = Lazy::new(|| {
43+
var("QW_LAMBDA_MAX_CHECKPOINTS").map_or(100, |v| {
44+
v.parse()
45+
.expect("QW_LAMBDA_MAX_CHECKPOINTS must be a positive integer")
46+
})
47+
});

quickwit/quickwit-lambda/src/indexer/ingest/helpers.rs

Lines changed: 94 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ use anyhow::{bail, Context};
2525
use chitchat::transport::ChannelTransport;
2626
use chitchat::FailureDetectorConfig;
2727
use quickwit_actors::{ActorHandle, Mailbox, Universe};
28-
use quickwit_cli::run_index_checklist;
2928
use quickwit_cluster::{Cluster, ClusterMember};
3029
use quickwit_common::pubsub::EventBroker;
3130
use quickwit_common::runtimes::RuntimesConfig;
@@ -34,7 +33,7 @@ use quickwit_config::merge_policy_config::MergePolicyConfig;
3433
use quickwit_config::service::QuickwitService;
3534
use quickwit_config::{
3635
load_index_config_from_user_config, ConfigFormat, IndexConfig, NodeConfig, SourceConfig,
37-
SourceInputFormat, SourceParams, TransformConfig, CLI_SOURCE_ID,
36+
SourceInputFormat, SourceParams, TransformConfig,
3837
};
3938
use quickwit_index_management::IndexService;
4039
use quickwit_indexing::actors::{
@@ -44,10 +43,13 @@ use quickwit_indexing::models::{DetachIndexingPipeline, DetachMergePipeline, Spa
4443
use quickwit_indexing::IndexingPipeline;
4544
use quickwit_ingest::IngesterPool;
4645
use quickwit_janitor::{start_janitor_service, JanitorService};
47-
use quickwit_metastore::CreateIndexRequestExt;
46+
use quickwit_metastore::{
47+
CreateIndexRequestExt, CreateIndexResponseExt, IndexMetadata, IndexMetadataResponseExt,
48+
};
4849
use quickwit_proto::indexing::CpuCapacity;
4950
use quickwit_proto::metastore::{
50-
CreateIndexRequest, MetastoreError, MetastoreService, MetastoreServiceClient,
51+
CreateIndexRequest, IndexMetadataRequest, MetastoreError, MetastoreService,
52+
MetastoreServiceClient, ResetSourceCheckpointRequest,
5153
};
5254
use quickwit_proto::types::{NodeId, PipelineUid};
5355
use quickwit_search::SearchJobPlacer;
@@ -56,7 +58,11 @@ use quickwit_telemetry::payload::{QuickwitFeature, QuickwitTelemetryInfo, Teleme
5658
use tracing::{debug, info, instrument};
5759

5860
use crate::environment::INDEX_ID;
59-
use crate::indexer::environment::{DISABLE_JANITOR, DISABLE_MERGE, INDEX_CONFIG_URI};
61+
use crate::indexer::environment::{
62+
DISABLE_JANITOR, DISABLE_MERGE, INDEX_CONFIG_URI, MAX_CHECKPOINTS,
63+
};
64+
65+
const LAMBDA_SOURCE_ID: &str = "_ingest-lambda-source";
6066

6167
/// The indexing service needs to update its cluster chitchat state so that the control plane is
6268
/// aware of the running tasks. We thus create a fake cluster to instantiate the indexing service
@@ -131,66 +137,71 @@ pub(super) async fn send_telemetry() {
131137
quickwit_telemetry::send_telemetry_event(TelemetryEvent::RunCommand).await;
132138
}
133139

134-
pub(super) fn configure_source(
140+
/// Convert the incoming file path to a source config
141+
pub(super) async fn configure_source(
135142
input_path: PathBuf,
136143
input_format: SourceInputFormat,
137144
vrl_script: Option<String>,
138-
) -> SourceConfig {
139-
let source_params = SourceParams::file(input_path);
145+
) -> anyhow::Result<SourceConfig> {
140146
let transform_config = vrl_script.map(|vrl_script| TransformConfig::new(vrl_script, None));
141-
SourceConfig {
142-
source_id: CLI_SOURCE_ID.to_string(),
147+
let source_params = SourceParams::file(input_path.clone());
148+
Ok(SourceConfig {
149+
source_id: LAMBDA_SOURCE_ID.to_owned(),
143150
num_pipelines: NonZeroUsize::new(1).expect("1 is always non-zero."),
144151
enabled: true,
145152
source_params,
146153
transform_config,
147154
input_format,
148-
}
155+
})
149156
}
150157

151158
/// Check if the index exists, creating or overwriting it if necessary
152159
pub(super) async fn init_index_if_necessary(
153160
metastore: &mut MetastoreServiceClient,
154161
storage_resolver: &StorageResolver,
155-
source_config: &SourceConfig,
156162
default_index_root_uri: &Uri,
157163
overwrite: bool,
158-
) -> anyhow::Result<()> {
159-
let checklist_result =
160-
run_index_checklist(metastore, storage_resolver, &INDEX_ID, Some(source_config)).await;
161-
if let Err(e) = checklist_result {
162-
let is_not_found = e
163-
.downcast_ref()
164-
.is_some_and(|meta_error| matches!(meta_error, MetastoreError::NotFound(_)));
165-
if !is_not_found {
166-
bail!(e);
164+
) -> anyhow::Result<IndexMetadata> {
165+
let metadata_result = metastore
166+
.index_metadata(IndexMetadataRequest::for_index_id(INDEX_ID.clone()))
167+
.await;
168+
let metadata = match metadata_result {
169+
Ok(_) if overwrite => {
170+
info!(
171+
index_id = *INDEX_ID,
172+
"Overwrite enabled, clearing existing index",
173+
);
174+
let mut index_service = IndexService::new(metastore.clone(), storage_resolver.clone());
175+
index_service.clear_index(&INDEX_ID).await?;
176+
metastore
177+
.index_metadata(IndexMetadataRequest::for_index_id(INDEX_ID.clone()))
178+
.await?
179+
.deserialize_index_metadata()?
167180
}
168-
info!(
169-
index_id = *INDEX_ID,
170-
index_config_uri = *INDEX_CONFIG_URI,
171-
"Index not found, creating it"
172-
);
173-
let index_config = load_index_config(storage_resolver, default_index_root_uri).await?;
174-
if index_config.index_id != *INDEX_ID {
175-
bail!(
176-
"Expected index ID was {} but config file had {}",
177-
*INDEX_ID,
178-
index_config.index_id,
181+
Ok(metadata_resp) => metadata_resp.deserialize_index_metadata()?,
182+
Err(MetastoreError::NotFound(_)) => {
183+
info!(
184+
index_id = *INDEX_ID,
185+
index_config_uri = *INDEX_CONFIG_URI,
186+
"Index not found, creating it"
179187
);
188+
let index_config = load_index_config(storage_resolver, default_index_root_uri).await?;
189+
if index_config.index_id != *INDEX_ID {
190+
bail!(
191+
"Expected index ID was {} but config file had {}",
192+
*INDEX_ID,
193+
index_config.index_id,
194+
);
195+
}
196+
let create_resp = metastore
197+
.create_index(CreateIndexRequest::try_from_index_config(&index_config)?)
198+
.await?;
199+
info!("index created");
200+
create_resp.deserialize_index_metadata()?
180201
}
181-
metastore
182-
.create_index(CreateIndexRequest::try_from_index_config(&index_config)?)
183-
.await?;
184-
info!("index created");
185-
} else if overwrite {
186-
info!(
187-
index_id = *INDEX_ID,
188-
"Overwrite enabled, clearing existing index",
189-
);
190-
let mut index_service = IndexService::new(metastore.clone(), storage_resolver.clone());
191-
index_service.clear_index(&INDEX_ID).await?;
192-
}
193-
Ok(())
202+
Err(e) => bail!(e),
203+
};
204+
Ok(metadata)
194205
}
195206

196207
pub(super) async fn spawn_services(
@@ -249,6 +260,7 @@ pub(super) async fn spawn_services(
249260
Ok((indexing_service_handle, janitor_service_opt))
250261
}
251262

263+
/// Spawn and split an indexing pipeline
252264
pub(super) async fn spawn_pipelines(
253265
indexing_server_mailbox: &Mailbox<IndexingService>,
254266
source_config: SourceConfig,
@@ -271,6 +283,43 @@ pub(super) async fn spawn_pipelines(
271283
Ok((indexing_pipeline_handle, merge_pipeline_handle))
272284
}
273285

286+
/// Prune old Lambda file checkpoints if there are too many
287+
///
288+
/// Without pruning, checkpoints accumulate indefinitely. This is particularly
289+
/// problematic when indexing a lot of small files, as the metastore will grow
290+
/// large even for a small index.
291+
///
292+
/// The current implementation just deletes all checkpoints if there are more
293+
/// than QW_LAMBDA_MAX_CHECKPOINTS. When this purging is performed, the Lambda
294+
/// indexer might ingest the same file again if it receives a duplicate
295+
/// notification.
296+
pub(super) async fn prune_lambda_source(
297+
metastore: &mut MetastoreServiceClient,
298+
index_metadata: IndexMetadata,
299+
) -> anyhow::Result<()> {
300+
let lambda_checkpoint_opt = index_metadata
301+
.checkpoint
302+
.source_checkpoint(LAMBDA_SOURCE_ID);
303+
304+
if let Some(lambda_checkpoint) = lambda_checkpoint_opt {
305+
if lambda_checkpoint.num_partitions() > *MAX_CHECKPOINTS {
306+
info!(
307+
partitions = lambda_checkpoint.num_partitions(),
308+
"prune Lambda checkpoints"
309+
);
310+
metastore
311+
.reset_source_checkpoint(ResetSourceCheckpointRequest {
312+
index_uid: Some(index_metadata.index_uid.clone()),
313+
source_id: LAMBDA_SOURCE_ID.to_owned(),
314+
})
315+
.await?;
316+
}
317+
}
318+
319+
Ok(())
320+
}
321+
322+
/// Observe the merge pipeline until there are no more ongoing merges
274323
pub(super) async fn wait_for_merges(
275324
merge_pipeline_handle: ActorHandle<MergePipeline>,
276325
) -> anyhow::Result<()> {

quickwit/quickwit-lambda/src/indexer/ingest/mod.rs

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ use quickwit_indexing::models::IndexingStatistics;
3838
use tracing::{debug, info};
3939

4040
use crate::indexer::environment::{CONFIGURATION_TEMPLATE, DISABLE_JANITOR};
41-
use crate::indexer::ingest::helpers::wait_for_merges;
41+
use crate::indexer::ingest::helpers::{prune_lambda_source, wait_for_merges};
4242
use crate::utils::load_node_config;
4343

4444
#[derive(Debug, Eq, PartialEq)]
@@ -58,17 +58,17 @@ pub async fn ingest(args: IngestArgs) -> anyhow::Result<IndexingStatistics> {
5858
let (config, storage_resolver, mut metastore) =
5959
load_node_config(CONFIGURATION_TEMPLATE).await?;
6060

61-
let source_config = configure_source(args.input_path, args.input_format, args.vrl_script);
62-
63-
init_index_if_necessary(
61+
let index_metadata = init_index_if_necessary(
6462
&mut metastore,
6563
&storage_resolver,
66-
&source_config,
6764
&config.default_index_root_uri,
6865
args.overwrite,
6966
)
7067
.await?;
7168

69+
let source_config =
70+
configure_source(args.input_path, args.input_format, args.vrl_script).await?;
71+
7272
let mut services = vec![QuickwitService::Indexer];
7373
if !*DISABLE_JANITOR {
7474
services.push(QuickwitService::Janitor);
@@ -92,6 +92,8 @@ pub async fn ingest(args: IngestArgs) -> anyhow::Result<IndexingStatistics> {
9292
let (indexing_pipeline_handle, merge_pipeline_handle) =
9393
spawn_pipelines(indexing_service_handle.mailbox(), source_config).await?;
9494

95+
prune_lambda_source(&mut metastore, index_metadata).await?;
96+
9597
debug!("wait for indexing to complete");
9698
let statistics = start_statistics_reporting_loop(indexing_pipeline_handle, false).await?;
9799

0 commit comments

Comments
 (0)