Skip to content

Commit 624dfed

Browse files
committed
Go back to a drastic pruning strategy
1 parent f6065a4 commit 624dfed

File tree

4 files changed

+51
-202
lines changed

4 files changed

+51
-202
lines changed

quickwit/quickwit-lambda/src/indexer/environment.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,10 @@ pub static DISABLE_MERGE: Lazy<bool> =
3838

3939
pub static DISABLE_JANITOR: Lazy<bool> =
4040
Lazy::new(|| var("QW_LAMBDA_DISABLE_JANITOR").is_ok_and(|v| v.as_str() == "true"));
41+
42+
pub static MAX_CHECKPOINTS: Lazy<usize> = Lazy::new(|| {
43+
var("QW_LAMBDA_MAX_CHECKPOINTS").map_or(100, |v| {
44+
v.parse()
45+
.expect("QW_LAMBDA_MAX_CHECKPOINTS must be a positive integer")
46+
})
47+
});

quickwit/quickwit-lambda/src/indexer/ingest/helpers.rs

Lines changed: 40 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ use std::path::{Path, PathBuf};
2424
use anyhow::{bail, Context};
2525
use chitchat::transport::ChannelTransport;
2626
use chitchat::FailureDetectorConfig;
27-
use chrono::Utc;
2827
use quickwit_actors::{ActorHandle, Mailbox, Universe};
2928
use quickwit_cluster::{Cluster, ClusterMember};
3029
use quickwit_common::pubsub::EventBroker;
@@ -49,20 +48,21 @@ use quickwit_metastore::{
4948
};
5049
use quickwit_proto::indexing::CpuCapacity;
5150
use quickwit_proto::metastore::{
52-
AddSourceRequest, CreateIndexRequest, DeleteSourceRequest, IndexMetadataRequest,
53-
MetastoreError, MetastoreService, MetastoreServiceClient,
51+
CreateIndexRequest, DeleteSourceRequest, IndexMetadataRequest, MetastoreError,
52+
MetastoreService, MetastoreServiceClient,
5453
};
5554
use quickwit_proto::types::{NodeId, PipelineUid};
5655
use quickwit_search::SearchJobPlacer;
5756
use quickwit_storage::StorageResolver;
5857
use quickwit_telemetry::payload::{QuickwitFeature, QuickwitTelemetryInfo, TelemetryEvent};
5958
use tracing::{debug, info, instrument};
6059

61-
use super::source_id::{
62-
create_lambda_source_id, filter_prunable_lambda_source_ids, is_lambda_source_id,
63-
};
6460
use crate::environment::INDEX_ID;
65-
use crate::indexer::environment::{DISABLE_JANITOR, DISABLE_MERGE, INDEX_CONFIG_URI};
61+
use crate::indexer::environment::{
62+
DISABLE_JANITOR, DISABLE_MERGE, INDEX_CONFIG_URI, MAX_CHECKPOINTS,
63+
};
64+
65+
const LAMBDA_SOURCE_ID: &str = "_ingest-lambda-source";
6666

6767
/// The indexing service needs to update its cluster chitchat state so that the control plane is
6868
/// aware of the running tasks. We thus create a fake cluster to instantiate the indexing service
@@ -137,60 +137,22 @@ pub(super) async fn send_telemetry() {
137137
quickwit_telemetry::send_telemetry_event(TelemetryEvent::RunCommand).await;
138138
}
139139

140-
/// Convert the incoming file path to a source config and save it to the metastore
141-
///
142-
/// If a Lambda file source already exists with the same path, format and transform, reuse it.
140+
/// Convert the incoming file path to a source config
143141
pub(super) async fn configure_source(
144-
metastore: &mut MetastoreServiceClient,
145142
input_path: PathBuf,
146143
input_format: SourceInputFormat,
147-
index_metadata: &IndexMetadata,
148144
vrl_script: Option<String>,
149145
) -> anyhow::Result<SourceConfig> {
150146
let transform_config = vrl_script.map(|vrl_script| TransformConfig::new(vrl_script, None));
151147
let source_params = SourceParams::file(input_path.clone());
152-
153-
let existing_sources_for_config: Vec<_> = index_metadata
154-
.sources
155-
.iter()
156-
.filter(|(src_id, src_config)| {
157-
is_lambda_source_id(src_id)
158-
&& src_config.source_params == source_params
159-
&& src_config.input_format == input_format
160-
&& src_config.transform_config == transform_config
161-
})
162-
.map(|(src_id, _)| src_id)
163-
.collect();
164-
165-
let source_id = match existing_sources_for_config.len() {
166-
0 => create_lambda_source_id(Utc::now()),
167-
1 => existing_sources_for_config[0].clone(),
168-
n => bail!(
169-
"Found {} existing Lambda sources for file {:?}, expected at most 1",
170-
n,
171-
input_path,
172-
),
173-
};
174-
175-
let src_config = SourceConfig {
176-
source_id,
148+
Ok(SourceConfig {
149+
source_id: LAMBDA_SOURCE_ID.to_owned(),
177150
num_pipelines: NonZeroUsize::new(1).expect("1 is always non-zero."),
178151
enabled: true,
179152
source_params,
180153
transform_config,
181154
input_format,
182-
};
183-
184-
if existing_sources_for_config.is_empty() {
185-
metastore
186-
.add_source(AddSourceRequest {
187-
index_uid: Some(index_metadata.index_uid.clone()),
188-
source_config_json: serde_json::to_string(&src_config)?,
189-
})
190-
.await?;
191-
}
192-
193-
Ok(src_config)
155+
})
194156
}
195157

196158
/// Check if the index exists, creating or overwriting it if necessary
@@ -321,26 +283,39 @@ pub(super) async fn spawn_pipelines(
321283
Ok((indexing_pipeline_handle, merge_pipeline_handle))
322284
}
323285

324-
/// Delete old Lambda file sources
325-
pub(super) async fn prune_file_sources(
286+
/// Prune old Lambda file checkpoints if there are too many
287+
///
288+
/// Without pruning, checkpoints accumulate indefinitely. This is particularly
289+
/// problematic when indexing a lot of small files, as the metastore will grow
290+
/// large even for a small index.
291+
///
292+
/// The current implementation just deletes all checkpoints if there are more
293+
/// than QW_LAMBDA_MAX_CHECKPOINTS. When this purging is performed, the Lambda
294+
/// indexer might ingest the same file again if it receives a duplicate
295+
/// notification.
296+
pub(super) async fn prune_lambda_source(
326297
metastore: &mut MetastoreServiceClient,
327298
index_metadata: IndexMetadata,
328299
) -> anyhow::Result<()> {
329-
let prunable_sources: Vec<_> =
330-
filter_prunable_lambda_source_ids(index_metadata.sources.keys())?.collect();
331-
info!(
332-
existing = index_metadata.sources.len(),
333-
prunable = prunable_sources.len(),
334-
"prune file sources"
335-
);
336-
for src_id in prunable_sources {
337-
metastore
338-
.delete_source(DeleteSourceRequest {
339-
index_uid: Some(index_metadata.index_uid.clone()),
340-
source_id: src_id.clone(),
341-
})
342-
.await?;
300+
let lambda_checkpoint_opt = index_metadata
301+
.checkpoint
302+
.source_checkpoint(LAMBDA_SOURCE_ID);
303+
304+
if let Some(lambda_checkpoint) = lambda_checkpoint_opt {
305+
if lambda_checkpoint.num_partitions() > *MAX_CHECKPOINTS {
306+
info!(
307+
partitions = lambda_checkpoint.num_partitions(),
308+
"prune Lambda checkpoints"
309+
);
310+
metastore
311+
.delete_source(DeleteSourceRequest {
312+
index_uid: Some(index_metadata.index_uid.clone()),
313+
source_id: LAMBDA_SOURCE_ID.to_owned(),
314+
})
315+
.await?;
316+
}
343317
}
318+
344319
Ok(())
345320
}
346321

quickwit/quickwit-lambda/src/indexer/ingest/mod.rs

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
// along with this program. If not, see <http://www.gnu.org/licenses/>.
1919

2020
mod helpers;
21-
mod source_id;
2221

2322
use std::collections::HashSet;
2423
use std::path::PathBuf;
@@ -39,7 +38,7 @@ use quickwit_indexing::models::IndexingStatistics;
3938
use tracing::{debug, info};
4039

4140
use crate::indexer::environment::{CONFIGURATION_TEMPLATE, DISABLE_JANITOR};
42-
use crate::indexer::ingest::helpers::{prune_file_sources, wait_for_merges};
41+
use crate::indexer::ingest::helpers::{prune_lambda_source, wait_for_merges};
4342
use crate::utils::load_node_config;
4443

4544
#[derive(Debug, Eq, PartialEq)]
@@ -67,14 +66,8 @@ pub async fn ingest(args: IngestArgs) -> anyhow::Result<IndexingStatistics> {
6766
)
6867
.await?;
6968

70-
let source_config = configure_source(
71-
&mut metastore,
72-
args.input_path,
73-
args.input_format,
74-
&index_metadata,
75-
args.vrl_script,
76-
)
77-
.await?;
69+
let source_config =
70+
configure_source(args.input_path, args.input_format, args.vrl_script).await?;
7871

7972
let mut services = vec![QuickwitService::Indexer];
8073
if !*DISABLE_JANITOR {
@@ -99,7 +92,7 @@ pub async fn ingest(args: IngestArgs) -> anyhow::Result<IndexingStatistics> {
9992
let (indexing_pipeline_handle, merge_pipeline_handle) =
10093
spawn_pipelines(indexing_service_handle.mailbox(), source_config).await?;
10194

102-
prune_file_sources(&mut metastore, index_metadata).await?;
95+
prune_lambda_source(&mut metastore, index_metadata).await?;
10396

10497
debug!("wait for indexing to complete");
10598
let statistics = start_statistics_reporting_loop(indexing_pipeline_handle, false).await?;

quickwit/quickwit-lambda/src/indexer/ingest/source_id.rs

Lines changed: 0 additions & 126 deletions
This file was deleted.

0 commit comments

Comments
 (0)