
Commit

chore: backfill sourceCode without redoing the calculation
Signed-off-by: Wei Zhang <[email protected]>
zwpaper committed Dec 18, 2024
1 parent 700151c commit 3f7fbc9
Showing 3 changed files with 99 additions and 24 deletions.
30 changes: 15 additions & 15 deletions crates/tabby-common/src/index/mod.rs
@@ -178,6 +178,21 @@ impl IndexSchema {
         ])
     }
 
+    /// Build a query to find the document with the given `doc_id`, include chunks.
+    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
+        let doc_id_query = TermQuery::new(
+            Term::from_field_text(self.field_id, doc_id),
+            tantivy::schema::IndexRecordOption::Basic,
+        );
+
+        BooleanQuery::new(vec![
+            // Must match the corpus
+            (Occur::Must, self.corpus_query(corpus)),
+            // Must match the doc id
+            (Occur::Must, Box::new(doc_id_query)),
+        ])
+    }
+
     pub fn doc_indexed_after(
         &self,
         corpus: &str,
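The query above matches the root document row and every chunk row that shares its `doc_id`, which is what `Indexer::delete` in `crates/tabby-index/src/indexer.rs` (below) relies on to purge a document wholesale. A minimal sketch of running it against a searcher — the `Searcher` setup is assumed and the corpus/id values are illustrative, not from this commit:

```rust
use tantivy::collector::Count;

// Count the index rows behind one document: 1 for the root document,
// plus one per chunk. Searcher construction is assumed.
fn rows_for_doc(
    searcher: &tantivy::Searcher,
    corpus: &str,
    doc_id: &str,
) -> tantivy::Result<usize> {
    let schema = tabby_common::index::IndexSchema::instance();
    let query = schema.doc_query_with_chunks(corpus, doc_id);
    searcher.search(&query, &Count)
}
```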
@@ -264,21 +279,6 @@ impl IndexSchema {
         ])
     }
 
-    /// Build a query to find the document with the given `doc_id`, include chunks.
-    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
-        let doc_id_query = TermQuery::new(
-            Term::from_field_text(self.field_id, doc_id),
-            tantivy::schema::IndexRecordOption::Basic,
-        );
-
-        BooleanQuery::new(vec![
-            // Must match the corpus
-            (Occur::Must, self.corpus_query(corpus)),
-            // Must match the doc id
-            (Occur::Must, Box::new(doc_id_query)),
-        ])
-    }
-
     pub fn corpus_query(&self, corpus: &str) -> Box<dyn Query> {
         Box::new(TermQuery::new(
             Term::from_field_text(self.field_corpus, corpus),
67 changes: 58 additions & 9 deletions crates/tabby-index/src/code/index.rs
@@ -3,8 +3,9 @@ use std::{pin::pin, sync::Arc};
 use async_stream::stream;
 use futures::StreamExt;
 use ignore::{DirEntry, Walk};
-use tabby_common::index::{code, corpus};
+use tabby_common::index::{self, code, corpus};
 use tabby_inference::Embedding;
+use tantivy::{doc, TantivyDocument};
 use tracing::warn;
 
 use super::{
@@ -101,7 +102,10 @@ async fn add_changed_documents(
 
             let id = SourceCode::to_index_id(&repository.source_id, &key).id;
 
+            // Skip if already indexed and there are no failed chunks;
+            // when skipping, check whether the document needs to be backfilled.
             if !require_updates(cloned_index.clone(), &id) {
+                let _ = backfill_commit_if_needed(cloned_index.clone(), &id, commit).await;
                 continue;
             }
 
@@ -139,22 +143,25 @@ async fn add_changed_documents(
     count_docs
 }
 
-// 1. Backfill if the document is missing the commit field
-// 2. Skip if already indexed and has no failed chunks
 fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
-    if should_backfill(indexer.clone(), id) {
-        return true;
-    }
     if indexer.is_indexed(id) && !indexer.has_failed_chunks(id) {
         return false;
     };
 
     true
 }
 
-fn should_backfill(indexer: Arc<Indexer>, id: &str) -> bool {
-    // v0.23.0 add the commit field to the code document.
-    !indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT)
+// v0.23.0 added the commit field to the code document.
+async fn backfill_commit_if_needed(indexer: Arc<Indexer>, id: &str, commit: &str) -> Result<()> {
+    if indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT) {
+        return Ok(());
+    }
+
+    let doc = indexer.get_doc(id).await?;
+    indexer.delete_doc(id);
+    indexer.add(create_document_with_commit(&doc, commit));
+
+    Ok(())
 }
 
 fn is_valid_file(file: &SourceCode) -> bool {
@@ -165,6 +172,48 @@ fn is_valid_file(file: &SourceCode) -> bool {
         && file.number_fraction <= MAX_NUMBER_FRACTION
 }
 
+fn create_document_with_commit(doc: &TantivyDocument, commit: &str) -> TantivyDocument {
+    let schema = tabby_common::index::IndexSchema::instance();
+    doc! {
+        schema.field_id => get_text(doc, schema.field_id),
+        schema.field_source_id => get_text(doc, schema.field_source_id).to_string(),
+        schema.field_corpus => get_text(doc, schema.field_corpus).to_string(),
+        schema.field_attributes => json!({
+            code::fields::ATTRIBUTE_COMMIT: commit,
+        }),
+        schema.field_updated_at => get_text(doc, schema.field_updated_at).to_string(),
+        schema.field_failed_chunks_count => get_json_number_field(doc, schema.field_failed_chunks_count, code::fields::FAILED_CHUNKS_COUNT) as usize,
+    }
+}
+
+fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
+    doc.get_first(field).unwrap().as_str().unwrap()
+}
+
+fn get_json_number_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> i64 {
+    doc.get_first(field)
+        .unwrap()
+        .as_object()
+        .unwrap()
+        .find(|(k, _)| *k == name)
+        .unwrap()
+        .1
+        .as_i64()
+        .unwrap()
+}
+
+fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
+    doc.get_first(field)
+        .unwrap()
+        .as_object()
+        .unwrap()
+        .find(|(k, _)| *k == name)
+        .unwrap()
+        .1
+        .as_str()
+        .unwrap()
+}
+
 #[cfg(test)]
 mod tests {
     use futures::StreamExt;
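One consequence of `create_document_with_commit` worth noting: the rebuilt `attributes` object contains only the commit key, so any other attribute the old document carried would not be copied over. A tiny sketch of reading the backfilled value back out with the helper above — the rebuilt document and the hash are hypothetical:

```rust
// Read the commit back out of a document produced by
// create_document_with_commit(&old_doc, "abc123") — the hash is made up.
fn assert_backfilled(rebuilt: &TantivyDocument) {
    let schema = tabby_common::index::IndexSchema::instance();
    let commit =
        get_json_text_field(rebuilt, schema.field_attributes, code::fields::ATTRIBUTE_COMMIT);
    assert_eq!(commit, "abc123");
}
```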
26 changes: 26 additions & 0 deletions crates/tabby-index/src/indexer.rs
@@ -197,13 +197,39 @@ impl Indexer {
             .expect("Failed to add document");
     }
 
+    pub async fn get_doc(&self, id: &str) -> Result<TantivyDocument> {
+        let schema = IndexSchema::instance();
+        let query = schema.doc_query(&self.corpus, id);
+        let docs = match self.searcher.search(&query, &TopDocs::with_limit(1)) {
+            Ok(docs) => docs,
+            Err(e) => {
+                debug!("query tantivy error: {}", e);
+                return Err(e.into());
+            }
+        };
+        if docs.is_empty() {
+            bail!("Document not found: {}", id);
+        }
+
+        self.searcher
+            .doc(docs.first().unwrap().1)
+            .map_err(|e| e.into())
+    }
+
     pub fn delete(&self, id: &str) {
         let schema = IndexSchema::instance();
         let _ = self
             .writer
             .delete_query(Box::new(schema.doc_query_with_chunks(&self.corpus, id)));
     }
 
+    pub fn delete_doc(&self, id: &str) {
+        let schema = IndexSchema::instance();
+        let _ = self
+            .writer
+            .delete_query(Box::new(schema.doc_query(&self.corpus, id)));
+    }
+
     pub fn commit(mut self) {
         self.writer.commit().expect("Failed to commit changes");
         self.writer
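With `get_doc` and `delete_doc` in place, the indexer has two deletion granularities: `delete` goes through `doc_query_with_chunks` and removes a document together with all of its chunks, while `delete_doc` goes through the plain `doc_query` and removes only the root document, leaving chunk rows — and their already-computed embeddings — untouched. That asymmetry is what lets the backfill skip re-embedding. A sketch of the contrast, with an illustrative id and the indexer setup assumed:

```rust
// Two deletion granularities after this commit (sketch; id is illustrative).
fn deletion_granularities(indexer: &Indexer) {
    // Root document + every chunk sharing its doc id: a full
    // re-index (including embeddings) is needed afterwards.
    indexer.delete("code:example-doc");

    // Root document only: chunk rows and their embeddings survive,
    // which is what backfill_commit_if_needed relies on.
    indexer.delete_doc("code:example-doc");
}
```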
