chore: backfill sourceCode without redo calculate
Signed-off-by: Wei Zhang <[email protected]>
zwpaper committed Dec 18, 2024
1 parent b754142 commit 8068290
Showing 3 changed files with 110 additions and 24 deletions.
30 changes: 15 additions & 15 deletions crates/tabby-common/src/index/mod.rs
@@ -178,6 +178,21 @@ impl IndexSchema {
        ])
    }

    /// Build a query to find the document with the given `doc_id`, including its chunks.
    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
        let doc_id_query = TermQuery::new(
            Term::from_field_text(self.field_id, doc_id),
            tantivy::schema::IndexRecordOption::Basic,
        );

        BooleanQuery::new(vec![
            // Must match the corpus
            (Occur::Must, self.corpus_query(corpus)),
            // Must match the doc id
            (Occur::Must, Box::new(doc_id_query)),
        ])
    }

    pub fn doc_indexed_after(
        &self,
        corpus: &str,
@@ -264,21 +279,6 @@ impl IndexSchema {
        ])
    }

    /// Build a query to find the document with the given `doc_id`, including its chunks.
    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
        let doc_id_query = TermQuery::new(
            Term::from_field_text(self.field_id, doc_id),
            tantivy::schema::IndexRecordOption::Basic,
        );

        BooleanQuery::new(vec![
            // Must match the corpus
            (Occur::Must, self.corpus_query(corpus)),
            // Must match the doc id
            (Occur::Must, Box::new(doc_id_query)),
        ])
    }

    pub fn corpus_query(&self, corpus: &str) -> Box<dyn Query> {
        Box::new(TermQuery::new(
            Term::from_field_text(self.field_corpus, corpus),
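For context on the moved helper: doc_query_with_chunks is a plain conjunction of two term queries with no constraint on document kind, so the root document and all of its chunks (which share the doc's field_id) match. Below is a minimal, self-contained sketch of the same query shape against a throwaway tantivy index; the corpus/id schema and the data are illustrative stand-ins, not Tabby's actual schema.

use tantivy::{
    collector::TopDocs,
    doc,
    query::{BooleanQuery, Occur, Query, TermQuery},
    schema::{IndexRecordOption, Schema, STRING},
    Index, Term,
};

fn main() -> tantivy::Result<()> {
    // Illustrative schema: raw (untokenized) corpus and id fields.
    let mut builder = Schema::builder();
    let corpus = builder.add_text_field("corpus", STRING);
    let id = builder.add_text_field("id", STRING);
    let index = Index::create_in_ram(builder.build());

    let mut writer = index.writer(50_000_000)?;
    // Stand-ins for a root document and two chunk documents sharing one id.
    for _ in 0..3 {
        writer.add_document(doc!(corpus => "code", id => "doc-1"))?;
    }
    writer.commit()?;

    // Same shape as doc_query_with_chunks: corpus AND doc id, nothing else.
    let query = BooleanQuery::new(vec![
        (
            Occur::Must,
            Box::new(TermQuery::new(
                Term::from_field_text(corpus, "code"),
                IndexRecordOption::Basic,
            )) as Box<dyn Query>,
        ),
        (
            Occur::Must,
            Box::new(TermQuery::new(
                Term::from_field_text(id, "doc-1"),
                IndexRecordOption::Basic,
            )),
        ),
    ]);

    let searcher = index.reader()?.searcher();
    let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
    assert_eq!(hits.len(), 3); // the root doc and both chunks all match
    Ok(())
}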
79 changes: 70 additions & 9 deletions crates/tabby-index/src/code/index.rs
@@ -3,9 +3,11 @@ use std::{pin::pin, sync::Arc};
use async_stream::stream;
use futures::StreamExt;
use ignore::{DirEntry, Walk};
use tabby_common::index::{code, corpus};
use tabby_common::index::{self, code, corpus};
use tabby_inference::Embedding;
use tracing::warn;
use tantivy::{schema, TantivyDocument};

use super::{
    create_code_builder,
@@ -101,7 +103,10 @@ async fn add_changed_documents(

        let id = SourceCode::to_index_id(&repository.source_id, &key).id;

        // Skip if the document is already indexed and has no failed chunks;
        // when skipping, check whether it still needs the commit backfill.
        if !require_updates(cloned_index.clone(), &id) {
            backfill_commit_if_needed(cloned_index.clone(), &id, commit).await;
            continue;
        }

@@ -139,22 +144,31 @@
    count_docs
}

// 1. Backfill if the document is missing the commit field
// 2. Skip if already indexed and has no failed chunks
fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
    if should_backfill(indexer.clone(), id) {
        return true;
    }
    if indexer.is_indexed(id) && !indexer.has_failed_chunks(id) {
        return false;
    }

    true
}

fn should_backfill(indexer: Arc<Indexer>, id: &str) -> bool {
    // v0.23.0 added the commit field to the code document.
    !indexer.has_attribute_field(id, code::fields::CHUNK_COMMIT)
}
// v0.23.0 added the commit field to the code document;
// documents indexed before that must be backfilled.
async fn backfill_commit_if_needed(indexer: Arc<Indexer>, id: &str, commit: &str) {
    if indexer.has_attribute_field(id, code::fields::CHUNK_COMMIT) {
        return;
    }

    let schema = tabby_common::index::IndexSchema::instance();
    let docs = indexer.get_doc_with_chunks(id).await;
    indexer.delete(id);

    for doc in docs {
        // The root document carries no chunk id; re-add it unchanged.
        if doc.get_first(schema.field_chunk_id).is_none() {
            indexer.add(doc);
            continue;
        }
        // Rebuild the chunk's SourceCode with the new commit, then re-index it.
        indexer.add(create_source_code_with_commit(&doc, commit));
    }
}

fn is_valid_file(file: &SourceCode) -> bool {
@@ -165,6 +179,53 @@ fn is_valid_file(file: &SourceCode) -> bool {
        && file.number_fraction <= MAX_NUMBER_FRACTION
}

fn create_source_code_with_commit(doc: &TantivyDocument, commit: &str) -> SourceCode {
    let schema = tabby_common::index::IndexSchema::instance();
    SourceCode {
        source_file_id: SourceCode::source_file_id_from_id(get_text(doc, schema.field_id))
            .expect("failed to extract source file id from the doc id"),
        source_id: get_text(doc, schema.field_source_id).to_string(),
        git_url: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_GIT_URL).to_string(),
        commit: commit.to_owned(),
        basedir: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_BASEDIR).to_string(),
        filepath: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_FILEPATH).to_string(),
        language: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_LANGUAGE).to_string(),
        max_line_length: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_MAX_LINE_LENGTH) as usize,
        avg_line_length: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_AVG_LINE_LENGTH) as f32,
        alphanum_fraction: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_ALPHANUM_FRACTION) as f32,
        number_fraction: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_NUMBER_FRACTION) as f32,
        num_lines: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_NUM_LINES) as usize,
        tags: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_TAGS).to_string(),
    }
}

fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
    doc.get_first(field).unwrap().as_str().unwrap()
}

fn get_json_number_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> i64 {
    doc.get_first(field)
        .unwrap()
        .as_object()
        .unwrap()
        .find(|(k, _)| *k == name)
        .unwrap()
        .1
        .as_i64()
        .unwrap()
}

fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
    doc.get_first(field)
        .unwrap()
        .as_object()
        .unwrap()
        .find(|(k, _)| *k == name)
        .unwrap()
        .1
        .as_str()
        .unwrap()
}

#[cfg(test)]
mod tests {
    use futures::StreamExt;
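The get_json_text_field/get_json_number_field helpers above read one key out of the JSON attributes field by iterating the stored object's (key, value) pairs. A standalone sketch of that access pattern follows; the field name and document contents are hypothetical, and only the lookup chain mirrors the helpers.

use tantivy::schema::{Schema, Value, STORED};
use tantivy::TantivyDocument;

fn main() {
    // Illustrative schema: a single stored JSON field, playing the role
    // of field_chunk_attributes.
    let mut builder = Schema::builder();
    let attrs = builder.add_json_field("chunk_attributes", STORED);
    let schema = builder.build();

    let doc = TantivyDocument::parse_json(
        &schema,
        r#"{ "chunk_attributes": { "language": "rust", "filepath": "src/lib.rs" } }"#,
    )
    .expect("valid document json");

    // Same access pattern as get_json_text_field: take the field's first
    // value, walk the JSON object's (key, value) pairs, pick an entry by name.
    let language = doc
        .get_first(attrs)
        .unwrap()
        .as_object()
        .unwrap()
        .find(|(k, _)| *k == "language")
        .unwrap()
        .1
        .as_str()
        .unwrap();
    assert_eq!(language, "rust");
}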
25 changes: 25 additions & 0 deletions crates/tabby-index/src/indexer.rs
@@ -197,6 +197,31 @@ impl Indexer {
            .expect("Failed to add document");
    }

    pub async fn get_doc_with_chunks(&self, id: &str) -> Vec<TantivyDocument> {
        let schema = IndexSchema::instance();
        let query = schema.doc_query_with_chunks(&self.corpus, id);
        let docs = match self
            .searcher
            .search(&query, &TopDocs::with_limit(usize::MAX))
        {
            Ok(docs) => docs,
            Err(e) => {
                debug!("query tantivy error: {}", e);
                return vec![];
            }
        };

        docs.iter()
            .filter_map(|doc| match self.searcher.doc(doc.1) {
                Ok(doc) => Some(doc),
                Err(e) => {
                    debug!("Failed to read document: {}", e);
                    None
                }
            })
            .collect::<Vec<_>>()
    }

    pub fn delete(&self, id: &str) {
        let schema = IndexSchema::instance();
        let _ = self
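get_doc_with_chunks follows tantivy's usual two-step read path: the search returns (score, DocAddress) pairs, and each address is then resolved to a stored document, dropping any that fail to load. The production code passes usize::MAX as the collector limit so that every chunk of the document comes back. A compact sketch of the same pattern on a throwaway index; the schema and contents are illustrative.

use tantivy::{
    collector::TopDocs,
    doc,
    query::AllQuery,
    schema::{Schema, STORED, TEXT},
    Index, TantivyDocument,
};

fn main() -> tantivy::Result<()> {
    let mut builder = Schema::builder();
    let body = builder.add_text_field("body", TEXT | STORED);
    let index = Index::create_in_ram(builder.build());

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(body => "root document"))?;
    writer.add_document(doc!(body => "chunk document"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();

    // Step 1: collect matching addresses; step 2: fetch each stored doc,
    // skipping any that fail to load, as get_doc_with_chunks does.
    let hits = searcher.search(&AllQuery, &TopDocs::with_limit(10))?;
    let docs: Vec<TantivyDocument> = hits
        .into_iter()
        .filter_map(|(_score, addr)| searcher.doc(addr).ok())
        .collect();

    assert_eq!(docs.len(), 2);
    Ok(())
}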
