chore: backfill sourceCode without redo calculate
Signed-off-by: Wei Zhang <[email protected]>
zwpaper committed Dec 18, 2024
1 parent b754142 commit 8068290
Showing 3 changed files with 110 additions and 24 deletions.
30 changes: 15 additions & 15 deletions crates/tabby-common/src/index/mod.rs
@@ -178,6 +178,21 @@ impl IndexSchema {
        ])
    }

    /// Build a query to find the document with the given `doc_id`, including its chunks.
    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
        let doc_id_query = TermQuery::new(
            Term::from_field_text(self.field_id, doc_id),
            tantivy::schema::IndexRecordOption::Basic,
        );

        BooleanQuery::new(vec![
            // Must match the corpus
            (Occur::Must, self.corpus_query(corpus)),
            // Must match the doc id
            (Occur::Must, Box::new(doc_id_query)),
        ])
    }

    pub fn doc_indexed_after(
        &self,
        corpus: &str,
@@ -264,21 +279,6 @@ impl IndexSchema {
        ])
    }

    /// Build a query to find the document with the given `doc_id`, including its chunks.
    pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query {
        let doc_id_query = TermQuery::new(
            Term::from_field_text(self.field_id, doc_id),
            tantivy::schema::IndexRecordOption::Basic,
        );

        BooleanQuery::new(vec![
            // Must match the corpus
            (Occur::Must, self.corpus_query(corpus)),
            // Must match the doc id
            (Occur::Must, Box::new(doc_id_query)),
        ])
    }

    pub fn corpus_query(&self, corpus: &str) -> Box<dyn Query> {
        Box::new(TermQuery::new(
            Term::from_field_text(self.field_corpus, corpus),
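For context on the moved helper: doc_query_with_chunks is a plain conjunction of two term queries with no constraint on document kind, so the root document and all of its chunks (which share the doc's field_id) match. Below is a minimal, self-contained sketch of the same query shape against a throwaway tantivy index; the corpus/id schema and the data are illustrative stand-ins, not Tabby's actual schema.

use tantivy::{
    collector::TopDocs,
    doc,
    query::{BooleanQuery, Occur, Query, TermQuery},
    schema::{IndexRecordOption, Schema, STRING},
    Index, Term,
};

fn main() -> tantivy::Result<()> {
    // Illustrative schema: raw (untokenized) corpus and id fields.
    let mut builder = Schema::builder();
    let corpus = builder.add_text_field("corpus", STRING);
    let id = builder.add_text_field("id", STRING);
    let index = Index::create_in_ram(builder.build());

    let mut writer = index.writer(50_000_000)?;
    // Stand-ins for a root document and two chunk documents sharing one id.
    for _ in 0..3 {
        writer.add_document(doc!(corpus => "code", id => "doc-1"))?;
    }
    writer.commit()?;

    // Same shape as doc_query_with_chunks: corpus AND doc id, nothing else.
    let query = BooleanQuery::new(vec![
        (
            Occur::Must,
            Box::new(TermQuery::new(
                Term::from_field_text(corpus, "code"),
                IndexRecordOption::Basic,
            )) as Box<dyn Query>,
        ),
        (
            Occur::Must,
            Box::new(TermQuery::new(
                Term::from_field_text(id, "doc-1"),
                IndexRecordOption::Basic,
            )),
        ),
    ]);

    let searcher = index.reader()?.searcher();
    let hits = searcher.search(&query, &TopDocs::with_limit(10))?;
    assert_eq!(hits.len(), 3); // the root doc and both chunks all match
    Ok(())
}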
79 changes: 70 additions & 9 deletions crates/tabby-index/src/code/index.rs
@@ -3,9 +3,11 @@ use std::{pin::pin, sync::Arc};
use async_stream::stream;
use futures::StreamExt;
use ignore::{DirEntry, Walk};
use tabby_common::index::{code, corpus};
use tabby_common::index::{self, code, corpus};
use tabby_inference::Embedding;
use tracing::warn;
use tantivy::{schema, TantivyDocument};

use super::{
    create_code_builder,
@@ -101,7 +103,10 @@ async fn add_changed_documents(

        let id = SourceCode::to_index_id(&repository.source_id, &key).id;

        // Skip if the document is already indexed and has no failed chunks;
        // when skipping, check whether it still needs the commit backfill.
        if !require_updates(cloned_index.clone(), &id) {
            backfill_commit_if_needed(cloned_index.clone(), &id, commit).await;
            continue;
        }

@@ -139,22 +144,31 @@
    count_docs
}

// 1. Backfill if the document is missing the commit field
// 2. Skip if already indexed and has no failed chunks
fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool {
    if should_backfill(indexer.clone(), id) {
        return true;
    }
    if indexer.is_indexed(id) && !indexer.has_failed_chunks(id) {
        return false;
    }

    true
}

fn should_backfill(indexer: Arc<Indexer>, id: &str) -> bool {
    // v0.23.0 added the commit field to the code document.
    !indexer.has_attribute_field(id, code::fields::CHUNK_COMMIT)
}
// v0.23.0 added the commit field to the code document;
// documents indexed before that must be backfilled.
async fn backfill_commit_if_needed(indexer: Arc<Indexer>, id: &str, commit: &str) {
    if indexer.has_attribute_field(id, code::fields::CHUNK_COMMIT) {
        return;
    }

    let schema = tabby_common::index::IndexSchema::instance();
    let docs = indexer.get_doc_with_chunks(id).await;
    indexer.delete(id);

    for doc in docs {
        // The root document carries no chunk id; re-add it unchanged.
        if doc.get_first(schema.field_chunk_id).is_none() {
            indexer.add(doc);
            continue;
        }
        // Rebuild the chunk's SourceCode with the new commit, then re-index it.
        indexer.add(create_source_code_with_commit(&doc, commit));
    }
}

fn is_valid_file(file: &SourceCode) -> bool {
@@ -165,6 +179,53 @@ fn is_valid_file(file: &SourceCode) -> bool {
        && file.number_fraction <= MAX_NUMBER_FRACTION
}

fn create_source_code_with_commit(doc: &TantivyDocument, commit: &str) -> SourceCode {
    let schema = tabby_common::index::IndexSchema::instance();
    SourceCode {
        source_file_id: SourceCode::source_file_id_from_id(get_text(doc, schema.field_id))
            .expect("failed to extract source file id from the doc id"),
        source_id: get_text(doc, schema.field_source_id).to_string(),
        git_url: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_GIT_URL).to_string(),
        commit: commit.to_owned(),
        basedir: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_BASEDIR).to_string(),
        filepath: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_FILEPATH).to_string(),
        language: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_LANGUAGE).to_string(),
        max_line_length: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_MAX_LINE_LENGTH) as usize,
        avg_line_length: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_AVG_LINE_LENGTH) as f32,
        alphanum_fraction: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_ALPHANUM_FRACTION) as f32,
        number_fraction: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_NUMBER_FRACTION) as f32,
        num_lines: get_json_number_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_NUM_LINES) as usize,
        tags: get_json_text_field(doc, schema.field_chunk_attributes, code::fields::CHUNK_TAGS).to_string(),
    }
}

fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str {
    doc.get_first(field).unwrap().as_str().unwrap()
}

fn get_json_number_field(doc: &TantivyDocument, field: schema::Field, name: &str) -> i64 {
    doc.get_first(field)
        .unwrap()
        .as_object()
        .unwrap()
        .find(|(k, _)| *k == name)
        .unwrap()
        .1
        .as_i64()
        .unwrap()
}

fn get_json_text_field<'a>(doc: &'a TantivyDocument, field: schema::Field, name: &str) -> &'a str {
    doc.get_first(field)
        .unwrap()
        .as_object()
        .unwrap()
        .find(|(k, _)| *k == name)
        .unwrap()
        .1
        .as_str()
        .unwrap()
}

#[cfg(test)]
mod tests {
    use futures::StreamExt;
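The get_json_text_field/get_json_number_field helpers above read one key out of the JSON attributes field by iterating the stored object's (key, value) pairs. A standalone sketch of that access pattern follows; the field name and document contents are hypothetical, and only the lookup chain mirrors the helpers.

use tantivy::schema::{Schema, Value, STORED};
use tantivy::TantivyDocument;

fn main() {
    // Illustrative schema: a single stored JSON field, playing the role
    // of field_chunk_attributes.
    let mut builder = Schema::builder();
    let attrs = builder.add_json_field("chunk_attributes", STORED);
    let schema = builder.build();

    let doc = TantivyDocument::parse_json(
        &schema,
        r#"{ "chunk_attributes": { "language": "rust", "filepath": "src/lib.rs" } }"#,
    )
    .expect("valid document json");

    // Same access pattern as get_json_text_field: take the field's first
    // value, walk the JSON object's (key, value) pairs, pick an entry by name.
    let language = doc
        .get_first(attrs)
        .unwrap()
        .as_object()
        .unwrap()
        .find(|(k, _)| *k == "language")
        .unwrap()
        .1
        .as_str()
        .unwrap();
    assert_eq!(language, "rust");
}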
25 changes: 25 additions & 0 deletions crates/tabby-index/src/indexer.rs
@@ -197,6 +197,31 @@ impl Indexer {
            .expect("Failed to add document");
    }

    pub async fn get_doc_with_chunks(&self, id: &str) -> Vec<TantivyDocument> {
        let schema = IndexSchema::instance();
        let query = schema.doc_query_with_chunks(&self.corpus, id);
        let docs = match self
            .searcher
            .search(&query, &TopDocs::with_limit(usize::MAX))
        {
            Ok(docs) => docs,
            Err(e) => {
                debug!("query tantivy error: {}", e);
                return vec![];
            }
        };

        docs.iter()
            .filter_map(|doc| match self.searcher.doc(doc.1) {
                Ok(doc) => Some(doc),
                Err(e) => {
                    debug!("Failed to read document: {}", e);
                    None
                }
            })
            .collect::<Vec<_>>()
    }

    pub fn delete(&self, id: &str) {
        let schema = IndexSchema::instance();
        let _ = self
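get_doc_with_chunks follows tantivy's usual two-step read path: the search returns (score, DocAddress) pairs, and each address is then resolved to a stored document, dropping any that fail to load. The production code passes usize::MAX as the collector limit so that every chunk of the document comes back. A compact sketch of the same pattern on a throwaway index; the schema and contents are illustrative.

use tantivy::{
    collector::TopDocs,
    doc,
    query::AllQuery,
    schema::{Schema, STORED, TEXT},
    Index, TantivyDocument,
};

fn main() -> tantivy::Result<()> {
    let mut builder = Schema::builder();
    let body = builder.add_text_field("body", TEXT | STORED);
    let index = Index::create_in_ram(builder.build());

    let mut writer = index.writer(50_000_000)?;
    writer.add_document(doc!(body => "root document"))?;
    writer.add_document(doc!(body => "chunk document"))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();

    // Step 1: collect matching addresses; step 2: fetch each stored doc,
    // skipping any that fail to load, as get_doc_with_chunks does.
    let hits = searcher.search(&AllQuery, &TopDocs::with_limit(10))?;
    let docs: Vec<TantivyDocument> = hits
        .into_iter()
        .filter_map(|(_score, addr)| searcher.doc(addr).ok())
        .collect();

    assert_eq!(docs.len(), 2);
    Ok(())
}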
