From 36907bb4dc604c2715242d7cedfc04cde7cf60ff Mon Sep 17 00:00:00 2001 From: KCaverly Date: Fri, 30 Jun 2023 16:14:11 -0400 Subject: [PATCH] updated vector store indexing to only use languages with an embedding.scm treesitter query Co-authored-by: maxbrunsfeld --- crates/language/src/language.rs | 44 +++++++++++++++++++ crates/vector_store/src/vector_store.rs | 22 +++++++--- crates/vector_store/src/vector_store_tests.rs | 2 +- crates/zed/src/languages.rs | 1 + crates/zed/src/languages/rust/embedding.scm | 36 +++++++++++++++ 5 files changed, 98 insertions(+), 7 deletions(-) create mode 100644 crates/zed/src/languages/rust/embedding.scm diff --git a/crates/language/src/language.rs b/crates/language/src/language.rs index b880cbc8d7..4ef9d25894 100644 --- a/crates/language/src/language.rs +++ b/crates/language/src/language.rs @@ -350,6 +350,7 @@ pub struct LanguageQueries { pub brackets: Option>, pub indents: Option>, pub outline: Option>, + pub embedding: Option>, pub injections: Option>, pub overrides: Option>, } @@ -495,6 +496,7 @@ pub struct Grammar { pub(crate) brackets_config: Option, pub(crate) indents_config: Option, pub outline_config: Option, + pub embedding_config: Option, pub(crate) injection_config: Option, pub(crate) override_config: Option, pub(crate) highlight_map: Mutex, @@ -516,6 +518,15 @@ pub struct OutlineConfig { pub extra_context_capture_ix: Option, } +#[derive(Debug)] +pub struct EmbeddingConfig { + pub query: Query, + pub item_capture_ix: u32, + pub name_capture_ix: u32, + pub context_capture_ix: Option, + pub extra_context_capture_ix: Option, +} + struct InjectionConfig { query: Query, content_capture_ix: u32, @@ -1145,6 +1156,7 @@ impl Language { highlights_query: None, brackets_config: None, outline_config: None, + embedding_config: None, indents_config: None, injection_config: None, override_config: None, @@ -1181,6 +1193,9 @@ impl Language { if let Some(query) = queries.outline { self = self.with_outline_query(query.as_ref())?; } + if let Some(query) = queries.embedding { + self = self.with_embedding_query(query.as_ref())?; + } if let Some(query) = queries.injections { self = self.with_injection_query(query.as_ref())?; } @@ -1189,6 +1204,7 @@ impl Language { } Ok(self) } + pub fn with_highlights_query(mut self, source: &str) -> Result { let grammar = self.grammar_mut(); grammar.highlights_query = Some(Query::new(grammar.ts_language, source)?); @@ -1223,6 +1239,34 @@ impl Language { Ok(self) } + pub fn with_embedding_query(mut self, source: &str) -> Result { + let grammar = self.grammar_mut(); + let query = Query::new(grammar.ts_language, source)?; + let mut item_capture_ix = None; + let mut name_capture_ix = None; + let mut context_capture_ix = None; + let mut extra_context_capture_ix = None; + get_capture_indices( + &query, + &mut [ + ("item", &mut item_capture_ix), + ("name", &mut name_capture_ix), + ("context", &mut context_capture_ix), + ("context.extra", &mut extra_context_capture_ix), + ], + ); + if let Some((item_capture_ix, name_capture_ix)) = item_capture_ix.zip(name_capture_ix) { + grammar.embedding_config = Some(EmbeddingConfig { + query, + item_capture_ix, + name_capture_ix, + context_capture_ix, + extra_context_capture_ix, + }); + } + Ok(self) + } + pub fn with_brackets_query(mut self, source: &str) -> Result { let grammar = self.grammar_mut(); let query = Query::new(grammar.ts_language, source)?; diff --git a/crates/vector_store/src/vector_store.rs b/crates/vector_store/src/vector_store.rs index 876a6018b8..35a467b82f 100644 --- a/crates/vector_store/src/vector_store.rs +++ b/crates/vector_store/src/vector_store.rs @@ -136,8 +136,8 @@ impl VectorStore { content: String, ) -> Result { let grammar = language.grammar().ok_or_else(|| anyhow!("no grammar"))?; - let outline_config = grammar - .outline_config + let embedding_config = grammar + .embedding_config .as_ref() .ok_or_else(|| anyhow!("no outline query"))?; @@ -148,13 +148,17 @@ impl VectorStore { let mut documents = Vec::new(); let mut context_spans = Vec::new(); - for mat in cursor.matches(&outline_config.query, tree.root_node(), content.as_bytes()) { + for mat in cursor.matches( + &embedding_config.query, + tree.root_node(), + content.as_bytes(), + ) { let mut item_range = None; let mut name_range = None; for capture in mat.captures { - if capture.index == outline_config.item_capture_ix { + if capture.index == embedding_config.item_capture_ix { item_range = Some(capture.node.byte_range()); - } else if capture.index == outline_config.name_capture_ix { + } else if capture.index == embedding_config.name_capture_ix { name_range = Some(capture.node.byte_range()); } } @@ -266,7 +270,11 @@ impl VectorStore { .language_for_file(&absolute_path, None) .await { - if language.name().as_ref() != "Rust" { + if language + .grammar() + .and_then(|grammar| grammar.embedding_config.as_ref()) + .is_none() + { continue; } @@ -359,6 +367,8 @@ impl VectorStore { this.worktree_db_ids.extend(worktree_db_ids); }); + log::info!("Semantic Indexing Complete!"); + anyhow::Ok(()) }) } diff --git a/crates/vector_store/src/vector_store_tests.rs b/crates/vector_store/src/vector_store_tests.rs index e232ba9f21..78470ad4be 100644 --- a/crates/vector_store/src/vector_store_tests.rs +++ b/crates/vector_store/src/vector_store_tests.rs @@ -46,7 +46,7 @@ async fn test_vector_store(cx: &mut TestAppContext) { }, Some(tree_sitter_rust::language()), ) - .with_outline_query( + .with_embedding_query( r#" (function_item name: (identifier) @name diff --git a/crates/zed/src/languages.rs b/crates/zed/src/languages.rs index 44e144e89b..820f564151 100644 --- a/crates/zed/src/languages.rs +++ b/crates/zed/src/languages.rs @@ -170,6 +170,7 @@ fn load_queries(name: &str) -> LanguageQueries { brackets: load_query(name, "/brackets"), indents: load_query(name, "/indents"), outline: load_query(name, "/outline"), + embedding: load_query(name, "/embedding"), injections: load_query(name, "/injections"), overrides: load_query(name, "/overrides"), } diff --git a/crates/zed/src/languages/rust/embedding.scm b/crates/zed/src/languages/rust/embedding.scm new file mode 100644 index 0000000000..ea8bab9f68 --- /dev/null +++ b/crates/zed/src/languages/rust/embedding.scm @@ -0,0 +1,36 @@ +(struct_item + (visibility_modifier)? @context + "struct" @context + name: (_) @name) @item + +(enum_item + (visibility_modifier)? @context + "enum" @context + name: (_) @name) @item + +(impl_item + "impl" @context + trait: (_)? @name + "for"? @context + type: (_) @name) @item + +(trait_item + (visibility_modifier)? @context + "trait" @context + name: (_) @name) @item + +(function_item + (visibility_modifier)? @context + (function_modifiers)? @context + "fn" @context + name: (_) @name) @item + +(function_signature_item + (visibility_modifier)? @context + (function_modifiers)? @context + "fn" @context + name: (_) @name) @item + +(macro_definition + . "macro_rules!" @context + name: (_) @name) @item