From e221f23018d9b9883327712874b1237725922a0a Mon Sep 17 00:00:00 2001 From: KCaverly Date: Tue, 1 Aug 2023 10:30:34 -0400 Subject: [PATCH] add support for markdown files to semantic search --- crates/semantic_index/src/parsing.rs | 16 ++++++++++++++++ crates/semantic_index/src/semantic_index.rs | 1 + 2 files changed, 17 insertions(+) diff --git a/crates/semantic_index/src/parsing.rs b/crates/semantic_index/src/parsing.rs index 643db8c798..cef23862c5 100644 --- a/crates/semantic_index/src/parsing.rs +++ b/crates/semantic_index/src/parsing.rs @@ -21,6 +21,7 @@ const CODE_CONTEXT_TEMPLATE: &str = "The below code snippet is from file ''\n\n```\n\n```"; const ENTIRE_FILE_TEMPLATE: &str = "The below snippet is from file ''\n\n```\n\n```"; +const MARKDOWN_CONTEXT_TEMPLATE: &str = "The below file contents is from file ''\n\n"; pub const PARSEABLE_ENTIRE_FILE_TYPES: &[&str] = &["TOML", "YAML", "CSS", "HEEX", "ERB", "SVELTE", "HTML"]; @@ -70,6 +71,19 @@ impl CodeContextRetriever { }]) } + fn parse_markdown_file(&self, relative_path: &Path, content: &str) -> Result> { + let document_span = MARKDOWN_CONTEXT_TEMPLATE + .replace("", relative_path.to_string_lossy().as_ref()) + .replace("", &content); + + Ok(vec![Document { + range: 0..content.len(), + content: document_span, + embedding: Vec::new(), + name: "Markdown".to_string(), + }]) + } + fn get_matches_in_file( &mut self, content: &str, @@ -136,6 +150,8 @@ impl CodeContextRetriever { if PARSEABLE_ENTIRE_FILE_TYPES.contains(&language_name.as_ref()) { return self.parse_entire_file(relative_path, language_name, &content); + } else if &language_name.to_string() == &"Markdown".to_string() { + return self.parse_markdown_file(relative_path, &content); } let mut documents = self.parse_file(content, language)?; diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index bd114de216..23c75f4014 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -613,6 +613,7 @@ impl SemanticIndex { .await { if !PARSEABLE_ENTIRE_FILE_TYPES.contains(&language.name().as_ref()) + && &language.name().as_ref() != &"Markdown" && language .grammar() .and_then(|grammar| grammar.embedding_config.as_ref())