diff --git a/crates/assistant/src/slash_command/fetch_command.rs b/crates/assistant/src/slash_command/fetch_command.rs
index cc675ecebe..7a8230186f 100644
--- a/crates/assistant/src/slash_command/fetch_command.rs
+++ b/crates/assistant/src/slash_command/fetch_command.rs
@@ -62,6 +62,7 @@ impl FetchSlashCommand {
         match content_type {
             ContentType::Html => {
                 let mut handlers: Vec = vec![
+                    Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
                     Rc::new(RefCell::new(markdown::ParagraphHandler)),
                     Rc::new(RefCell::new(markdown::HeadingHandler)),
                     Rc::new(RefCell::new(markdown::ListHandler)),
diff --git a/crates/html_to_markdown/src/markdown.rs b/crates/html_to_markdown/src/markdown.rs
index 58d4e73f0f..07791e74f7 100644
--- a/crates/html_to_markdown/src/markdown.rs
+++ b/crates/html_to_markdown/src/markdown.rs
@@ -1,6 +1,42 @@
 use crate::html_element::HtmlElement;
 use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
 
+/// Handler that drops webpage "chrome" (non-content elements) from the output.
+///
+/// Reports [`StartTagOutcome::Skip`] for the start of `<head>`, `<script>`,
+/// `<style>`, and `<nav>` elements; presumably the writer then omits the element
+/// and its contents (confirm against `MarkdownWriter`).
+pub struct WebpageChromeRemover;
+
+impl WebpageChromeRemover {
+    /// Returns `true` when `tag` names an element this handler skips.
+    ///
+    /// Single source of truth for the tag list, so `should_handle` and
+    /// `handle_tag_start` cannot drift apart.
+    fn is_chrome(tag: &str) -> bool {
+        matches!(tag, "head" | "script" | "style" | "nav")
+    }
+}
+
+impl HandleTag for WebpageChromeRemover {
+    fn should_handle(&self, tag: &str) -> bool {
+        Self::is_chrome(tag)
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        // Re-check the tag here rather than assuming `should_handle` filtered it.
+        if Self::is_chrome(tag.tag()) {
+            StartTagOutcome::Skip
+        } else {
+            StartTagOutcome::Continue
+        }
+    }
+}
+
 pub struct ParagraphHandler;
 
 impl HandleTag for ParagraphHandler {