From ddf07253c46b3e07c863e0b6068687e9c5cee5e0 Mon Sep 17 00:00:00 2001
From: Marshall Bowers
Date: Wed, 19 Jun 2024 09:50:02 -0400
Subject: [PATCH] assistant: Strip out general website chrome in `/fetch` command (#13264)

This PR updates the `/fetch` command to strip out general website chrome that likely won't contain content on any websites.

Release Notes:

- N/A
---
 .../src/slash_command/fetch_command.rs  |  1 +
 crates/html_to_markdown/src/markdown.rs | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/crates/assistant/src/slash_command/fetch_command.rs b/crates/assistant/src/slash_command/fetch_command.rs
index cc675ecebe..7a8230186f 100644
--- a/crates/assistant/src/slash_command/fetch_command.rs
+++ b/crates/assistant/src/slash_command/fetch_command.rs
@@ -62,6 +62,7 @@ impl FetchSlashCommand {
         match content_type {
             ContentType::Html => {
                 let mut handlers: Vec = vec![
+                    Rc::new(RefCell::new(markdown::WebpageChromeRemover)),
                     Rc::new(RefCell::new(markdown::ParagraphHandler)),
                     Rc::new(RefCell::new(markdown::HeadingHandler)),
                     Rc::new(RefCell::new(markdown::ListHandler)),
diff --git a/crates/html_to_markdown/src/markdown.rs b/crates/html_to_markdown/src/markdown.rs
index 58d4e73f0f..07791e74f7 100644
--- a/crates/html_to_markdown/src/markdown.rs
+++ b/crates/html_to_markdown/src/markdown.rs
@@ -1,6 +1,30 @@
 use crate::html_element::HtmlElement;
 use crate::markdown_writer::{HandleTag, HandlerOutcome, MarkdownWriter, StartTagOutcome};
 
+pub struct WebpageChromeRemover;
+
+impl HandleTag for WebpageChromeRemover {
+    fn should_handle(&self, tag: &str) -> bool {
+        match tag {
+            "head" | "script" | "style" | "nav" => true,
+            _ => false,
+        }
+    }
+
+    fn handle_tag_start(
+        &mut self,
+        tag: &HtmlElement,
+        _writer: &mut MarkdownWriter,
+    ) -> StartTagOutcome {
+        match tag.tag() {
+            "head" | "script" | "style" | "nav" => return StartTagOutcome::Skip,
+            _ => {}
+        }
+
+        StartTagOutcome::Continue
+    }
+}
+
 pub struct ParagraphHandler;
 
 impl HandleTag for ParagraphHandler {