From e5e85e781faaabe4a56683ee454e53b7080efcbc Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Wed, 9 Oct 2024 17:16:53 +0900 Subject: [PATCH] diff: inline contents and ranges vecs up to two sides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This appears to be a bit faster if there are tons of unchanged ranges. ``` group new old ----- --- --- bench_diff_git_git_read_tree_c 1.00 58.5±0.12µs 1.07 62.7±0.60µs bench_diff_lines/modified/10k 1.00 34.2±0.72ms 1.08 37.0±1.09ms bench_diff_lines/modified/1k 1.00 3.1±0.08ms 1.12 3.5±0.01ms bench_diff_lines/reversed/10k 1.00 28.0±0.15ms 1.01 28.4±0.51ms bench_diff_lines/reversed/1k 1.00 616.0±16.20µs 1.00 617.0±9.29µs bench_diff_lines/unchanged/10k 1.00 3.5±0.04ms 1.10 3.9±0.06ms bench_diff_lines/unchanged/1k 1.00 328.4±4.44µs 1.07 352.0±1.41µs ``` --- cli/src/diff_util.rs | 7 ++++--- lib/src/diff.rs | 24 +++++++++++++++--------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/cli/src/diff_util.rs b/cli/src/diff_util.rs index de7035146..7c6b4b46f 100644 --- a/cli/src/diff_util.rs +++ b/cli/src/diff_util.rs @@ -45,6 +45,7 @@ use jj_lib::diff::CompareBytesIgnoreAllWhitespace; use jj_lib::diff::CompareBytesIgnoreWhitespaceAmount; use jj_lib::diff::Diff; use jj_lib::diff::DiffHunk; +use jj_lib::diff::DiffHunkContentVec; use jj_lib::diff::DiffHunkKind; use jj_lib::files::DiffLineHunkSide; use jj_lib::files::DiffLineIterator; @@ -582,7 +583,7 @@ fn show_color_words_diff_hunks( /// Prints `num_after` lines, ellipsis, and `num_before` lines. fn show_color_words_context_lines( formatter: &mut dyn Formatter, - contexts: &[Vec<&BStr>], + contexts: &[DiffHunkContentVec], mut line_number: DiffLineNumber, options: &ColorWordsDiffOptions, num_after: usize, @@ -1288,7 +1289,7 @@ fn unified_diff_hunks<'content>( // Just use the right (i.e. new) content. We could count the // number of skipped lines separately, but the number of the // context lines should match the displayed content. - let [_, right] = hunk.contents.try_into().unwrap(); + let [_, right] = hunk.contents[..].try_into().unwrap(); let mut lines = right.split_inclusive(|b| *b == b'\n').fuse(); if !current_hunk.lines.is_empty() { // The previous hunk line should be either removed/added. @@ -1604,7 +1605,7 @@ fn get_diff_stat( match hunk.kind { DiffHunkKind::Matching => {} DiffHunkKind::Different => { - let [left, right] = hunk.contents.try_into().unwrap(); + let [left, right] = hunk.contents[..].try_into().unwrap(); removed += left.split_inclusive(|b| *b == b'\n').count(); added += right.split_inclusive(|b| *b == b'\n').count(); } diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 18114c943..4903aa2ec 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -26,6 +26,8 @@ use std::slice; use bstr::BStr; use hashbrown::HashTable; use itertools::Itertools; +use smallvec::smallvec; +use smallvec::SmallVec; pub fn find_line_ranges(text: &[u8]) -> Vec> { text.split_inclusive(|b| *b == b'\n') @@ -508,8 +510,9 @@ fn intersect_unchanged_words( #[derive(Clone, PartialEq, Eq, Debug)] struct UnchangedRange { + // Inline up to two sides (base + one other) base: Range, - others: Vec>, + others: SmallVec<[Range; 1]>, } impl UnchangedRange { @@ -538,7 +541,7 @@ impl UnchangedRange { #[derive(Clone, Debug)] pub struct Diff<'input> { base_input: &'input BStr, - other_inputs: Vec<&'input BStr>, + other_inputs: SmallVec<[&'input BStr; 1]>, /// Sorted list of ranges of unchanged regions in bytes. /// /// The list should never be empty. The first and the last region may be @@ -554,7 +557,7 @@ impl<'input> Diff<'input> { ) -> Self { let mut inputs = inputs.into_iter().map(BStr::new); let base_input = inputs.next().expect("inputs must not be empty"); - let other_inputs = inputs.collect_vec(); + let other_inputs: SmallVec<[&BStr; 1]> = inputs.collect(); // First tokenize each input let base_token_ranges: Vec>; let other_token_ranges: Vec>>; @@ -583,7 +586,7 @@ impl<'input> Diff<'input> { fn with_inputs_and_token_ranges( base_input: &'input BStr, - other_inputs: Vec<&'input BStr>, + other_inputs: SmallVec<[&'input BStr; 1]>, base_token_ranges: &[Range], other_token_ranges: &[Vec>], compare: impl CompareBytes, @@ -600,7 +603,7 @@ impl<'input> Diff<'input> { [] => { let whole_range = UnchangedRange { base: 0..base_source.text.len(), - others: vec![], + others: smallvec![], }; vec![whole_range] } @@ -611,7 +614,7 @@ impl<'input> Diff<'input> { // Add an empty range at the start to make life easier for hunks(). unchanged_regions.push(UnchangedRange { base: 0..0, - others: vec![0..0; other_inputs.len()], + others: smallvec![0..0; other_inputs.len()], }); let mut first_positions = Vec::new(); collect_unchanged_words( @@ -795,7 +798,7 @@ impl<'input> Diff<'input> { #[derive(Clone, Debug, Eq, PartialEq)] pub struct DiffHunk<'input> { pub kind: DiffHunkKind, - pub contents: Vec<&'input BStr>, + pub contents: DiffHunkContentVec<'input>, } impl<'input> DiffHunk<'input> { @@ -824,6 +827,9 @@ pub enum DiffHunkKind { Different, } +// Inline up to two sides +pub type DiffHunkContentVec<'input> = SmallVec<[&'input BStr; 2]>; + pub struct DiffHunkIterator<'diff, 'input> { diff: &'diff Diff<'input>, previous: &'diff UnchangedRange, @@ -850,12 +856,12 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> { fn next(&mut self) -> Option { if !self.unchanged_emitted { self.unchanged_emitted = true; - let contents = self.diff.hunk_at(self.previous).collect_vec(); + let contents = self.diff.hunk_at(self.previous).collect(); let kind = DiffHunkKind::Matching; return Some(DiffHunk { kind, contents }); } let current = self.unchanged_iter.next()?; - let contents = self.diff.hunk_between(self.previous, current).collect_vec(); + let contents: DiffHunkContentVec = self.diff.hunk_between(self.previous, current).collect(); debug_assert!( contents.iter().any(|content| !content.is_empty()), "unchanged regions should have been compacted"