diff: inline contents and ranges vecs up to two sides

This appears to be a bit faster if there are tons of unchanged ranges.

```
group                             new                     old
-----                             ---                     ---
bench_diff_git_git_read_tree_c    1.00     58.5±0.12µs    1.07     62.7±0.60µs
bench_diff_lines/modified/10k     1.00     34.2±0.72ms    1.08     37.0±1.09ms
bench_diff_lines/modified/1k      1.00      3.1±0.08ms    1.12      3.5±0.01ms
bench_diff_lines/reversed/10k     1.00     28.0±0.15ms    1.01     28.4±0.51ms
bench_diff_lines/reversed/1k      1.00   616.0±16.20µs    1.00    617.0±9.29µs
bench_diff_lines/unchanged/10k    1.00      3.5±0.04ms    1.10      3.9±0.06ms
bench_diff_lines/unchanged/1k     1.00    328.4±4.44µs    1.07    352.0±1.41µs
```
This commit is contained in:
Yuya Nishihara 2024-10-09 17:16:53 +09:00
parent 92103f4db2
commit e5e85e781f
2 changed files with 19 additions and 12 deletions
cli/src
lib/src

View file

@ -45,6 +45,7 @@ use jj_lib::diff::CompareBytesIgnoreAllWhitespace;
use jj_lib::diff::CompareBytesIgnoreWhitespaceAmount;
use jj_lib::diff::Diff;
use jj_lib::diff::DiffHunk;
use jj_lib::diff::DiffHunkContentVec;
use jj_lib::diff::DiffHunkKind;
use jj_lib::files::DiffLineHunkSide;
use jj_lib::files::DiffLineIterator;
@ -582,7 +583,7 @@ fn show_color_words_diff_hunks(
/// Prints `num_after` lines, ellipsis, and `num_before` lines.
fn show_color_words_context_lines(
formatter: &mut dyn Formatter,
contexts: &[Vec<&BStr>],
contexts: &[DiffHunkContentVec],
mut line_number: DiffLineNumber,
options: &ColorWordsDiffOptions,
num_after: usize,
@ -1288,7 +1289,7 @@ fn unified_diff_hunks<'content>(
// Just use the right (i.e. new) content. We could count the
// number of skipped lines separately, but the number of the
// context lines should match the displayed content.
let [_, right] = hunk.contents.try_into().unwrap();
let [_, right] = hunk.contents[..].try_into().unwrap();
let mut lines = right.split_inclusive(|b| *b == b'\n').fuse();
if !current_hunk.lines.is_empty() {
// The previous hunk line should be either removed/added.
@ -1604,7 +1605,7 @@ fn get_diff_stat(
match hunk.kind {
DiffHunkKind::Matching => {}
DiffHunkKind::Different => {
let [left, right] = hunk.contents.try_into().unwrap();
let [left, right] = hunk.contents[..].try_into().unwrap();
removed += left.split_inclusive(|b| *b == b'\n').count();
added += right.split_inclusive(|b| *b == b'\n').count();
}

View file

@ -26,6 +26,8 @@ use std::slice;
use bstr::BStr;
use hashbrown::HashTable;
use itertools::Itertools;
use smallvec::smallvec;
use smallvec::SmallVec;
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
text.split_inclusive(|b| *b == b'\n')
@ -508,8 +510,9 @@ fn intersect_unchanged_words(
#[derive(Clone, PartialEq, Eq, Debug)]
struct UnchangedRange {
// Inline up to two sides (base + one other)
base: Range<usize>,
others: Vec<Range<usize>>,
others: SmallVec<[Range<usize>; 1]>,
}
impl UnchangedRange {
@ -538,7 +541,7 @@ impl UnchangedRange {
#[derive(Clone, Debug)]
pub struct Diff<'input> {
base_input: &'input BStr,
other_inputs: Vec<&'input BStr>,
other_inputs: SmallVec<[&'input BStr; 1]>,
/// Sorted list of ranges of unchanged regions in bytes.
///
/// The list should never be empty. The first and the last region may be
@ -554,7 +557,7 @@ impl<'input> Diff<'input> {
) -> Self {
let mut inputs = inputs.into_iter().map(BStr::new);
let base_input = inputs.next().expect("inputs must not be empty");
let other_inputs = inputs.collect_vec();
let other_inputs: SmallVec<[&BStr; 1]> = inputs.collect();
// First tokenize each input
let base_token_ranges: Vec<Range<usize>>;
let other_token_ranges: Vec<Vec<Range<usize>>>;
@ -583,7 +586,7 @@ impl<'input> Diff<'input> {
fn with_inputs_and_token_ranges(
base_input: &'input BStr,
other_inputs: Vec<&'input BStr>,
other_inputs: SmallVec<[&'input BStr; 1]>,
base_token_ranges: &[Range<usize>],
other_token_ranges: &[Vec<Range<usize>>],
compare: impl CompareBytes,
@ -600,7 +603,7 @@ impl<'input> Diff<'input> {
[] => {
let whole_range = UnchangedRange {
base: 0..base_source.text.len(),
others: vec![],
others: smallvec![],
};
vec![whole_range]
}
@ -611,7 +614,7 @@ impl<'input> Diff<'input> {
// Add an empty range at the start to make life easier for hunks().
unchanged_regions.push(UnchangedRange {
base: 0..0,
others: vec![0..0; other_inputs.len()],
others: smallvec![0..0; other_inputs.len()],
});
let mut first_positions = Vec::new();
collect_unchanged_words(
@ -795,7 +798,7 @@ impl<'input> Diff<'input> {
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct DiffHunk<'input> {
pub kind: DiffHunkKind,
pub contents: Vec<&'input BStr>,
pub contents: DiffHunkContentVec<'input>,
}
impl<'input> DiffHunk<'input> {
@ -824,6 +827,9 @@ pub enum DiffHunkKind {
Different,
}
// Inline up to two sides
pub type DiffHunkContentVec<'input> = SmallVec<[&'input BStr; 2]>;
pub struct DiffHunkIterator<'diff, 'input> {
diff: &'diff Diff<'input>,
previous: &'diff UnchangedRange,
@ -850,12 +856,12 @@ impl<'diff, 'input> Iterator for DiffHunkIterator<'diff, 'input> {
fn next(&mut self) -> Option<Self::Item> {
if !self.unchanged_emitted {
self.unchanged_emitted = true;
let contents = self.diff.hunk_at(self.previous).collect_vec();
let contents = self.diff.hunk_at(self.previous).collect();
let kind = DiffHunkKind::Matching;
return Some(DiffHunk { kind, contents });
}
let current = self.unchanged_iter.next()?;
let contents = self.diff.hunk_between(self.previous, current).collect_vec();
let contents: DiffHunkContentVec = self.diff.hunk_between(self.previous, current).collect();
debug_assert!(
contents.iter().any(|content| !content.is_empty()),
"unchanged regions should have been compacted"