From 0dd000d2366bf4c8ad7f290ef2250a6be544739c Mon Sep 17 00:00:00 2001
From: Martin von Zweigbergk
Date: Wed, 7 Apr 2021 10:07:37 -0700
Subject: [PATCH] diff: do final refinement at byte-level for non-word bytes

This results in significantly more readable diffs on commits like
659393bec219 in this repo.

Before:

test bench_diff_10k_lines_reversed ... bench: 38,122,998 ns/iter (+/- 557,688)
test bench_diff_10k_modified_lines ... bench: 32,556,563 ns/iter (+/- 548,114)
test bench_diff_10k_unchanged_lines ... bench: 4,231 ns/iter (+/- 15)
test bench_diff_1k_lines_reversed ... bench: 958,296 ns/iter (+/- 46,963)
test bench_diff_1k_modified_lines ... bench: 3,014,723 ns/iter (+/- 15,830)
test bench_diff_1k_unchanged_lines ... bench: 249 ns/iter (+/- 2)
test bench_diff_git_git_read_tree_c ... bench: 78,599 ns/iter (+/- 1,079)

After:

test bench_diff_10k_lines_reversed ... bench: 38,289,493 ns/iter (+/- 413,712)
test bench_diff_10k_modified_lines ... bench: 37,352,516 ns/iter (+/- 1,293,950)
test bench_diff_10k_unchanged_lines ... bench: 4,238 ns/iter (+/- 13)
test bench_diff_1k_lines_reversed ... bench: 967,253 ns/iter (+/- 8,506)
test bench_diff_1k_modified_lines ... bench: 3,358,028 ns/iter (+/- 37,154)
test bench_diff_1k_unchanged_lines ... bench: 233 ns/iter (+/- 1)
test bench_diff_git_git_read_tree_c ... bench: 95,787 ns/iter (+/- 740)

So the biggest slowdown is when there are modified lines.
---
 lib/src/diff.rs | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/lib/src/diff.rs b/lib/src/diff.rs
index c114164ef..f297e6ebf 100644
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -37,18 +37,21 @@ pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
     ranges
 }
 
+fn is_word_byte(b: u8) -> bool {
+    // TODO: Make this configurable (probably higher up in the call stack)
+    matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_')
+}
+
 pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
     let mut word_ranges = vec![];
     let mut word_start_pos = 0;
     let mut in_word = false;
     for (i, b) in text.iter().enumerate() {
-        // TODO: Make this configurable (probably higher up in the call stack)
-        let is_word_byte = matches!(*b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_');
-        if in_word && !is_word_byte {
+        if in_word && !is_word_byte(*b) {
             in_word = false;
             word_ranges.push(word_start_pos..i);
             word_start_pos = i;
-        } else if !in_word && is_word_byte {
+        } else if !in_word && is_word_byte(*b) {
             in_word = true;
             word_start_pos = i;
         }
@@ -59,10 +62,10 @@ pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
     word_ranges
 }
 
-pub fn find_newline_ranges(text: &[u8]) -> Vec<Range<usize>> {
+pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
     let mut ranges = vec![];
     for (i, b) in text.iter().enumerate() {
-        if *b == b'\n' {
+        if !is_word_byte(*b) {
             ranges.push(i..i + 1);
         }
     }
@@ -472,7 +475,7 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<SliceDiff<'a>> {
     let range_diffs = vec![RangeDiff::Replaced(0..left.len(), 0..right.len())];
     let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_line_ranges);
     let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_word_ranges);
-    let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_newline_ranges);
+    let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_nonword_ranges);
     range_diffs_to_slice_diffs(left, right, &range_diffs)
 }