diff: do final refinement at byte-level for non-word bytes

This results in significantly more readable diffs on commits like
659393bec2 in this repo.


Before:
test bench_diff_10k_lines_reversed  ... bench:  38,122,998 ns/iter (+/- 557,688)
test bench_diff_10k_modified_lines  ... bench:  32,556,563 ns/iter (+/- 548,114)
test bench_diff_10k_unchanged_lines ... bench:       4,231 ns/iter (+/- 15)
test bench_diff_1k_lines_reversed   ... bench:     958,296 ns/iter (+/- 46,963)
test bench_diff_1k_modified_lines   ... bench:   3,014,723 ns/iter (+/- 15,830)
test bench_diff_1k_unchanged_lines  ... bench:         249 ns/iter (+/- 2)
test bench_diff_git_git_read_tree_c ... bench:      78,599 ns/iter (+/- 1,079)

After:
test bench_diff_10k_lines_reversed  ... bench:  38,289,493 ns/iter (+/- 413,712)
test bench_diff_10k_modified_lines  ... bench:  37,352,516 ns/iter (+/- 1,293,950)
test bench_diff_10k_unchanged_lines ... bench:       4,238 ns/iter (+/- 13)
test bench_diff_1k_lines_reversed   ... bench:     967,253 ns/iter (+/- 8,506)
test bench_diff_1k_modified_lines   ... bench:   3,358,028 ns/iter (+/- 37,154)
test bench_diff_1k_unchanged_lines  ... bench:         233 ns/iter (+/- 1)
test bench_diff_git_git_read_tree_c ... bench:      95,787 ns/iter (+/- 740)


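So the biggest slowdown is when there are modified lines: roughly 11-15% on the synthetic modified-lines benchmarks and about 22% on the read-tree.c diff, while the unchanged and reversed cases stay within noise.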
This commit is contained in:
Martin von Zweigbergk 2021-04-07 10:07:37 -07:00
parent f634ff0e3f
commit 0dd000d236

View file

@@ -37,18 +37,21 @@ pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
ranges ranges
} }
/// Reports whether `b` counts as part of a word: an ASCII letter, an ASCII
/// digit, or an underscore. Every other byte is treated as a non-word byte.
fn is_word_byte(b: u8) -> bool {
    // TODO: Make this configurable (probably higher up in the call stack)
    b == b'_' || b.is_ascii_alphanumeric()
}
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> { pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
let mut word_ranges = vec![]; let mut word_ranges = vec![];
let mut word_start_pos = 0; let mut word_start_pos = 0;
let mut in_word = false; let mut in_word = false;
for (i, b) in text.iter().enumerate() { for (i, b) in text.iter().enumerate() {
// TODO: Make this configurable (probably higher up in the call stack) if in_word && !is_word_byte(*b) {
let is_word_byte = matches!(*b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_');
if in_word && !is_word_byte {
in_word = false; in_word = false;
word_ranges.push(word_start_pos..i); word_ranges.push(word_start_pos..i);
word_start_pos = i; word_start_pos = i;
} else if !in_word && is_word_byte { } else if !in_word && is_word_byte(*b) {
in_word = true; in_word = true;
word_start_pos = i; word_start_pos = i;
} }
@@ -59,10 +62,10 @@ pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
word_ranges word_ranges
} }
pub fn find_newline_ranges(text: &[u8]) -> Vec<Range<usize>> { pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
let mut ranges = vec![]; let mut ranges = vec![];
for (i, b) in text.iter().enumerate() { for (i, b) in text.iter().enumerate() {
if *b == b'\n' { if !is_word_byte(*b) {
ranges.push(i..i + 1); ranges.push(i..i + 1);
} }
} }
@@ -472,7 +475,7 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<SliceDiff<'a>> {
let range_diffs = vec![RangeDiff::Replaced(0..left.len(), 0..right.len())]; let range_diffs = vec![RangeDiff::Replaced(0..left.len(), 0..right.len())];
let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_line_ranges); let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_line_ranges);
let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_word_ranges); let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_word_ranges);
let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_newline_ranges); let range_diffs = refine_changed_ranges(left, right, &range_diffs, &find_nonword_ranges);
range_diffs_to_slice_diffs(left, right, &range_diffs) range_diffs_to_slice_diffs(left, right, &range_diffs)
} }