mirror of
https://github.com/martinvonz/jj.git
synced 2025-01-31 00:12:06 +00:00
diff: also recurse into final region after after unchanged regions
See test case for details. Before: test bench_diff_10k_lines_reversed ... bench: 36,249,659 ns/iter (+/- 174,455) test bench_diff_10k_modified_lines ... bench: 37,258,890 ns/iter (+/- 803,963) test bench_diff_10k_unchanged_lines ... bench: 4,252 ns/iter (+/- 69) test bench_diff_1k_lines_reversed ... bench: 982,834 ns/iter (+/- 6,467) test bench_diff_1k_modified_lines ... bench: 3,343,469 ns/iter (+/- 23,243) test bench_diff_1k_unchanged_lines ... bench: 231 ns/iter (+/- 2) test bench_diff_git_git_read_tree_c ... bench: 95,559 ns/iter (+/- 816) After: test bench_diff_10k_lines_reversed ... bench: 36,186,715 ns/iter (+/- 196,903) test bench_diff_10k_modified_lines ... bench: 37,511,000 ns/iter (+/- 1,370,476) test bench_diff_10k_unchanged_lines ... bench: 3,099 ns/iter (+/- 8) test bench_diff_1k_lines_reversed ... bench: 986,010 ns/iter (+/- 11,565) test bench_diff_1k_modified_lines ... bench: 3,370,938 ns/iter (+/- 17,041) test bench_diff_1k_unchanged_lines ... bench: 230 ns/iter (+/- 2) test bench_diff_git_git_read_tree_c ... bench: 102,189 ns/iter (+/- 1,052) So this patch makes diffing even slower (but still easily fast enough for all cases I've run into in real life). There's probably a lot that can be done to make things faster, but the first priority is that the diffs are correct and easy to read.
This commit is contained in:
parent
0844a2ec8c
commit
102f7a0416
1 changed files with 47 additions and 8 deletions
|
@ -132,11 +132,11 @@ impl Debug for SliceDiff<'_> {
|
|||
fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
|
||||
match self {
|
||||
SliceDiff::Unchanged(data) => f
|
||||
.debug_tuple("Unchanged")
|
||||
.debug_tuple("SliceDiff::Unchanged")
|
||||
.field(&String::from_utf8_lossy(data))
|
||||
.finish(),
|
||||
SliceDiff::Replaced(left, right) => f
|
||||
.debug_tuple("Replaced")
|
||||
.debug_tuple("SliceDiff::Replaced")
|
||||
.field(&String::from_utf8_lossy(left))
|
||||
.field(&String::from_utf8_lossy(right))
|
||||
.finish(),
|
||||
|
@ -205,10 +205,6 @@ pub(crate) fn unchanged_ranges(
|
|||
return vec![];
|
||||
}
|
||||
|
||||
// TODO: Don't waste time calculating entire histogram. We don't need to keep
|
||||
// data about common entries. If a word has more than N occurrences, we should
|
||||
// just ignore it (and assume that everything changes if we have no less common
|
||||
// words).
|
||||
let max_occurrences = 100;
|
||||
let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
|
||||
if *left_histogram.count_to_words.first_entry().unwrap().key() > max_occurrences {
|
||||
|
@ -228,6 +224,9 @@ pub(crate) fn unchanged_ranges(
|
|||
}
|
||||
}
|
||||
}
|
||||
if uncommon_shared_words.is_empty() {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
// Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
|
||||
// the least common words to be "a" and "b". We now assume that each
|
||||
|
@ -274,6 +273,7 @@ pub(crate) fn unchanged_ranges(
|
|||
for (_pos, word, occurrence) in &right_positions {
|
||||
left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
|
||||
}
|
||||
|
||||
let lcs = find_lcs(&left_index_by_right_index);
|
||||
|
||||
// Produce output ranges, recursing into the modified areas between the elements
|
||||
|
@ -303,6 +303,19 @@ pub(crate) fn unchanged_ranges(
|
|||
previous_left_position = left_position + 1;
|
||||
previous_right_position = right_position + 1;
|
||||
}
|
||||
// Also recurse into range at end (after common ranges).
|
||||
let skipped_left_positions = previous_left_position..left_ranges.len();
|
||||
let skipped_right_positions = previous_right_position..right_ranges.len();
|
||||
if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
|
||||
for unchanged_nested_range in unchanged_ranges(
|
||||
left,
|
||||
right,
|
||||
&left_ranges[skipped_left_positions],
|
||||
&right_ranges[skipped_right_positions],
|
||||
) {
|
||||
result.push(unchanged_nested_range);
|
||||
}
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
@ -795,7 +808,31 @@ mod tests {
|
|||
}
|
||||
|
||||
#[test]
|
||||
fn test_diff_gitgit_read_tree_c() {
|
||||
fn test_diff_real_case_write_fmt() {
|
||||
// This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
|
||||
// need for recursion into the range at the end: after splitting at "Arguments"
|
||||
// and "styler", the region at the end has the unique words "write_fmt"
|
||||
// and "fmt", but we forgot to recurse into that region, so we ended up
|
||||
// saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)".
|
||||
assert_eq!(diff(
|
||||
b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n self.styler().write_fmt(fmt).unwrap()\n",
|
||||
b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n self.styler().write_fmt(fmt)\n"
|
||||
),
|
||||
vec![
|
||||
SliceDiff::Unchanged(b" pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_"),
|
||||
SliceDiff::Unchanged(b">) "),
|
||||
SliceDiff::Replaced(b"", b"-> io::Result<()> "),
|
||||
SliceDiff::Unchanged(b"{\n "),
|
||||
SliceDiff::Unchanged(b"self.styler().write_fmt(fmt"),
|
||||
SliceDiff::Unchanged(b")"),
|
||||
SliceDiff::Replaced(b".unwrap()", b""),
|
||||
SliceDiff::Unchanged(b"\n")
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_diff_real_case_gitgit_read_tree_c() {
|
||||
// This is the diff from commit e497ea2a9b in the git.git repo
|
||||
assert_eq!(
|
||||
diff(
|
||||
|
@ -947,7 +984,9 @@ int main(int argc, char **argv)
|
|||
SliceDiff::Replaced(b"", b"\t\tint fd;\n\n"),
|
||||
SliceDiff::Unchanged(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n"),
|
||||
SliceDiff::Unchanged(b"\t\t"),
|
||||
SliceDiff::Replaced(b"printf(\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b"data = read_sha1_file("),
|
||||
SliceDiff::Replaced(b"printf", b"data = read_sha1_file"),
|
||||
SliceDiff::Unchanged(b"("),
|
||||
SliceDiff::Replaced(b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""),
|
||||
SliceDiff::Unchanged(b"sha1"),
|
||||
SliceDiff::Replaced(b"", b", type, &filesize"),
|
||||
SliceDiff::Unchanged(b")"),
|
||||
|
|
Loading…
Reference in a new issue