diff: also recurse into final region after after unchanged regions

See test case for details. Before: test bench_diff_10k_lines_reversed ... bench: 36,249,659 ns/iter (+/- 174,455) test bench_diff_10k_modified_lines ... bench: 37,258,890 ns/iter (+/- 803,963) test bench_diff_10k_unchanged_lines ... bench: 4,252 ns/iter (+/- 69) test bench_diff_1k_lines_reversed ... bench: 982,834 ns/iter (+/- 6,467) test bench_diff_1k_modified_lines ... bench: 3,343,469 ns/iter (+/- 23,243) test bench_diff_1k_unchanged_lines ... bench: 231 ns/iter (+/- 2) test bench_diff_git_git_read_tree_c ... bench: 95,559 ns/iter (+/- 816) After: test bench_diff_10k_lines_reversed ... bench: 36,186,715 ns/iter (+/- 196,903) test bench_diff_10k_modified_lines ... bench: 37,511,000 ns/iter (+/- 1,370,476) test bench_diff_10k_unchanged_lines ... bench: 3,099 ns/iter (+/- 8) test bench_diff_1k_lines_reversed ... bench: 986,010 ns/iter (+/- 11,565) test bench_diff_1k_modified_lines ... bench: 3,370,938 ns/iter (+/- 17,041) test bench_diff_1k_unchanged_lines ... bench: 230 ns/iter (+/- 2) test bench_diff_git_git_read_tree_c ... bench: 102,189 ns/iter (+/- 1,052) So this patch makes diffing even slower (but still easily fast enough for all cases I've run into in real life). There's probably a lot that can be done to make things faster, but the first priority is that the diffs are correct and easy to read.
2025-01-31 00:12:06 +00:00 · 2021-04-07 23:26:19 -07:00 · 2021-04-07 23:26:19 -07:00 · 102f7a0416
commit 102f7a0416
parent 0844a2ec8c
1 changed files with 47 additions and 8 deletions
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@ -132,11 +132,11 @@ impl Debug for SliceDiff<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
            SliceDiff::Unchanged(data) => f
-                .debug_tuple("Unchanged")
+                .debug_tuple("SliceDiff::Unchanged")
                .field(&String::from_utf8_lossy(data))
                .finish(),
            SliceDiff::Replaced(left, right) => f
-                .debug_tuple("Replaced")
+                .debug_tuple("SliceDiff::Replaced")
                .field(&String::from_utf8_lossy(left))
                .field(&String::from_utf8_lossy(right))
                .finish(),
@ -205,10 +205,6 @@ pub(crate) fn unchanged_ranges(
        return vec![];
    }

-    // TODO: Don't waste time calculating entire histogram. We don't need to keep
-    // data about common entries. If a word has more than N occurrences, we should
-    // just ignore it (and assume that everything changes if we have no less common
-    // words).
    let max_occurrences = 100;
    let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
    if *left_histogram.count_to_words.first_entry().unwrap().key() > max_occurrences {
@ -228,6 +224,9 @@ pub(crate) fn unchanged_ranges(
            }
        }
    }
+    if uncommon_shared_words.is_empty() {
+        return vec![];
+    }

    // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
    // the least common words to be "a" and "b". We now assume that each
@ -274,6 +273,7 @@ pub(crate) fn unchanged_ranges(
    for (_pos, word, occurrence) in &right_positions {
        left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
    }
+
    let lcs = find_lcs(&left_index_by_right_index);

    // Produce output ranges, recursing into the modified areas between the elements
@ -303,6 +303,19 @@ pub(crate) fn unchanged_ranges(
        previous_left_position = left_position + 1;
        previous_right_position = right_position + 1;
    }
+    // Also recurse into range at end (after common ranges).
+    let skipped_left_positions = previous_left_position..left_ranges.len();
+    let skipped_right_positions = previous_right_position..right_ranges.len();
+    if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
+        for unchanged_nested_range in unchanged_ranges(
+            left,
+            right,
+            &left_ranges[skipped_left_positions],
+            &right_ranges[skipped_right_positions],
+        ) {
+            result.push(unchanged_nested_range);
+        }
+    }

    result
 }
@ -795,7 +808,31 @@ mod tests {
    }

    #[test]
-    fn test_diff_gitgit_read_tree_c() {
+    fn test_diff_real_case_write_fmt() {
+        // This is from src/ui.rs in commit f44d246e3f88 in this repo. It highlights the
+        // need for recursion into the range at the end: after splitting at "Arguments"
+        // and "styler", the region at the end has the unique words "write_fmt"
+        // and "fmt", but we forgot to recurse into that region, so we ended up
+        // saying that "write_fmt(fmt).unwrap()" was replaced by b"write_fmt(fmt)".
+        assert_eq!(diff(
+                b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) {\n        self.styler().write_fmt(fmt).unwrap()\n",
+                b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_>) -> io::Result<()> {\n        self.styler().write_fmt(fmt)\n"
+            ),
+            vec![
+                SliceDiff::Unchanged(b"    pub fn write_fmt(&mut self, fmt: fmt::Arguments<\'_"),
+                SliceDiff::Unchanged(b">) "),
+                SliceDiff::Replaced(b"", b"-> io::Result<()> "),
+                SliceDiff::Unchanged(b"{\n        "),
+                SliceDiff::Unchanged(b"self.styler().write_fmt(fmt"),
+                SliceDiff::Unchanged(b")"),
+                SliceDiff::Replaced(b".unwrap()", b""),
+                SliceDiff::Unchanged(b"\n")
+            ]
+        );
+    }
+
+    #[test]
+    fn test_diff_real_case_gitgit_read_tree_c() {
        // This is the diff from commit e497ea2a9b in the git.git repo
        assert_eq!(
            diff(
@ -947,7 +984,9 @@ int main(int argc, char **argv)
               SliceDiff::Replaced(b"", b"\t\tint fd;\n\n"),
               SliceDiff::Unchanged(b"\t\tif (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20;\n"),
               SliceDiff::Unchanged(b"\t\t"),
-               SliceDiff::Replaced(b"printf(\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b"data = read_sha1_file("),
+               SliceDiff::Replaced(b"printf", b"data = read_sha1_file"),
+               SliceDiff::Unchanged(b"("),
+               SliceDiff::Replaced(b"\"%o %s (%s)\\n\", mode, path, sha1_to_hex(", b""),
               SliceDiff::Unchanged(b"sha1"),
               SliceDiff::Replaced(b"", b", type, &filesize"),
               SliceDiff::Unchanged(b")"),