diff: use new diff algorithm for content diff

The previous patch switched over the content-merge code to use the new histogram diff code. This patch switches over the content-diff code to use the histogram diff code. As before, the immediate goal is to speed it up. `jj diff -r c28ded83fc` in the git.git repo is a good example of a diff that's extremely slow to calculate with our current LCS-based diff. With this patch, that drops from 35 s to 0.12 s. The diff was slightly better before. I think that's mostly because of our different definition of a "word" in the data. We can improve that later. The speedup we get now is easily worth the slightly worse diff.
2024-12-24 12:48:55 +00:00 · 2021-03-26 09:52:05 -07:00 · 2021-03-26 09:52:05 -07:00 · c071d412af
commit c071d412af
parent 3c35dbace6
4 changed files with 33 additions and 71 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -345,12 +345,6 @@ dependencies = [
 "memchr",
 ]

-[[package]]
-name = "diff"
-version = "0.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
-
 [[package]]
 name = "digest"
 version = "0.8.1"
@ -556,7 +550,6 @@ dependencies = [
 "clap",
 "config",
 "criterion",
- "diff",
 "dirs",
 "git2",
 "hex",
@ -585,7 +578,6 @@ dependencies = [
 "bytes",
 "chrono",
 "config",
- "diff",
 "dirs",
 "git2",
 "hex",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -26,7 +26,6 @@ chrono = "0.4.19"
 clap = "2.33.3"
 config = "0.10.1"
 criterion = "0.3.3"
-diff = "0.1.12"
 dirs = "3.0.1"
 git2 = "0.13.14"
 hex = "0.4.2"
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@ -22,7 +22,6 @@ bytes = "1.0.0"
 byteorder = "1.3.4"
 chrono = "0.4.19"
 config = "0.10.1"
-diff = "0.1.12"
 dirs = "3.0.1"
 git2 = "0.13.14"
 hex = "0.4.2"
--- a/lib/src/files.rs
+++ b/lib/src/files.rs
@ -15,38 +15,7 @@
 use std::fmt::{Debug, Error, Formatter};
 use std::ops::Range;

-use diff::slice as diff_slice;
-
-fn is_word_byte(a: u8) -> bool {
-    a.is_ascii_alphanumeric() || a == b'_'
-}
-
-fn is_same_word(a: u8, b: u8) -> bool {
-    // Don't allow utf-8 code points to be split into separate words
-    (is_word_byte(a) && is_word_byte(b)) || a & 0x80 != 0
-}
-
-fn tokenize(data: &[u8]) -> Vec<&[u8]> {
-    // TODO: Fix this code to not be so inefficient, and to allow the word
-    // delimiter to be configured.
-    let mut output = vec![];
-    let mut word_start_pos = 0;
-    let mut maybe_prev: Option<u8> = None;
-    for (i, b) in data.iter().enumerate() {
-        let b = *b;
-        if let Some(prev) = maybe_prev {
-            if !is_same_word(prev, b) {
-                output.push(&data[word_start_pos..i]);
-                word_start_pos = i;
-            }
-        }
-        maybe_prev = Some(b);
-    }
-    if word_start_pos < data.len() {
-        output.push(&data[word_start_pos..]);
-    }
-    output
-}
+use crate::diff;

 #[derive(PartialEq, Eq, Clone, Debug)]
 pub enum DiffHunk<'a> {
@ -81,9 +50,6 @@ impl DiffLine<'_> {
 pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&DiffLine<'a>)) {
    // TODO: Should we attempt to interpret as utf-8 and otherwise break only at
    // newlines?
-    let left_tokens = tokenize(left);
-    let right_tokens = tokenize(right);
-    let result = diff_slice(&left_tokens, &right_tokens);
    let mut diff_line = DiffLine {
        left_line_number: 1,
        right_line_number: 1,
@ -91,33 +57,38 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&Diff
        has_right_content: false,
        hunks: vec![],
    };
-    for hunk in result {
+    for hunk in diff::diff(left, right) {
        match hunk {
-            diff::Result::Both(left, right) => {
-                assert!(left == right);
+            diff::SliceDiff::Unchanged(text) => {
+                let lines = text.split_inclusive(|b| *b == b'\n');
+                for line in lines {
                    diff_line.has_left_content = true;
                    diff_line.has_right_content = true;
-                diff_line.hunks.push(DiffHunk::Unmodified(left));
-                if left == &[b'\n'] {
+                    diff_line.hunks.push(DiffHunk::Unmodified(line));
+                    if line.ends_with(b"\n") {
                        callback(&diff_line);
                        diff_line.left_line_number += 1;
                        diff_line.right_line_number += 1;
                        diff_line.reset_line();
                    }
                }
-            diff::Result::Left(left) => {
+            }
+            diff::SliceDiff::Replaced(left, right) => {
+                let left_lines = left.split_inclusive(|b| *b == b'\n');
+                for left_line in left_lines {
                    diff_line.has_left_content = true;
-                diff_line.hunks.push(DiffHunk::Removed(left));
-                if left == &[b'\n'] {
+                    diff_line.hunks.push(DiffHunk::Removed(left_line));
+                    if left_line.ends_with(b"\n") {
                        callback(&diff_line);
                        diff_line.left_line_number += 1;
                        diff_line.reset_line();
                    }
                }
-            diff::Result::Right(right) => {
+                let right_lines = right.split_inclusive(|b| *b == b'\n');
+                for right_line in right_lines {
                    diff_line.has_right_content = true;
-                diff_line.hunks.push(DiffHunk::Added(right));
-                if right == &[b'\n'] {
+                    diff_line.hunks.push(DiffHunk::Added(right_line));
+                    if right_line.ends_with(b"\n") {
                        callback(&diff_line);
                        diff_line.right_line_number += 1;
                        diff_line.reset_line();
@ -125,6 +96,7 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&Diff
                }
            }
        }
+    }
    if !diff_line.hunks.is_empty() {
        callback(&diff_line);
    }