From c071d412af9c516210473f2c5b38a83842fabf8e Mon Sep 17 00:00:00 2001
From: Martin von Zweigbergk <martinvonz@google.com>
Date: Fri, 26 Mar 2021 09:52:05 -0700
Subject: [PATCH] diff: use new diff algorithm for content diff

The previous patch switched over the content-merge code to use the new
histogram diff code. This patch switches over the content-diff code to
use the histogram diff code. As before, the immediate goal is to speed
it up. `jj diff -r c28ded83fc` in the git.git repo is a good example
of a diff that's extremely slow to calculate with our current
LCS-based diff. With this patch, that drops from 35 s to 0.12 s.

The diff was slightly better before. I think that's mostly because of
our different definition of a "word" in the data. We can improve that
later. The speedup we get now is easily worth the slightly worse diff.
---
 Cargo.lock       |  8 -----
 Cargo.toml       |  1 -
 lib/Cargo.toml   |  1 -
 lib/src/files.rs | 94 +++++++++++++++++-------------------------------
 4 files changed, 33 insertions(+), 71 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 9f201daaf..92ba813f5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -345,12 +345,6 @@ dependencies = [
  "memchr",
 ]
 
-[[package]]
-name = "diff"
-version = "0.1.12"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
-
 [[package]]
 name = "digest"
 version = "0.8.1"
@@ -556,7 +550,6 @@ dependencies = [
  "clap",
  "config",
  "criterion",
- "diff",
  "dirs",
  "git2",
  "hex",
@@ -585,7 +578,6 @@ dependencies = [
  "bytes",
  "chrono",
  "config",
- "diff",
  "dirs",
  "git2",
  "hex",
diff --git a/Cargo.toml b/Cargo.toml
index 40a1110b6..88e1f7379 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -26,7 +26,6 @@ chrono = "0.4.19"
 clap = "2.33.3"
 config = "0.10.1"
 criterion = "0.3.3"
-diff = "0.1.12"
 dirs = "3.0.1"
 git2 = "0.13.14"
 hex = "0.4.2"
diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index a90163dc0..48d8450b0 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -22,7 +22,6 @@ bytes = "1.0.0"
 byteorder = "1.3.4"
 chrono = "0.4.19"
 config = "0.10.1"
-diff = "0.1.12"
 dirs = "3.0.1"
 git2 = "0.13.14"
 hex = "0.4.2"
diff --git a/lib/src/files.rs b/lib/src/files.rs
index 79e15c1a1..35aebc694 100644
--- a/lib/src/files.rs
+++ b/lib/src/files.rs
@@ -15,38 +15,7 @@
 use std::fmt::{Debug, Error, Formatter};
 use std::ops::Range;
 
-use diff::slice as diff_slice;
-
-fn is_word_byte(a: u8) -> bool {
-    a.is_ascii_alphanumeric() || a == b'_'
-}
-
-fn is_same_word(a: u8, b: u8) -> bool {
-    // Don't allow utf-8 code points to be split into separate words
-    (is_word_byte(a) && is_word_byte(b)) || a & 0x80 != 0
-}
-
-fn tokenize(data: &[u8]) -> Vec<&[u8]> {
-    // TODO: Fix this code to not be so inefficient, and to allow the word
-    // delimiter to be configured.
-    let mut output = vec![];
-    let mut word_start_pos = 0;
-    let mut maybe_prev: Option<u8> = None;
-    for (i, b) in data.iter().enumerate() {
-        let b = *b;
-        if let Some(prev) = maybe_prev {
-            if !is_same_word(prev, b) {
-                output.push(&data[word_start_pos..i]);
-                word_start_pos = i;
-            }
-        }
-        maybe_prev = Some(b);
-    }
-    if word_start_pos < data.len() {
-        output.push(&data[word_start_pos..]);
-    }
-    output
-}
+use crate::diff;
 
 #[derive(PartialEq, Eq, Clone, Debug)]
 pub enum DiffHunk<'a> {
@@ -81,9 +50,6 @@ impl DiffLine<'_> {
 pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&DiffLine<'a>)) {
     // TODO: Should we attempt to interpret as utf-8 and otherwise break only at
     // newlines?
-    let left_tokens = tokenize(left);
-    let right_tokens = tokenize(right);
-    let result = diff_slice(&left_tokens, &right_tokens);
     let mut diff_line = DiffLine {
         left_line_number: 1,
         right_line_number: 1,
@@ -91,36 +57,42 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&Diff
         has_right_content: false,
         hunks: vec![],
     };
-    for hunk in result {
+    for hunk in diff::diff(left, right) {
         match hunk {
-            diff::Result::Both(left, right) => {
-                assert!(left == right);
-                diff_line.has_left_content = true;
-                diff_line.has_right_content = true;
-                diff_line.hunks.push(DiffHunk::Unmodified(left));
-                if left == &[b'\n'] {
-                    callback(&diff_line);
-                    diff_line.left_line_number += 1;
-                    diff_line.right_line_number += 1;
-                    diff_line.reset_line();
+            diff::SliceDiff::Unchanged(text) => {
+                let lines = text.split_inclusive(|b| *b == b'\n');
+                for line in lines {
+                    diff_line.has_left_content = true;
+                    diff_line.has_right_content = true;
+                    diff_line.hunks.push(DiffHunk::Unmodified(line));
+                    if line.ends_with(b"\n") {
+                        callback(&diff_line);
+                        diff_line.left_line_number += 1;
+                        diff_line.right_line_number += 1;
+                        diff_line.reset_line();
+                    }
                 }
             }
-            diff::Result::Left(left) => {
-                diff_line.has_left_content = true;
-                diff_line.hunks.push(DiffHunk::Removed(left));
-                if left == &[b'\n'] {
-                    callback(&diff_line);
-                    diff_line.left_line_number += 1;
-                    diff_line.reset_line();
+            diff::SliceDiff::Replaced(left, right) => {
+                let left_lines = left.split_inclusive(|b| *b == b'\n');
+                for left_line in left_lines {
+                    diff_line.has_left_content = true;
+                    diff_line.hunks.push(DiffHunk::Removed(left_line));
+                    if left_line.ends_with(b"\n") {
+                        callback(&diff_line);
+                        diff_line.left_line_number += 1;
+                        diff_line.reset_line();
+                    }
                 }
-            }
-            diff::Result::Right(right) => {
-                diff_line.has_right_content = true;
-                diff_line.hunks.push(DiffHunk::Added(right));
-                if right == &[b'\n'] {
-                    callback(&diff_line);
-                    diff_line.right_line_number += 1;
-                    diff_line.reset_line();
+                let right_lines = right.split_inclusive(|b| *b == b'\n');
+                for right_line in right_lines {
+                    diff_line.has_right_content = true;
+                    diff_line.hunks.push(DiffHunk::Added(right_line));
+                    if right_line.ends_with(b"\n") {
+                        callback(&diff_line);
+                        diff_line.right_line_number += 1;
+                        diff_line.reset_line();
+                    }
                 }
             }
         }