diff: use new diff algorithm for content diff

The previous patch switched over the content-merge code to use the new
histogram diff code. This patch switches over the content-diff code to
use the histogram diff code. As before, the immediate goal is to speed
it up. `jj diff -r c28ded83fc` in the git.git repo is a good example
of a diff that's extremely slow to calculate with our current
LCS-based diff. With this patch, that drops from 35 s to 0.12 s.

The diff was slightly better before. I think that's mostly because of
our different definition of a "word" in the data. We can improve that
later. The speedup we get now is easily worth the slightly worse diff.
This commit is contained in:
Martin von Zweigbergk 2021-03-26 09:52:05 -07:00
parent 3c35dbace6
commit c071d412af
4 changed files with 33 additions and 71 deletions

8
Cargo.lock generated
View file

@ -345,12 +345,6 @@ dependencies = [
"memchr",
]
[[package]]
name = "diff"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
[[package]]
name = "digest"
version = "0.8.1"
@ -556,7 +550,6 @@ dependencies = [
"clap",
"config",
"criterion",
"diff",
"dirs",
"git2",
"hex",
@ -585,7 +578,6 @@ dependencies = [
"bytes",
"chrono",
"config",
"diff",
"dirs",
"git2",
"hex",

View file

@ -26,7 +26,6 @@ chrono = "0.4.19"
clap = "2.33.3"
config = "0.10.1"
criterion = "0.3.3"
diff = "0.1.12"
dirs = "3.0.1"
git2 = "0.13.14"
hex = "0.4.2"

View file

@ -22,7 +22,6 @@ bytes = "1.0.0"
byteorder = "1.3.4"
chrono = "0.4.19"
config = "0.10.1"
diff = "0.1.12"
dirs = "3.0.1"
git2 = "0.13.14"
hex = "0.4.2"

View file

@ -15,38 +15,7 @@
use std::fmt::{Debug, Error, Formatter};
use std::ops::Range;
use diff::slice as diff_slice;
fn is_word_byte(a: u8) -> bool {
a.is_ascii_alphanumeric() || a == b'_'
}
fn is_same_word(a: u8, b: u8) -> bool {
// Don't allow utf-8 code points to be split into separate words
(is_word_byte(a) && is_word_byte(b)) || a & 0x80 != 0
}
fn tokenize(data: &[u8]) -> Vec<&[u8]> {
// TODO: Fix this code to not be so inefficient, and to allow the word
// delimiter to be configured.
let mut output = vec![];
let mut word_start_pos = 0;
let mut maybe_prev: Option<u8> = None;
for (i, b) in data.iter().enumerate() {
let b = *b;
if let Some(prev) = maybe_prev {
if !is_same_word(prev, b) {
output.push(&data[word_start_pos..i]);
word_start_pos = i;
}
}
maybe_prev = Some(b);
}
if word_start_pos < data.len() {
output.push(&data[word_start_pos..]);
}
output
}
use crate::diff;
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum DiffHunk<'a> {
@ -81,9 +50,6 @@ impl DiffLine<'_> {
pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&DiffLine<'a>)) {
// TODO: Should we attempt to interpret as utf-8 and otherwise break only at
// newlines?
let left_tokens = tokenize(left);
let right_tokens = tokenize(right);
let result = diff_slice(&left_tokens, &right_tokens);
let mut diff_line = DiffLine {
left_line_number: 1,
right_line_number: 1,
@ -91,36 +57,42 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&Diff
has_right_content: false,
hunks: vec![],
};
for hunk in result {
for hunk in diff::diff(left, right) {
match hunk {
diff::Result::Both(left, right) => {
assert!(left == right);
diff_line.has_left_content = true;
diff_line.has_right_content = true;
diff_line.hunks.push(DiffHunk::Unmodified(left));
if left == &[b'\n'] {
callback(&diff_line);
diff_line.left_line_number += 1;
diff_line.right_line_number += 1;
diff_line.reset_line();
diff::SliceDiff::Unchanged(text) => {
let lines = text.split_inclusive(|b| *b == b'\n');
for line in lines {
diff_line.has_left_content = true;
diff_line.has_right_content = true;
diff_line.hunks.push(DiffHunk::Unmodified(line));
if line.ends_with(b"\n") {
callback(&diff_line);
diff_line.left_line_number += 1;
diff_line.right_line_number += 1;
diff_line.reset_line();
}
}
}
diff::Result::Left(left) => {
diff_line.has_left_content = true;
diff_line.hunks.push(DiffHunk::Removed(left));
if left == &[b'\n'] {
callback(&diff_line);
diff_line.left_line_number += 1;
diff_line.reset_line();
diff::SliceDiff::Replaced(left, right) => {
let left_lines = left.split_inclusive(|b| *b == b'\n');
for left_line in left_lines {
diff_line.has_left_content = true;
diff_line.hunks.push(DiffHunk::Removed(left_line));
if left_line.ends_with(b"\n") {
callback(&diff_line);
diff_line.left_line_number += 1;
diff_line.reset_line();
}
}
}
diff::Result::Right(right) => {
diff_line.has_right_content = true;
diff_line.hunks.push(DiffHunk::Added(right));
if right == &[b'\n'] {
callback(&diff_line);
diff_line.right_line_number += 1;
diff_line.reset_line();
let right_lines = right.split_inclusive(|b| *b == b'\n');
for right_line in right_lines {
diff_line.has_right_content = true;
diff_line.hunks.push(DiffHunk::Added(right_line));
if right_line.ends_with(b"\n") {
callback(&diff_line);
diff_line.right_line_number += 1;
diff_line.reset_line();
}
}
}
}