mirror of
https://github.com/martinvonz/jj.git
synced 2024-12-24 12:48:55 +00:00
diff: use new diff algorithm for content diff
The previous patch switched over the content-merge code to use the new histogram diff code. This patch switches over the content-diff code to use the histogram diff code. As before, the immediate goal is to speed it up. `jj diff -r c28ded83fc` in the git.git repo is a good example of a diff that's extremely slow to calculate with our current LCS-based diff. With this patch, that drops from 35 s to 0.12 s. The diff was slightly better before. I think that's mostly because of our different definition of a "word" in the data. We can improve that later. The speedup we get now is easily worth the slightly worse diff.
This commit is contained in:
parent
3c35dbace6
commit
c071d412af
4 changed files with 33 additions and 71 deletions
8
Cargo.lock
generated
8
Cargo.lock
generated
|
@ -345,12 +345,6 @@ dependencies = [
|
|||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "diff"
|
||||
version = "0.1.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499"
|
||||
|
||||
[[package]]
|
||||
name = "digest"
|
||||
version = "0.8.1"
|
||||
|
@ -556,7 +550,6 @@ dependencies = [
|
|||
"clap",
|
||||
"config",
|
||||
"criterion",
|
||||
"diff",
|
||||
"dirs",
|
||||
"git2",
|
||||
"hex",
|
||||
|
@ -585,7 +578,6 @@ dependencies = [
|
|||
"bytes",
|
||||
"chrono",
|
||||
"config",
|
||||
"diff",
|
||||
"dirs",
|
||||
"git2",
|
||||
"hex",
|
||||
|
|
|
@ -26,7 +26,6 @@ chrono = "0.4.19"
|
|||
clap = "2.33.3"
|
||||
config = "0.10.1"
|
||||
criterion = "0.3.3"
|
||||
diff = "0.1.12"
|
||||
dirs = "3.0.1"
|
||||
git2 = "0.13.14"
|
||||
hex = "0.4.2"
|
||||
|
|
|
@ -22,7 +22,6 @@ bytes = "1.0.0"
|
|||
byteorder = "1.3.4"
|
||||
chrono = "0.4.19"
|
||||
config = "0.10.1"
|
||||
diff = "0.1.12"
|
||||
dirs = "3.0.1"
|
||||
git2 = "0.13.14"
|
||||
hex = "0.4.2"
|
||||
|
|
|
@ -15,38 +15,7 @@
|
|||
use std::fmt::{Debug, Error, Formatter};
|
||||
use std::ops::Range;
|
||||
|
||||
use diff::slice as diff_slice;
|
||||
|
||||
fn is_word_byte(a: u8) -> bool {
|
||||
a.is_ascii_alphanumeric() || a == b'_'
|
||||
}
|
||||
|
||||
fn is_same_word(a: u8, b: u8) -> bool {
|
||||
// Don't allow utf-8 code points to be split into separate words
|
||||
(is_word_byte(a) && is_word_byte(b)) || a & 0x80 != 0
|
||||
}
|
||||
|
||||
fn tokenize(data: &[u8]) -> Vec<&[u8]> {
|
||||
// TODO: Fix this code to not be so inefficient, and to allow the word
|
||||
// delimiter to be configured.
|
||||
let mut output = vec![];
|
||||
let mut word_start_pos = 0;
|
||||
let mut maybe_prev: Option<u8> = None;
|
||||
for (i, b) in data.iter().enumerate() {
|
||||
let b = *b;
|
||||
if let Some(prev) = maybe_prev {
|
||||
if !is_same_word(prev, b) {
|
||||
output.push(&data[word_start_pos..i]);
|
||||
word_start_pos = i;
|
||||
}
|
||||
}
|
||||
maybe_prev = Some(b);
|
||||
}
|
||||
if word_start_pos < data.len() {
|
||||
output.push(&data[word_start_pos..]);
|
||||
}
|
||||
output
|
||||
}
|
||||
use crate::diff;
|
||||
|
||||
#[derive(PartialEq, Eq, Clone, Debug)]
|
||||
pub enum DiffHunk<'a> {
|
||||
|
@ -81,9 +50,6 @@ impl DiffLine<'_> {
|
|||
pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&DiffLine<'a>)) {
|
||||
// TODO: Should we attempt to interpret as utf-8 and otherwise break only at
|
||||
// newlines?
|
||||
let left_tokens = tokenize(left);
|
||||
let right_tokens = tokenize(right);
|
||||
let result = diff_slice(&left_tokens, &right_tokens);
|
||||
let mut diff_line = DiffLine {
|
||||
left_line_number: 1,
|
||||
right_line_number: 1,
|
||||
|
@ -91,33 +57,38 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&Diff
|
|||
has_right_content: false,
|
||||
hunks: vec![],
|
||||
};
|
||||
for hunk in result {
|
||||
for hunk in diff::diff(left, right) {
|
||||
match hunk {
|
||||
diff::Result::Both(left, right) => {
|
||||
assert!(left == right);
|
||||
diff::SliceDiff::Unchanged(text) => {
|
||||
let lines = text.split_inclusive(|b| *b == b'\n');
|
||||
for line in lines {
|
||||
diff_line.has_left_content = true;
|
||||
diff_line.has_right_content = true;
|
||||
diff_line.hunks.push(DiffHunk::Unmodified(left));
|
||||
if left == &[b'\n'] {
|
||||
diff_line.hunks.push(DiffHunk::Unmodified(line));
|
||||
if line.ends_with(b"\n") {
|
||||
callback(&diff_line);
|
||||
diff_line.left_line_number += 1;
|
||||
diff_line.right_line_number += 1;
|
||||
diff_line.reset_line();
|
||||
}
|
||||
}
|
||||
diff::Result::Left(left) => {
|
||||
}
|
||||
diff::SliceDiff::Replaced(left, right) => {
|
||||
let left_lines = left.split_inclusive(|b| *b == b'\n');
|
||||
for left_line in left_lines {
|
||||
diff_line.has_left_content = true;
|
||||
diff_line.hunks.push(DiffHunk::Removed(left));
|
||||
if left == &[b'\n'] {
|
||||
diff_line.hunks.push(DiffHunk::Removed(left_line));
|
||||
if left_line.ends_with(b"\n") {
|
||||
callback(&diff_line);
|
||||
diff_line.left_line_number += 1;
|
||||
diff_line.reset_line();
|
||||
}
|
||||
}
|
||||
diff::Result::Right(right) => {
|
||||
let right_lines = right.split_inclusive(|b| *b == b'\n');
|
||||
for right_line in right_lines {
|
||||
diff_line.has_right_content = true;
|
||||
diff_line.hunks.push(DiffHunk::Added(right));
|
||||
if right == &[b'\n'] {
|
||||
diff_line.hunks.push(DiffHunk::Added(right_line));
|
||||
if right_line.ends_with(b"\n") {
|
||||
callback(&diff_line);
|
||||
diff_line.right_line_number += 1;
|
||||
diff_line.reset_line();
|
||||
|
@ -125,6 +96,7 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&Diff
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if !diff_line.hunks.is_empty() {
|
||||
callback(&diff_line);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue