From c071d412af9c516210473f2c5b38a83842fabf8e Mon Sep 17 00:00:00 2001 From: Martin von Zweigbergk Date: Fri, 26 Mar 2021 09:52:05 -0700 Subject: [PATCH] diff: use new diff algorithm for content diff The previous patch switched over the content-merge code to use the new histogram diff code. This patch switches over the content-diff code to use the histogram diff code. As before, the immediate goal is to speed it up. `jj diff -r c28ded83fc` in the git.git repo is a good example of a diff that's extremely slow to calculate with our current LCS-based diff. With this patch, that drops from 35 s to 0.12 s. The diff was slightly better before. I think that's mostly because of our different definition of a "word" in the data. We can improve that later. The speedup we get now is easily worth the slightly worse diff. --- Cargo.lock | 8 ----- Cargo.toml | 1 - lib/Cargo.toml | 1 - lib/src/files.rs | 94 +++++++++++++++++------------------------------- 4 files changed, 33 insertions(+), 71 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f201daaf..92ba813f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -345,12 +345,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "diff" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e25ea47919b1560c4e3b7fe0aaab9becf5b84a10325ddf7db0f0ba5e1026499" - [[package]] name = "digest" version = "0.8.1" @@ -556,7 +550,6 @@ dependencies = [ "clap", "config", "criterion", - "diff", "dirs", "git2", "hex", @@ -585,7 +578,6 @@ dependencies = [ "bytes", "chrono", "config", - "diff", "dirs", "git2", "hex", diff --git a/Cargo.toml b/Cargo.toml index 40a1110b6..88e1f7379 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,7 +26,6 @@ chrono = "0.4.19" clap = "2.33.3" config = "0.10.1" criterion = "0.3.3" -diff = "0.1.12" dirs = "3.0.1" git2 = "0.13.14" hex = "0.4.2" diff --git a/lib/Cargo.toml b/lib/Cargo.toml index a90163dc0..48d8450b0 100644 --- a/lib/Cargo.toml +++ b/lib/Cargo.toml @@ -22,7 +22,6 @@ bytes = "1.0.0" byteorder = "1.3.4" chrono = "0.4.19" config = "0.10.1" -diff = "0.1.12" dirs = "3.0.1" git2 = "0.13.14" hex = "0.4.2" diff --git a/lib/src/files.rs b/lib/src/files.rs index 79e15c1a1..35aebc694 100644 --- a/lib/src/files.rs +++ b/lib/src/files.rs @@ -15,38 +15,7 @@ use std::fmt::{Debug, Error, Formatter}; use std::ops::Range; -use diff::slice as diff_slice; - -fn is_word_byte(a: u8) -> bool { - a.is_ascii_alphanumeric() || a == b'_' -} - -fn is_same_word(a: u8, b: u8) -> bool { - // Don't allow utf-8 code points to be split into separate words - (is_word_byte(a) && is_word_byte(b)) || a & 0x80 != 0 -} - -fn tokenize(data: &[u8]) -> Vec<&[u8]> { - // TODO: Fix this code to not be so inefficient, and to allow the word - // delimiter to be configured. - let mut output = vec![]; - let mut word_start_pos = 0; - let mut maybe_prev: Option = None; - for (i, b) in data.iter().enumerate() { - let b = *b; - if let Some(prev) = maybe_prev { - if !is_same_word(prev, b) { - output.push(&data[word_start_pos..i]); - word_start_pos = i; - } - } - maybe_prev = Some(b); - } - if word_start_pos < data.len() { - output.push(&data[word_start_pos..]); - } - output -} +use crate::diff; #[derive(PartialEq, Eq, Clone, Debug)] pub enum DiffHunk<'a> { @@ -81,9 +50,6 @@ impl DiffLine<'_> { pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&DiffLine<'a>)) { // TODO: Should we attempt to interpret as utf-8 and otherwise break only at // newlines? - let left_tokens = tokenize(left); - let right_tokens = tokenize(right); - let result = diff_slice(&left_tokens, &right_tokens); let mut diff_line = DiffLine { left_line_number: 1, right_line_number: 1, @@ -91,36 +57,42 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8], callback: &mut impl FnMut(&Diff has_right_content: false, hunks: vec![], }; - for hunk in result { + for hunk in diff::diff(left, right) { match hunk { - diff::Result::Both(left, right) => { - assert!(left == right); - diff_line.has_left_content = true; - diff_line.has_right_content = true; - diff_line.hunks.push(DiffHunk::Unmodified(left)); - if left == &[b'\n'] { - callback(&diff_line); - diff_line.left_line_number += 1; - diff_line.right_line_number += 1; - diff_line.reset_line(); + diff::SliceDiff::Unchanged(text) => { + let lines = text.split_inclusive(|b| *b == b'\n'); + for line in lines { + diff_line.has_left_content = true; + diff_line.has_right_content = true; + diff_line.hunks.push(DiffHunk::Unmodified(line)); + if line.ends_with(b"\n") { + callback(&diff_line); + diff_line.left_line_number += 1; + diff_line.right_line_number += 1; + diff_line.reset_line(); + } } } - diff::Result::Left(left) => { - diff_line.has_left_content = true; - diff_line.hunks.push(DiffHunk::Removed(left)); - if left == &[b'\n'] { - callback(&diff_line); - diff_line.left_line_number += 1; - diff_line.reset_line(); + diff::SliceDiff::Replaced(left, right) => { + let left_lines = left.split_inclusive(|b| *b == b'\n'); + for left_line in left_lines { + diff_line.has_left_content = true; + diff_line.hunks.push(DiffHunk::Removed(left_line)); + if left_line.ends_with(b"\n") { + callback(&diff_line); + diff_line.left_line_number += 1; + diff_line.reset_line(); + } } - } - diff::Result::Right(right) => { - diff_line.has_right_content = true; - diff_line.hunks.push(DiffHunk::Added(right)); - if right == &[b'\n'] { - callback(&diff_line); - diff_line.right_line_number += 1; - diff_line.reset_line(); + let right_lines = right.split_inclusive(|b| *b == b'\n'); + for right_line in right_lines { + diff_line.has_right_content = true; + diff_line.hunks.push(DiffHunk::Added(right_line)); + if right_line.ends_with(b"\n") { + callback(&diff_line); + diff_line.right_line_number += 1; + diff_line.reset_line(); + } } } }