From 3c35dbace653a35a50a32111a297b315770006ef Mon Sep 17 00:00:00 2001 From: Martin von Zweigbergk Date: Fri, 26 Mar 2021 08:44:18 -0700 Subject: [PATCH] merge: use new diff algorithm for finding sync regions With the histogram diff code from the previous patch, we can now start using that for finding the "sync regions" in 3-way merge. That helps a lot with the slow merging we had before this patch. `jj diff -r 9d540e9726` in the git.git repo drops from 22 s to 0.15 s with this patch. (That commit is a rather arbitrary merge commit from aroun 5 years ago.) With the new diff algorithm, the output of `jj diff -r 9d540e9726` in git.git looks better if we find unchanged sync regions based on lines than on words, so that's what I'm using in this patch. That's a change compared the the LCS-based diff we used before this patch. I suspect the reason that finding sync regions based on words works worse now is not because of the change from LCS to histogram but because of the change in how we define a word. My goal right now is mostly to make it faster; I'll get back to refining the diff result later. --- lib/src/diff.rs | 7 +++--- lib/src/files.rs | 65 +++++++++--------------------------------------- 2 files changed, 15 insertions(+), 57 deletions(-) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index f20f15202..7cd2242a5 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -3,8 +3,7 @@ use std::collections::{BTreeMap, HashMap}; use std::fmt::{Debug, Formatter}; use std::ops::Range; -#[allow(dead_code)] -fn find_line_ranges(text: &[u8]) -> Vec> { +pub fn find_line_ranges(text: &[u8]) -> Vec> { let mut ranges = vec![]; let mut start = 0; loop { @@ -24,7 +23,7 @@ fn find_line_ranges(text: &[u8]) -> Vec> { ranges } -fn find_word_ranges(text: &[u8]) -> Vec> { +pub fn find_word_ranges(text: &[u8]) -> Vec> { let mut word_ranges = vec![]; let mut word_start_pos = 0; let mut in_word = false; @@ -169,7 +168,7 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> { /// Finds unchanged ranges among the ones given as arguments. The data between /// those ranges is ignored. -fn unchanged_ranges( +pub(crate) fn unchanged_ranges( left: &[u8], right: &[u8], left_ranges: &[Range], diff --git a/lib/src/files.rs b/lib/src/files.rs index 599e81d80..79e15c1a1 100644 --- a/lib/src/files.rs +++ b/lib/src/files.rs @@ -171,63 +171,32 @@ struct SyncRegion { right: Range, } -fn diff_result_lengths(diff: &diff::Result<&&[u8]>) -> (usize, usize) { - match diff { - diff::Result::Left(&left) => (left.len(), 0), - diff::Result::Both(&left, &right) => (left.len(), right.len()), - diff::Result::Right(&right) => (0, right.len()), - } -} - -fn unmodified_regions( - left_tokens: &[&[u8]], - right_tokens: &[&[u8]], -) -> Vec<(Range, Range)> { - let diffs = diff_slice(left_tokens, right_tokens); - let mut left_pos = 0; - let mut right_pos = 0; - let mut regions = Vec::new(); - for diff in diffs { - let (left_len, right_len) = diff_result_lengths(&diff); - match diff { - diff::Result::Both(&left, &right) if left == right => regions.push(( - left_pos..left_pos + left_len, - right_pos..right_pos + right_len, - )), - _ => {} - } - left_pos += left_len; - right_pos += right_len; - } - regions -} - fn find_sync_regions(base: &[u8], left: &[u8], right: &[u8]) -> Vec { - let base_tokens = tokenize(base); - let left_tokens = tokenize(left); - let right_tokens = tokenize(right); + let base_tokens = crate::diff::find_line_ranges(base); + let left_tokens = crate::diff::find_line_ranges(left); + let right_tokens = crate::diff::find_line_ranges(right); - let left_regions = unmodified_regions(&base_tokens, &left_tokens); - let right_regions = unmodified_regions(&base_tokens, &right_tokens); + let left_regions = crate::diff::unchanged_ranges(base, left, &base_tokens, &left_tokens); + let right_regions = crate::diff::unchanged_ranges(base, right, &base_tokens, &right_tokens); let mut left_it = left_regions.iter().peekable(); let mut right_it = right_regions.iter().peekable(); let mut regions: Vec = vec![]; - while let (Some((left_base_region, left_region)), Some((right_base_region, right_region))) = + while let (Some((left_base_range, left_range)), Some((right_base_range, right_range))) = (left_it.peek(), right_it.peek()) { - // TODO: if left_base_region and right_base_region at least intersect, use the + // TODO: if left_base_range and right_base_range at least intersect, use the // intersection of the two regions. - if left_base_region == right_base_region { + if left_base_range == right_base_range { regions.push(SyncRegion { - base: left_base_region.clone(), - left: left_region.clone(), - right: right_region.clone(), + base: left_base_range.clone(), + left: left_range.clone(), + right: right_range.clone(), }); left_it.next().unwrap(); right_it.next().unwrap(); - } else if left_base_region.start < right_base_region.start { + } else if left_base_range.start < right_base_range.start { left_it.next().unwrap(); } else { right_it.next().unwrap(); @@ -311,21 +280,11 @@ mod tests { left: 0..1, right: 0..1 }, - SyncRegion { - base: 1..2, - left: 1..2, - right: 1..2 - }, SyncRegion { base: 2..3, left: 4..5, right: 2..3 }, - SyncRegion { - base: 3..4, - left: 5..6, - right: 3..4 - }, SyncRegion { base: 4..5, left: 6..7,