merge: use new diff algorithm for finding sync regions

With the histogram diff code from the previous patch, we can now start
using that for finding the "sync regions" in 3-way merge. That helps a
lot with the slow merging we had before this patch. `jj diff -r
9d540e9726` in the git.git repo drops from 22 s to 0.15 s with this
patch. (That commit is a rather arbitrary merge commit from aroun 5
years ago.)

With the new diff algorithm, the output of `jj diff -r 9d540e9726` in
git.git looks better if we find unchanged sync regions based on lines
than on words, so that's what I'm using in this patch. That's a change
compared the the LCS-based diff we used before this patch. I suspect
the reason that finding sync regions based on words works worse now is
not because of the change from LCS to histogram but because of the
change in how we define a word. My goal right now is mostly to make it
faster; I'll get back to refining the diff result later.
This commit is contained in:
Martin von Zweigbergk 2021-03-26 08:44:18 -07:00
parent 1e657c5331
commit 3c35dbace6
2 changed files with 15 additions and 57 deletions

View file

@ -3,8 +3,7 @@ use std::collections::{BTreeMap, HashMap};
use std::fmt::{Debug, Formatter};
use std::ops::Range;
#[allow(dead_code)]
fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
let mut ranges = vec![];
let mut start = 0;
loop {
@ -24,7 +23,7 @@ fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
ranges
}
fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
let mut word_ranges = vec![];
let mut word_start_pos = 0;
let mut in_word = false;
@ -169,7 +168,7 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
/// Finds unchanged ranges among the ones given as arguments. The data between
/// those ranges is ignored.
fn unchanged_ranges(
pub(crate) fn unchanged_ranges(
left: &[u8],
right: &[u8],
left_ranges: &[Range<usize>],

View file

@ -171,63 +171,32 @@ struct SyncRegion {
right: Range<usize>,
}
fn diff_result_lengths(diff: &diff::Result<&&[u8]>) -> (usize, usize) {
match diff {
diff::Result::Left(&left) => (left.len(), 0),
diff::Result::Both(&left, &right) => (left.len(), right.len()),
diff::Result::Right(&right) => (0, right.len()),
}
}
fn unmodified_regions(
left_tokens: &[&[u8]],
right_tokens: &[&[u8]],
) -> Vec<(Range<usize>, Range<usize>)> {
let diffs = diff_slice(left_tokens, right_tokens);
let mut left_pos = 0;
let mut right_pos = 0;
let mut regions = Vec::new();
for diff in diffs {
let (left_len, right_len) = diff_result_lengths(&diff);
match diff {
diff::Result::Both(&left, &right) if left == right => regions.push((
left_pos..left_pos + left_len,
right_pos..right_pos + right_len,
)),
_ => {}
}
left_pos += left_len;
right_pos += right_len;
}
regions
}
fn find_sync_regions(base: &[u8], left: &[u8], right: &[u8]) -> Vec<SyncRegion> {
let base_tokens = tokenize(base);
let left_tokens = tokenize(left);
let right_tokens = tokenize(right);
let base_tokens = crate::diff::find_line_ranges(base);
let left_tokens = crate::diff::find_line_ranges(left);
let right_tokens = crate::diff::find_line_ranges(right);
let left_regions = unmodified_regions(&base_tokens, &left_tokens);
let right_regions = unmodified_regions(&base_tokens, &right_tokens);
let left_regions = crate::diff::unchanged_ranges(base, left, &base_tokens, &left_tokens);
let right_regions = crate::diff::unchanged_ranges(base, right, &base_tokens, &right_tokens);
let mut left_it = left_regions.iter().peekable();
let mut right_it = right_regions.iter().peekable();
let mut regions: Vec<SyncRegion> = vec![];
while let (Some((left_base_region, left_region)), Some((right_base_region, right_region))) =
while let (Some((left_base_range, left_range)), Some((right_base_range, right_range))) =
(left_it.peek(), right_it.peek())
{
// TODO: if left_base_region and right_base_region at least intersect, use the
// TODO: if left_base_range and right_base_range at least intersect, use the
// intersection of the two regions.
if left_base_region == right_base_region {
if left_base_range == right_base_range {
regions.push(SyncRegion {
base: left_base_region.clone(),
left: left_region.clone(),
right: right_region.clone(),
base: left_base_range.clone(),
left: left_range.clone(),
right: right_range.clone(),
});
left_it.next().unwrap();
right_it.next().unwrap();
} else if left_base_region.start < right_base_region.start {
} else if left_base_range.start < right_base_range.start {
left_it.next().unwrap();
} else {
right_it.next().unwrap();
@ -311,21 +280,11 @@ mod tests {
left: 0..1,
right: 0..1
},
SyncRegion {
base: 1..2,
left: 1..2,
right: 1..2
},
SyncRegion {
base: 2..3,
left: 4..5,
right: 2..3
},
SyncRegion {
base: 3..4,
left: 5..6,
right: 3..4
},
SyncRegion {
base: 4..5,
left: 6..7,