forked from mirrors/jj
merge: use new diff algorithm for finding sync regions
With the histogram diff code from the previous patch, we can now start using it for finding the "sync regions" in 3-way merge. That helps a lot with the slow merging we had before this patch. `jj diff -r 9d540e9726` in the git.git repo drops from 22 s to 0.15 s with this patch. (That commit is a rather arbitrary merge commit from around 5 years ago.) With the new diff algorithm, the output of `jj diff -r 9d540e9726` in git.git looks better if we find unchanged sync regions based on lines rather than on words, so that's what I'm using in this patch. That's a change compared to the LCS-based diff we used before this patch. I suspect the reason that finding sync regions based on words works worse now is not because of the change from LCS to histogram but because of the change in how we define a word. My goal right now is mostly to make it faster; I'll get back to refining the diff result later.
This commit is contained in:
parent
1e657c5331
commit
3c35dbace6
2 changed files with 15 additions and 57 deletions
|
@ -3,8 +3,7 @@ use std::collections::{BTreeMap, HashMap};
|
|||
use std::fmt::{Debug, Formatter};
|
||||
use std::ops::Range;
|
||||
|
||||
#[allow(dead_code)]
|
||||
fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
||||
pub fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
||||
let mut ranges = vec![];
|
||||
let mut start = 0;
|
||||
loop {
|
||||
|
@ -24,7 +23,7 @@ fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
|||
ranges
|
||||
}
|
||||
|
||||
fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
||||
pub fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
||||
let mut word_ranges = vec![];
|
||||
let mut word_start_pos = 0;
|
||||
let mut in_word = false;
|
||||
|
@ -169,7 +168,7 @@ fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
|
|||
|
||||
/// Finds unchanged ranges among the ones given as arguments. The data between
|
||||
/// those ranges is ignored.
|
||||
fn unchanged_ranges(
|
||||
pub(crate) fn unchanged_ranges(
|
||||
left: &[u8],
|
||||
right: &[u8],
|
||||
left_ranges: &[Range<usize>],
|
||||
|
|
|
@ -171,63 +171,32 @@ struct SyncRegion {
|
|||
right: Range<usize>,
|
||||
}
|
||||
|
||||
fn diff_result_lengths(diff: &diff::Result<&&[u8]>) -> (usize, usize) {
|
||||
match diff {
|
||||
diff::Result::Left(&left) => (left.len(), 0),
|
||||
diff::Result::Both(&left, &right) => (left.len(), right.len()),
|
||||
diff::Result::Right(&right) => (0, right.len()),
|
||||
}
|
||||
}
|
||||
|
||||
fn unmodified_regions(
|
||||
left_tokens: &[&[u8]],
|
||||
right_tokens: &[&[u8]],
|
||||
) -> Vec<(Range<usize>, Range<usize>)> {
|
||||
let diffs = diff_slice(left_tokens, right_tokens);
|
||||
let mut left_pos = 0;
|
||||
let mut right_pos = 0;
|
||||
let mut regions = Vec::new();
|
||||
for diff in diffs {
|
||||
let (left_len, right_len) = diff_result_lengths(&diff);
|
||||
match diff {
|
||||
diff::Result::Both(&left, &right) if left == right => regions.push((
|
||||
left_pos..left_pos + left_len,
|
||||
right_pos..right_pos + right_len,
|
||||
)),
|
||||
_ => {}
|
||||
}
|
||||
left_pos += left_len;
|
||||
right_pos += right_len;
|
||||
}
|
||||
regions
|
||||
}
|
||||
|
||||
fn find_sync_regions(base: &[u8], left: &[u8], right: &[u8]) -> Vec<SyncRegion> {
|
||||
let base_tokens = tokenize(base);
|
||||
let left_tokens = tokenize(left);
|
||||
let right_tokens = tokenize(right);
|
||||
let base_tokens = crate::diff::find_line_ranges(base);
|
||||
let left_tokens = crate::diff::find_line_ranges(left);
|
||||
let right_tokens = crate::diff::find_line_ranges(right);
|
||||
|
||||
let left_regions = unmodified_regions(&base_tokens, &left_tokens);
|
||||
let right_regions = unmodified_regions(&base_tokens, &right_tokens);
|
||||
let left_regions = crate::diff::unchanged_ranges(base, left, &base_tokens, &left_tokens);
|
||||
let right_regions = crate::diff::unchanged_ranges(base, right, &base_tokens, &right_tokens);
|
||||
|
||||
let mut left_it = left_regions.iter().peekable();
|
||||
let mut right_it = right_regions.iter().peekable();
|
||||
|
||||
let mut regions: Vec<SyncRegion> = vec![];
|
||||
while let (Some((left_base_region, left_region)), Some((right_base_region, right_region))) =
|
||||
while let (Some((left_base_range, left_range)), Some((right_base_range, right_range))) =
|
||||
(left_it.peek(), right_it.peek())
|
||||
{
|
||||
// TODO: if left_base_region and right_base_region at least intersect, use the
|
||||
// TODO: if left_base_range and right_base_range at least intersect, use the
|
||||
// intersection of the two regions.
|
||||
if left_base_region == right_base_region {
|
||||
if left_base_range == right_base_range {
|
||||
regions.push(SyncRegion {
|
||||
base: left_base_region.clone(),
|
||||
left: left_region.clone(),
|
||||
right: right_region.clone(),
|
||||
base: left_base_range.clone(),
|
||||
left: left_range.clone(),
|
||||
right: right_range.clone(),
|
||||
});
|
||||
left_it.next().unwrap();
|
||||
right_it.next().unwrap();
|
||||
} else if left_base_region.start < right_base_region.start {
|
||||
} else if left_base_range.start < right_base_range.start {
|
||||
left_it.next().unwrap();
|
||||
} else {
|
||||
right_it.next().unwrap();
|
||||
|
@ -311,21 +280,11 @@ mod tests {
|
|||
left: 0..1,
|
||||
right: 0..1
|
||||
},
|
||||
SyncRegion {
|
||||
base: 1..2,
|
||||
left: 1..2,
|
||||
right: 1..2
|
||||
},
|
||||
SyncRegion {
|
||||
base: 2..3,
|
||||
left: 4..5,
|
||||
right: 2..3
|
||||
},
|
||||
SyncRegion {
|
||||
base: 3..4,
|
||||
left: 5..6,
|
||||
right: 3..4
|
||||
},
|
||||
SyncRegion {
|
||||
base: 4..5,
|
||||
left: 6..7,
|
||||
|
|
Loading…
Reference in a new issue