diff: accept diff inputs by generic iterator

This helps migrate internal [u8] variables to BStr.

b"" literals in tests are changed to &str to get around potential type
incompatibility between &[u8; N] arrays of different lengths N.
This commit is contained in:
Yuya Nishihara 2024-07-10 17:34:52 +09:00
parent 2ca3bad0ee
commit 59daef2351
5 changed files with 30 additions and 26 deletions

View file

@@ -852,7 +852,7 @@ fn unified_diff_hunks<'content>(
right_line_range: 1..1,
lines: vec![],
};
let diff = Diff::by_line(&[left_content, right_content]);
let diff = Diff::by_line([left_content, right_content]);
let mut diff_hunks = diff.hunks().peekable();
while let Some(hunk) = diff_hunks.next() {
match hunk {
@@ -910,7 +910,7 @@ fn inline_diff_hunks<'content>(
// Like Diff::default_refinement(), but doesn't try to match up contents by
// lines. We know left/right_contents have no matching lines.
let mut diff = Diff::for_tokenizer(&[left_content, right_content], diff::find_word_ranges);
let mut diff = Diff::for_tokenizer([left_content, right_content], diff::find_word_ranges);
diff.refine_changed_regions(diff::find_nonword_ranges);
for hunk in diff.hunks() {
match hunk {
@@ -1128,7 +1128,7 @@ fn get_diff_stat(
// TODO: this matches git's behavior, which is to count the number of newlines
// in the file. but that behavior seems unhelpful; no one really cares how
// many `0x0a` characters are in an image.
let diff = Diff::by_line(&[&left_content.contents, &right_content.contents]);
let diff = Diff::by_line([&left_content.contents, &right_content.contents]);
let mut added = 0;
let mut removed = 0;
for hunk in diff.hunks() {

View file

@@ -225,7 +225,7 @@ fn make_diff_sections(
left_contents: &str,
right_contents: &str,
) -> Result<Vec<scm_record::Section<'static>>, BuiltinToolError> {
let diff = Diff::by_line(&[left_contents.as_bytes(), right_contents.as_bytes()]);
let diff = Diff::by_line([left_contents.as_bytes(), right_contents.as_bytes()]);
let mut sections = Vec::new();
for hunk in diff.hunks() {
match hunk {

View file

@@ -259,12 +259,12 @@ pub fn materialize_merge_result(
output.write_all(&left.0)?;
continue;
};
let diff1 = Diff::by_line(&[&left.0, &right1.0]).hunks().collect_vec();
let diff1 = Diff::by_line([&left.0, &right1.0]).hunks().collect_vec();
// Check if the diff against the next positive term is better. Since
// we want to preserve the order of the terms, we don't match against
// any later positive terms.
if let Some(right2) = hunk.get_add(add_index + 1) {
let diff2 = Diff::by_line(&[&left.0, &right2.0]).hunks().collect_vec();
let diff2 = Diff::by_line([&left.0, &right2.0]).hunks().collect_vec();
if diff_size(&diff2) < diff_size(&diff1) {
// If the next positive term is a better match, emit
// the current positive term as a snapshot and the next

View file

@@ -408,13 +408,13 @@ fn intersect_regions(
}
impl<'input> Diff<'input> {
pub fn for_tokenizer(
inputs: &[&'input [u8]],
pub fn for_tokenizer<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
tokenizer: impl Fn(&[u8]) -> Vec<Range<usize>>,
) -> Self {
assert!(!inputs.is_empty());
let base_input = inputs[0];
let other_inputs = inputs.iter().skip(1).copied().collect_vec();
let mut inputs = inputs.into_iter().map(AsRef::as_ref);
let base_input = inputs.next().expect("inputs must not be empty");
let other_inputs = inputs.collect_vec();
// First tokenize each input
let base_token_ranges = tokenizer(base_input);
let other_token_ranges = other_inputs
@@ -471,12 +471,16 @@ impl<'input> Diff<'input> {
diff
}
pub fn unrefined(inputs: &[&'input [u8]]) -> Self {
pub fn unrefined<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, |_| vec![])
}
/// Compares `inputs` line by line.
pub fn by_line(inputs: &[&'input [u8]]) -> Self {
pub fn by_line<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
Diff::for_tokenizer(inputs, find_line_ranges)
}
@@ -486,7 +490,9 @@ impl<'input> Diff<'input> {
// That would let each user decide which hunks to refine. However, it would
// probably mean that many callers repeat the same code. Perhaps it
// should be possible to refine a whole diff *or* individual hunks.
pub fn default_refinement(inputs: &[&'input [u8]]) -> Self {
pub fn default_refinement<T: AsRef<[u8]> + ?Sized + 'input>(
inputs: impl IntoIterator<Item = &'input T>,
) -> Self {
let mut diff = Diff::for_tokenizer(inputs, find_line_ranges);
diff.refine_changed_regions(find_word_ranges);
diff.refine_changed_regions(find_nonword_ranges);
@@ -526,7 +532,7 @@ impl<'input> Diff<'input> {
slices.push(&self.other_inputs[i][changed_range]);
}
let refined_diff = Diff::for_tokenizer(&slices, &tokenizer);
let refined_diff = Diff::for_tokenizer(slices, &tokenizer);
for UnchangedRange {
base_range,
@@ -663,7 +669,7 @@ pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<DiffHunk<'a>> {
return vec![DiffHunk::Different(vec![left, b""])];
}
Diff::default_refinement(&[left, right])
Diff::default_refinement([left, right])
.hunks()
.collect_vec()
}
@@ -967,19 +973,19 @@ mod tests {
#[test]
fn test_diff_single_input() {
let diff = Diff::default_refinement(&[b"abc"]);
let diff = Diff::default_refinement(["abc"]);
assert_eq!(diff.hunks().collect_vec(), vec![DiffHunk::Matching(b"abc")]);
}
#[test]
fn test_diff_single_empty_input() {
let diff = Diff::default_refinement(&[b""]);
let diff = Diff::default_refinement([""]);
assert_eq!(diff.hunks().collect_vec(), vec![]);
}
#[test]
fn test_diff_two_inputs_one_different() {
let diff = Diff::default_refinement(&[b"a b c", b"a X c"]);
let diff = Diff::default_refinement(["a b c", "a X c"]);
assert_eq!(
diff.hunks().collect_vec(),
vec![
@@ -992,7 +998,7 @@ mod tests {
#[test]
fn test_diff_multiple_inputs_one_different() {
let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a b c"]);
let diff = Diff::default_refinement(["a b c", "a X c", "a b c"]);
assert_eq!(
diff.hunks().collect_vec(),
vec![
@@ -1005,7 +1011,7 @@ mod tests {
#[test]
fn test_diff_multiple_inputs_all_different() {
let diff = Diff::default_refinement(&[b"a b c", b"a X c", b"a c X"]);
let diff = Diff::default_refinement(["a b c", "a X c", "a c X"]);
assert_eq!(
diff.hunks().collect_vec(),
vec![
@@ -1021,7 +1027,7 @@ mod tests {
fn test_diff_for_tokenizer_compacted() {
// Tests that unchanged regions are compacted when using for_tokenizer()
let diff = Diff::for_tokenizer(
&[b"a\nb\nc\nd\ne\nf\ng", b"a\nb\nc\nX\ne\nf\ng"],
["a\nb\nc\nd\ne\nf\ng", "a\nb\nc\nX\ne\nf\ng"],
find_line_ranges,
);
assert_eq!(

View file

@@ -17,8 +17,6 @@
use std::collections::VecDeque;
use std::fmt::{Debug, Error, Formatter};
use itertools::Itertools;
use crate::diff;
use crate::diff::{Diff, DiffHunk};
use crate::merge::{trivial_merge, Merge};
@@ -163,9 +161,9 @@ pub fn merge(slices: &Merge<&[u8]>) -> MergeResult {
// usually done for 3-way conflicts. Are there better heuristics when there are
// more than 3 parts?
let num_diffs = slices.removes().len();
let diff_inputs = slices.removes().chain(slices.adds()).copied().collect_vec();
let diff_inputs = slices.removes().chain(slices.adds());
let diff = Diff::by_line(&diff_inputs);
let diff = Diff::by_line(diff_inputs);
let mut resolved_hunk = ContentHunk(vec![]);
let mut merge_hunks: Vec<Merge<ContentHunk>> = vec![];
for diff_hunk in diff.hunks() {