diff: introduce newtype that represents word-range index

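The diff code uses usize both for text indices/ranges and for word-range
indices. Introduce a newtype for word-range indices so that the two kinds of
index are somewhat distinct at the type level.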
This commit is contained in:
Yuya Nishihara 2024-09-24 13:00:36 +09:00
parent 739a5d8617
commit dd93e8f60b

View file

@ -73,6 +73,10 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
.collect() .collect()
} }
/// Index in a list of word (or token) ranges.
///
/// This is a newtype over `usize` so that a position in a word-range list
/// stays distinct from the plain `usize` indices/ranges that address the text
/// itself.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct WordPosition(usize);
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct DiffSource<'input, 'aux> { struct DiffSource<'input, 'aux> {
text: &'input BStr, text: &'input BStr,
@ -87,29 +91,33 @@ impl<'input, 'aux> DiffSource<'input, 'aux> {
} }
} }
fn narrowed(&self, positions: Range<usize>) -> Self { fn narrowed(&self, positions: Range<WordPosition>) -> Self {
DiffSource { DiffSource {
text: self.text, text: self.text,
ranges: &self.ranges[positions], ranges: &self.ranges[positions.start.0..positions.end.0],
} }
} }
/// Returns the range stored at `position` in this source's list of word
/// ranges.
///
/// The result is a clone of the stored entry; entries of `ranges` are used
/// elsewhere in this file to slice `text`.
// NOTE(review): indexing presumably panics when `position` is out of bounds
// for `ranges` — confirm against the field's type, which is not visible here.
fn range_at(&self, position: WordPosition) -> Range<usize> {
self.ranges[position.0].clone()
}
} }
struct Histogram<'a> { struct Histogram<'a> {
word_to_positions: HashMap<&'a BStr, Vec<usize>>, word_to_positions: HashMap<&'a BStr, Vec<WordPosition>>,
count_to_words: BTreeMap<usize, Vec<&'a BStr>>, count_to_words: BTreeMap<usize, Vec<&'a BStr>>,
} }
impl Histogram<'_> { impl Histogram<'_> {
fn calculate<'a>(source: &DiffSource<'a, '_>, max_occurrences: usize) -> Histogram<'a> { fn calculate<'a>(source: &DiffSource<'a, '_>, max_occurrences: usize) -> Histogram<'a> {
let mut word_to_positions: HashMap<&BStr, Vec<usize>> = HashMap::new(); let mut word_to_positions: HashMap<&BStr, Vec<WordPosition>> = HashMap::new();
for (i, range) in source.ranges.iter().enumerate() { for (i, range) in source.ranges.iter().enumerate() {
let word = &source.text[range.clone()]; let word = &source.text[range.clone()];
let positions = word_to_positions.entry(word).or_default(); let positions = word_to_positions.entry(word).or_default();
// Allow one more than max_occurrences, so we can later skip those with more // Allow one more than max_occurrences, so we can later skip those with more
// than max_occurrences // than max_occurrences
if positions.len() <= max_occurrences { if positions.len() <= max_occurrences {
positions.push(i); positions.push(WordPosition(i));
} }
} }
let mut count_to_words: BTreeMap<usize, Vec<&BStr>> = BTreeMap::new(); let mut count_to_words: BTreeMap<usize, Vec<&BStr>> = BTreeMap::new();
@ -284,8 +292,8 @@ fn unchanged_ranges_lcs(
// Produce output ranges, recursing into the modified areas between the elements // Produce output ranges, recursing into the modified areas between the elements
// in the LCS. // in the LCS.
let mut result = vec![]; let mut result = vec![];
let mut previous_left_position = 0; let mut previous_left_position = WordPosition(0);
let mut previous_right_position = 0; let mut previous_right_position = WordPosition(0);
for (left_index, right_index) in lcs { for (left_index, right_index) in lcs {
let left_position = left_positions[left_index].0; let left_position = left_positions[left_index].0;
let right_position = right_positions[right_index].0; let right_position = right_positions[right_index].0;
@ -299,16 +307,13 @@ fn unchanged_ranges_lcs(
result.push(unchanged_nested_range); result.push(unchanged_nested_range);
} }
} }
result.push(( result.push((left.range_at(left_position), right.range_at(right_position)));
left.ranges[left_position].clone(), previous_left_position = WordPosition(left_position.0 + 1);
right.ranges[right_position].clone(), previous_right_position = WordPosition(right_position.0 + 1);
));
previous_left_position = left_position + 1;
previous_right_position = right_position + 1;
} }
// Also recurse into range at end (after common ranges). // Also recurse into range at end (after common ranges).
let skipped_left_positions = previous_left_position..left.ranges.len(); let skipped_left_positions = previous_left_position..WordPosition(left.ranges.len());
let skipped_right_positions = previous_right_position..right.ranges.len(); let skipped_right_positions = previous_right_position..WordPosition(right.ranges.len());
if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() { if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
for unchanged_nested_range in unchanged_ranges( for unchanged_nested_range in unchanged_ranges(
&left.narrowed(skipped_left_positions), &left.narrowed(skipped_left_positions),