From 1e657c533120bd433b54706ccf3dfd8e113cd0ad Mon Sep 17 00:00:00 2001
From: Martin von Zweigbergk <martinvonz@google.com>
Date: Sat, 20 Mar 2021 14:44:00 -0700
Subject: [PATCH] diff: add a histogram(-like?) diff algorithm

The current diff algorithm does a full LCS on the words of the texts,
which is really slow. Diffing the working copy when e.g.
`src/commands.py` has changes far apart takes seconds.

This patch adds an implementation inspired by JGit's Histogram diff. I
say "inspired" because I just didn't quite understand it :P In
particular, I didn't understand what it does when it finds non-unique
elements. I decided to line up the leading common elements on both
sides of the merge. I don't know if that usually gives good enough
results in practice.

I'm sure this can still be optimized a lot, but this seems good enough
as a start. There are also many things to improve about the quality of
the diffs.
---
 lib/benches/diff_bench.rs |  70 +++
 lib/src/diff.rs           | 879 ++++++++++++++++++++++++++++++++++++++
 lib/src/lib.rs            |   1 +
 3 files changed, 950 insertions(+)
 create mode 100644 lib/benches/diff_bench.rs
 create mode 100644 lib/src/diff.rs

diff --git a/lib/benches/diff_bench.rs b/lib/benches/diff_bench.rs
new file mode 100644
index 000000000..7f44bdccd
--- /dev/null
+++ b/lib/benches/diff_bench.rs
@@ -0,0 +1,70 @@
+#![feature(test)]
+
+extern crate test;
+
+use jujube_lib::diff;
+use test::Bencher;
+
+fn unchanged_lines(count: usize) -> (String, String) {
+    let mut lines = vec![];
+    for i in 0..count {
+        lines.push(format!("left line {}\n", i));
+    }
+    (lines.join(""), lines.join(""))
+}
+
+fn modified_lines(count: usize) -> (String, String) {
+    let mut left_lines = vec![];
+    let mut right_lines = vec![];
+    for i in 0..count {
+        left_lines.push(format!("left line {}\n", i));
+        right_lines.push(format!("right line {}\n", i));
+    }
+    (left_lines.join(""), right_lines.join(""))
+}
+
+fn reversed_lines(count: usize) -> (String, String) {
+    let mut left_lines = vec![];
+    for i in 0..count {
+        left_lines.push(format!("left line {}\n", i));
+    }
+    let mut right_lines = left_lines.clone();
+    right_lines.reverse();
+    (left_lines.join(""), right_lines.join(""))
+}
+
+#[bench]
+fn bench_diff_1k_unchanged_lines(b: &mut Bencher) {
+    let (left, right) = unchanged_lines(1000);
+    b.iter(|| diff::diff(left.as_bytes(), right.as_bytes()));
+}
+
+#[bench]
+fn bench_diff_10k_unchanged_lines(b: &mut Bencher) {
+    let (left, right) = unchanged_lines(10000);
+    b.iter(|| diff::diff(left.as_bytes(), right.as_bytes()));
+}
+
+#[bench]
+fn bench_diff_1k_modified_lines(b: &mut Bencher) {
+    let (left, right) = modified_lines(1000);
+    b.iter(|| diff::diff(left.as_bytes(), right.as_bytes()));
+}
+
+#[bench]
+fn bench_diff_10k_modified_lines(b: &mut Bencher) {
+    let (left, right) = modified_lines(10000);
+    b.iter(|| diff::diff(left.as_bytes(), right.as_bytes()));
+}
+
+#[bench]
+fn bench_diff_1k_lines_reversed(b: &mut Bencher) {
+    let (left, right) = reversed_lines(1000);
+    b.iter(|| diff::diff(left.as_bytes(), right.as_bytes()));
+}
+
+#[bench]
+fn bench_diff_10k_lines_reversed(b: &mut Bencher) {
+    let (left, right) = reversed_lines(10000);
+    b.iter(|| diff::diff(left.as_bytes(), right.as_bytes()));
+}
diff --git a/lib/src/diff.rs b/lib/src/diff.rs
new file mode 100644
index 000000000..f20f15202
--- /dev/null
+++ b/lib/src/diff.rs
@@ -0,0 +1,879 @@
+use std::cmp::min;
+use std::collections::{BTreeMap, HashMap};
+use std::fmt::{Debug, Formatter};
+use std::ops::Range;
+
+#[allow(dead_code)]
+fn find_line_ranges(text: &[u8]) -> Vec<Range<usize>> {
+    let mut ranges = vec![];
+    let mut start = 0;
+    loop {
+        match text[start..].iter().position(|b| *b == b'\n') {
+            None => {
+                break;
+            }
+            Some(i) => {
+                ranges.push(start..start + i + 1);
+                start += i + 1;
+            }
+        }
+    }
+    if start < text.len() {
+        ranges.push(start..text.len());
+    }
+    ranges
+}
+
+fn find_word_ranges(text: &[u8]) -> Vec<Range<usize>> {
+    let mut word_ranges = vec![];
+    let mut word_start_pos = 0;
+    let mut in_word = false;
+    for (i, b) in text.iter().enumerate() {
+        // TODO: Make this configurable (probably higher up in the call stack)
+        let is_word_byte = matches!(*b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_');
+        if in_word && !is_word_byte {
+            in_word = false;
+            word_ranges.push(word_start_pos..i);
+            word_start_pos = i;
+        } else if !in_word && is_word_byte {
+            in_word = true;
+            word_start_pos = i;
+        }
+    }
+    if in_word && word_start_pos < text.len() {
+        word_ranges.push(word_start_pos..text.len());
+    }
+    word_ranges
+}
+
+struct Histogram<'a> {
+    word_to_positions: HashMap<&'a [u8], Vec<usize>>,
+    count_to_words: BTreeMap<usize, Vec<&'a [u8]>>,
+}
+
+impl Histogram<'_> {
+    fn calculate<'a>(
+        text: &'a [u8],
+        ranges: &[Range<usize>],
+        max_occurrences: usize,
+    ) -> Histogram<'a> {
+        let mut word_to_positions: HashMap<&[u8], Vec<usize>> = HashMap::new();
+        for (i, range) in ranges.iter().enumerate() {
+            let positions = word_to_positions.entry(&text[range.clone()]).or_default();
+            // Allow one more than max_occurrences, so we can later skip those with more
+            // than max_occurrences
+            if positions.len() <= max_occurrences {
+                positions.push(i);
+            }
+        }
+        let mut count_to_words: BTreeMap<usize, Vec<&[u8]>> = BTreeMap::new();
+        for (word, ranges) in &word_to_positions {
+            count_to_words.entry(ranges.len()).or_default().push(*word);
+        }
+        Histogram {
+            word_to_positions,
+            count_to_words,
+        }
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+enum RangeDiff {
+    Unchanged(Range<usize>, Range<usize>),
+    Replaced(Range<usize>, Range<usize>),
+}
+
+impl RangeDiff {
+    fn is_empty(&self) -> bool {
+        match self {
+            RangeDiff::Unchanged(left_range, right_range) => {
+                left_range.is_empty() && right_range.is_empty()
+            }
+            RangeDiff::Replaced(left_range, right_range) => {
+                left_range.is_empty() && right_range.is_empty()
+            }
+        }
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub enum SliceDiff<'a> {
+    Unchanged(&'a [u8]),
+    Replaced(&'a [u8], &'a [u8]),
+}
+
+impl Debug for SliceDiff<'_> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
+        match self {
+            SliceDiff::Unchanged(data) => f
+                .debug_tuple("Unchanged")
+                .field(&String::from_utf8_lossy(data))
+                .finish(),
+            SliceDiff::Replaced(left, right) => f
+                .debug_tuple("Replaced")
+                .field(&String::from_utf8_lossy(left))
+                .field(&String::from_utf8_lossy(right))
+                .finish(),
+        }
+    }
+}
+
+/// Finds the LCS given an array where the value of `input[i]` indicates that
+/// the position of element `i` in the right array is at position `input[i]` in
+/// the left array.
+///
+/// For example (some have multiple valid outputs):
+///
+/// [0,1,2] => [(0,0),(1,1),(2,2)]
+/// [2,1,0] => [(0,2)]
+/// [0,1,4,2,3,5,6] => [(0,0),(1,1),(2,3),(3,4),(5,5),(6,6)]
+/// [0,1,4,3,2,5,6] => [(0,0),(1,1),(4,2),(5,5),(6,6)]
+fn find_lcs(input: &[usize]) -> Vec<(usize, usize)> {
+    if input.is_empty() {
+        return vec![];
+    }
+
+    let mut chain = vec![(0, 0, 0); input.len()];
+    let mut longest = 0;
+    let mut longest_right_pos = 0;
+    for (right_pos, &left_pos) in input.iter().enumerate() {
+        chain[right_pos] = (1, left_pos, usize::MAX);
+        for i in (0..right_pos).rev() {
+            let (previous_len, previous_left_pos, _) = chain[i];
+            if previous_left_pos < left_pos {
+                let len = previous_len + 1;
+                chain[right_pos] = (len, left_pos, i);
+                if len > longest {
+                    longest = len;
+                    longest_right_pos = right_pos;
+                }
+                break;
+            }
+        }
+    }
+
+    let mut result = vec![];
+    let mut right_pos = longest_right_pos;
+    loop {
+        let (_, left_pos, previous_right_pos) = chain[right_pos];
+        result.push((left_pos, right_pos));
+        if previous_right_pos == usize::MAX {
+            break;
+        }
+        right_pos = previous_right_pos;
+    }
+    result.reverse();
+
+    result
+}
+
+/// Finds unchanged ranges among the ones given as arguments. The data between
+/// those ranges is ignored.
+fn unchanged_ranges(
+    left: &[u8],
+    right: &[u8],
+    left_ranges: &[Range<usize>],
+    right_ranges: &[Range<usize>],
+) -> Vec<(Range<usize>, Range<usize>)> {
+    if left_ranges.is_empty() || right_ranges.is_empty() {
+        return vec![];
+    }
+
+    // TODO: Don't waste time calculating entire histogram. We don't need to keep
+    // data about common entries. If a word has more than N occurrences, we should
+    // just ignore it (and assume that everything changes if we have no less common
+    // words).
+    let max_occurrences = 100;
+    let mut left_histogram = Histogram::calculate(left, left_ranges, max_occurrences);
+    if *left_histogram.count_to_words.first_entry().unwrap().key() > max_occurrences {
+        // If there are very many occurrences of all words, then we just give up.
+        return vec![];
+    }
+    let mut right_histogram = Histogram::calculate(right, right_ranges, max_occurrences);
+    // Look for words with few occurrences in `left` (could equally well have picked
+    // `right`?). If any of them also occur in `right`, then we add the words to
+    // the LCS.
+    let mut uncommon_shared_words = vec![];
+    while !left_histogram.count_to_words.is_empty() && uncommon_shared_words.is_empty() {
+        let left_words = left_histogram.count_to_words.pop_first().unwrap().1;
+        for left_word in left_words {
+            if right_histogram.word_to_positions.contains_key(left_word) {
+                uncommon_shared_words.push(left_word);
+            }
+        }
+    }
+
+    // Let's say our inputs are "a b a b" and "a b c c b a b". We will have found
+    // the least common words to be "a" and "b". We now assume that each
+    // occurrence of each word lines up in the left and right input. We do that
+    // by numbering the shared occurrences, effectively instead comparing "a1 b1
+    // a2 b2" and "a1 b1 c c b2 a2 b". We then walk the common words in the
+    // right input in order (["a1", "b1", "b2", "a2"]), and record the index of
+    // that word in the left input ([0,1,3,2]). We then find the LCS and split
+    // points based on that ([0,1,3] or [0,1,2] are both valid).
+
+    // [(index into left_ranges, word, occurrence #)]
+    let mut left_positions = vec![];
+    let mut right_positions = vec![];
+    for uncommon_shared_word in uncommon_shared_words {
+        let left_occurrences = left_histogram
+            .word_to_positions
+            .get_mut(uncommon_shared_word)
+            .unwrap();
+        let right_occurrences = right_histogram
+            .word_to_positions
+            .get_mut(uncommon_shared_word)
+            .unwrap();
+        let shared_count = min(left_occurrences.len(), right_occurrences.len());
+        for occurrence in 0..shared_count {
+            left_positions.push((
+                left_occurrences[occurrence],
+                uncommon_shared_word,
+                occurrence,
+            ));
+            right_positions.push((
+                right_occurrences[occurrence],
+                uncommon_shared_word,
+                occurrence,
+            ));
+        }
+    }
+    left_positions.sort();
+    right_positions.sort();
+    let mut left_position_map = HashMap::new();
+    for (i, (_pos, word, occurrence)) in left_positions.iter().enumerate() {
+        left_position_map.insert((*word, *occurrence), i);
+    }
+    let mut left_index_by_right_index = vec![];
+    for (_pos, word, occurrence) in &right_positions {
+        left_index_by_right_index.push(*left_position_map.get(&(*word, *occurrence)).unwrap());
+    }
+    let lcs = find_lcs(&left_index_by_right_index);
+
+    // Produce output ranges, recursing into the modified areas between the elements
+    // in the LCS.
+    let mut result = vec![];
+    let mut previous_left_position = 0;
+    let mut previous_right_position = 0;
+    for (left_index, right_index) in lcs {
+        let left_position = left_positions[left_index].0;
+        let right_position = right_positions[right_index].0;
+        let skipped_left_positions = previous_left_position..left_position;
+        let skipped_right_positions = previous_right_position..right_position;
+        if !skipped_left_positions.is_empty() || !skipped_right_positions.is_empty() {
+            for unchanged_nested_range in unchanged_ranges(
+                left,
+                right,
+                &left_ranges[skipped_left_positions.clone()],
+                &right_ranges[skipped_right_positions.clone()],
+            ) {
+                result.push(unchanged_nested_range);
+            }
+        }
+        result.push((
+            left_ranges[left_position].clone(),
+            right_ranges[right_position].clone(),
+        ));
+        previous_left_position = left_position + 1;
+        previous_right_position = right_position + 1;
+    }
+
+    result
+}
+
+/// Adds ranges in between and around the `input` ranges so that the full
+/// ranges of `left` and `right` are covered.
+fn fill_in_range_gaps(
+    left: &[u8],
+    right: &[u8],
+    input: &[(Range<usize>, Range<usize>)],
+) -> Vec<RangeDiff> {
+    let mut output = vec![];
+    let mut previous_left_end_pos = 0;
+    let mut previous_right_end_pos = 0;
+    // Add an empty range at the end in order to fill in any gap just before the
+    // end (without needing to duplicate code for that after the loop).
+    for (left_range, right_range) in input
+        .iter()
+        .chain(&[(left.len()..left.len(), right.len()..right.len())])
+    {
+        let left_gap_range = previous_left_end_pos..left_range.start;
+        let right_gap_range = previous_right_end_pos..right_range.start;
+        if !left_gap_range.is_empty() || !right_gap_range.is_empty() {
+            if left[left_gap_range.clone()] == right[right_gap_range.clone()] {
+                output.push(RangeDiff::Unchanged(left_gap_range, right_gap_range));
+            } else {
+                output.push(RangeDiff::Replaced(left_gap_range, right_gap_range));
+            }
+        }
+        previous_left_end_pos = left_range.end;
+        previous_right_end_pos = right_range.end;
+        if !(left_range.is_empty() && right_range.is_empty()) {
+            output.push(RangeDiff::Unchanged(
+                left_range.clone(),
+                right_range.clone(),
+            ));
+        }
+    }
+
+    output
+}
+
+/// Combines adjacent ranges of the same type into larger ranges. Removes empty
+/// ranges.
+fn compact_ranges(input: &[RangeDiff]) -> Vec<RangeDiff> {
+    if input.is_empty() {
+        return vec![];
+    }
+    let mut output = vec![];
+    let mut current_range = input[0].clone();
+    for range in input.iter().skip(1) {
+        match (&mut current_range, range) {
+            (RangeDiff::Unchanged(left1, right1), RangeDiff::Unchanged(left2, right2)) => {
+                left1.end = left2.end;
+                right1.end = right2.end;
+            }
+            (RangeDiff::Replaced(left1, right1), RangeDiff::Replaced(left2, right2)) => {
+                left1.end = left2.end;
+                right1.end = right2.end;
+            }
+            _ => {
+                // The previous range was unchanged and this one was replaced, or vice versa.
+                // If the new range is empty, just ignore it, so we can possibly compact
+                // with the previous one.
+                if !range.is_empty() {
+                    if !current_range.is_empty() {
+                        output.push(current_range.clone());
+                    }
+                    current_range = range.clone();
+                }
+            }
+        }
+    }
+    if !current_range.is_empty() {
+        output.push(current_range);
+    }
+    output
+}
+
+fn range_diffs_to_slice_diffs<'a>(
+    left: &'a [u8],
+    right: &'a [u8],
+    range_diffs: &[RangeDiff],
+) -> Vec<SliceDiff<'a>> {
+    let mut slice_diffs = vec![];
+    for range in range_diffs {
+        match range {
+            RangeDiff::Unchanged(left_range, _right_range) => {
+                slice_diffs.push(SliceDiff::Unchanged(&left[left_range.clone()]));
+            }
+            RangeDiff::Replaced(left_range, right_range) => {
+                slice_diffs.push(SliceDiff::Replaced(
+                    &left[left_range.clone()],
+                    &right[right_range.clone()],
+                ));
+            }
+        }
+    }
+    slice_diffs
+}
+
+/// Diffs two slices of bytes. The returned diff hunks may be any length (may
+/// span many lines or may be only part of a line). This currently uses
+/// Histogram diff (or maybe something similar; I'm not sure I understood the
+/// algorithm correctly). It runs on the words in the input (not lines) and
+/// ignores non-word characters when trying to find common ranges of text.
+///
+/// TODO: Does it give better results to first diff lines and then diff words
+/// only within modified ranges? Is it faster?
+///
+/// TODO: Diff at even lower level in the non-word ranges?
+pub fn diff<'a>(left: &'a [u8], right: &'a [u8]) -> Vec<SliceDiff<'a>> {
+    if left == right {
+        return vec![SliceDiff::Unchanged(left)];
+    }
+    if left.is_empty() {
+        return vec![SliceDiff::Replaced(b"", right)];
+    }
+    if right.is_empty() {
+        return vec![SliceDiff::Replaced(left, b"")];
+    }
+
+    let left_word_ranges = find_word_ranges(left);
+    let right_word_ranges = find_word_ranges(right);
+    let unchanged_ranges = unchanged_ranges(left, right, &left_word_ranges, &right_word_ranges);
+    let all_ranges = fill_in_range_gaps(left, right, &unchanged_ranges);
+    let compacted_ranges = compact_ranges(&all_ranges);
+    range_diffs_to_slice_diffs(left, right, &compacted_ranges)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_find_line_ranges_empty() {
+        assert_eq!(find_line_ranges(b""), vec![]);
+    }
+
+    #[test]
+    fn test_find_line_ranges_blank_line() {
+        assert_eq!(find_line_ranges(b"\n"), vec![0..1]);
+    }
+
+    #[test]
+    fn test_find_line_ranges_missing_newline_at_eof() {
+        assert_eq!(find_line_ranges(b"foo"), vec![0..3]);
+    }
+
+    #[test]
+    fn test_find_line_ranges_multiple_lines() {
+        assert_eq!(find_line_ranges(b"a\nbb\nccc\n"), vec![0..2, 2..5, 5..9]);
+    }
+
+    #[test]
+    fn test_find_word_ranges_empty() {
+        assert_eq!(find_word_ranges(b""), vec![]);
+    }
+
+    #[test]
+    fn test_find_word_ranges_single_word() {
+        assert_eq!(find_word_ranges(b"Abc"), vec![0..3]);
+    }
+
+    #[test]
+    fn test_find_word_ranges_no_word() {
+        assert_eq!(find_word_ranges(b"+-*/"), vec![]);
+    }
+
+    #[test]
+    fn test_find_word_ranges_word_then_non_word() {
+        assert_eq!(find_word_ranges(b"Abc "), vec![0..3]);
+    }
+
+    #[test]
+    fn test_find_word_ranges_non_word_then_word() {
+        assert_eq!(find_word_ranges(b"   Abc"), vec![3..6]);
+    }
+
+    #[test]
+    fn test_find_lcs_empty() {
+        let empty: Vec<(usize, usize)> = vec![];
+        assert_eq!(find_lcs(&[]), empty);
+    }
+
+    #[test]
+    fn test_find_lcs_single_element() {
+        assert_eq!(find_lcs(&[0]), vec![(0, 0)]);
+    }
+
+    #[test]
+    fn test_find_lcs_in_order() {
+        assert_eq!(find_lcs(&[0, 1, 2]), vec![(0, 0), (1, 1), (2, 2)]);
+    }
+
+    #[test]
+    fn test_find_lcs_reverse_order() {
+        assert_eq!(find_lcs(&[2, 1, 0]), vec![(2, 0)]);
+    }
+
+    #[test]
+    fn test_find_lcs_two_swapped() {
+        assert_eq!(
+            find_lcs(&[0, 1, 4, 3, 2, 5, 6]),
+            vec![(0, 0), (1, 1), (2, 4), (5, 5), (6, 6)]
+        );
+    }
+
+    #[test]
+    fn test_find_lcs_element_moved_earlier() {
+        assert_eq!(
+            find_lcs(&[0, 1, 4, 2, 3, 5, 6]),
+            vec![(0, 0), (1, 1), (2, 3), (3, 4), (5, 5), (6, 6)]
+        );
+    }
+
+    #[test]
+    fn test_find_lcs_interleaved_longest_chains() {
+        assert_eq!(
+            find_lcs(&[0, 4, 2, 9, 6, 5, 1, 3, 7, 8]),
+            vec![(0, 0), (1, 6), (3, 7), (7, 8), (8, 9)]
+        );
+    }
+
+    #[test]
+    fn test_find_word_ranges_many_words() {
+        assert_eq!(
+            find_word_ranges(b"fn find_words(text: &[u8])"),
+            vec![0..2, 3..13, 14..18, 22..24]
+        );
+    }
+
+    #[test]
+    fn test_fill_in_gaps_empty() {
+        assert_eq!(
+            fill_in_range_gaps(b"abc", b"abcde", &[]),
+            vec![RangeDiff::Replaced(0..3, 0..5),]
+        );
+    }
+
+    #[test]
+    fn test_fill_in_gaps_only_middle() {
+        assert_eq!(
+            fill_in_range_gaps(
+                b"a b c",
+                b"a x b y c",
+                &[(0..2, 0..2), (2..4, 4..6), (4..5, 8..9),]
+            ),
+            vec![
+                RangeDiff::Unchanged(0..2, 0..2),
+                RangeDiff::Replaced(2..2, 2..4),
+                RangeDiff::Unchanged(2..4, 4..6),
+                RangeDiff::Replaced(4..4, 6..8),
+                RangeDiff::Unchanged(4..5, 8..9),
+            ]
+        );
+    }
+
+    #[test]
+    fn test_fill_in_gaps_empty_gap() {
+        assert_eq!(
+            fill_in_range_gaps(b"a b", b"a b", &[(0..1, 0..1), (1..2, 1..2), (2..3, 2..3),]),
+            vec![
+                RangeDiff::Unchanged(0..1, 0..1),
+
RangeDiff::Unchanged(1..2, 1..2), + RangeDiff::Unchanged(2..3, 2..3), + ] + ); + } + + #[test] + fn test_fill_in_gaps_before_and_after() { + assert_eq!( + fill_in_range_gaps(b" a ", b" a ", &[(1..2, 1..2),]), + vec![ + RangeDiff::Unchanged(0..1, 0..1), + RangeDiff::Unchanged(1..2, 1..2), + RangeDiff::Unchanged(2..3, 2..3), + ] + ); + } + + #[test] + fn test_compact_ranges_all_unchanged() { + assert_eq!( + compact_ranges(&[ + RangeDiff::Unchanged(0..1, 0..2), + RangeDiff::Unchanged(1..2, 2..4), + RangeDiff::Unchanged(2..3, 4..6), + ]), + vec![RangeDiff::Unchanged(0..3, 0..6),] + ); + } + + #[test] + fn test_compact_ranges_all_replaced() { + assert_eq!( + compact_ranges(&[ + RangeDiff::Replaced(0..1, 0..2), + RangeDiff::Replaced(1..2, 2..4), + RangeDiff::Replaced(2..3, 4..6), + ]), + vec![RangeDiff::Replaced(0..3, 0..6),] + ); + } + + #[test] + fn test_compact_ranges_mixed() { + assert_eq!( + compact_ranges(&[ + RangeDiff::Replaced(0..1, 0..2), + RangeDiff::Replaced(1..2, 2..4), + RangeDiff::Unchanged(2..3, 4..6), + RangeDiff::Unchanged(3..4, 6..8), + RangeDiff::Replaced(4..5, 8..10), + RangeDiff::Replaced(5..6, 10..12), + ]), + vec![ + RangeDiff::Replaced(0..2, 0..4), + RangeDiff::Unchanged(2..4, 4..8), + RangeDiff::Replaced(4..6, 8..12), + ] + ); + } + + #[test] + fn test_compact_ranges_mixed_empty_range() { + assert_eq!( + compact_ranges(&[ + RangeDiff::Replaced(0..1, 0..2), + RangeDiff::Replaced(1..2, 2..4), + RangeDiff::Unchanged(2..2, 4..4), + RangeDiff::Replaced(3..4, 6..8), + RangeDiff::Replaced(4..5, 8..10), + ]), + vec![RangeDiff::Replaced(0..5, 0..10)] + ); + } + + #[test] + fn test_unchanged_ranges_insert_in_middle() { + assert_eq!( + unchanged_ranges( + b"a b b c", + b"a b X b c", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7, 8..9], + ), + vec![(0..1, 0..1), (2..3, 2..3), (4..5, 6..7), (6..7, 8..9)] + ); + } + + #[test] + fn test_unchanged_ranges_non_unique_removed() { + assert_eq!( + unchanged_ranges( + b"a a a a", + b"a b a c", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + vec![(0..1, 0..1), (2..3, 4..5)] + ); + } + + #[test] + fn test_unchanged_ranges_non_unique_added() { + assert_eq!( + unchanged_ranges( + b"a b a c", + b"a a a a", + &[0..1, 2..3, 4..5, 6..7], + &[0..1, 2..3, 4..5, 6..7], + ), + vec![(0..1, 0..1), (4..5, 2..3)] + ); + } + + #[test] + fn test_diff_nothing_in_common() { + assert_eq!( + diff(b"aaa", b"bb"), + vec![SliceDiff::Replaced(b"aaa", b"bb")] + ); + } + + #[test] + fn test_diff_insert_in_middle() { + assert_eq!( + diff(b"a z", b"a S z"), + vec![ + SliceDiff::Unchanged(b"a"), + SliceDiff::Replaced(b" ", b" S "), + SliceDiff::Unchanged(b"z"), + ] + ); + } + + #[test] + fn test_diff_no_unique_middle_flips() { + assert_eq!( + diff(b"a R R S S z", b"a S S R R z"), + vec![ + SliceDiff::Unchanged(b"a"), + SliceDiff::Replaced(b" R R ", b" "), + SliceDiff::Unchanged(b"S S"), + SliceDiff::Replaced(b" ", b" R R "), + SliceDiff::Unchanged(b"z") + ], + ); + } + + #[test] + fn test_diff_recursion_needed() { + assert_eq!( + diff( + b"a q x q y q z q b q y q x q c", + b"a r r x q y z q b y q x r r c", + ), + vec![ + SliceDiff::Unchanged(b"a"), + SliceDiff::Replaced(b" q ", b" r r "), + SliceDiff::Unchanged(b"x q y"), + SliceDiff::Replaced(b" q ", b" "), + SliceDiff::Unchanged(b"z q b"), + SliceDiff::Replaced(b" q ", b" "), + SliceDiff::Unchanged(b"y q x"), + SliceDiff::Replaced(b" q ", b" r r "), + SliceDiff::Unchanged(b"c"), + ] + ); + } + + #[test] + fn test_diff_gitgit_http_c() { + assert_eq!( + diff( + br##"/* + * GIT - The information 
manager from hell + * + * Copyright (C) Linus Torvalds, 2005 + */ +#include "#cache.h" + +static int unpack(unsigned char *sha1) +{ + void *buffer; + unsigned long size; + char type[20]; + + buffer = read_sha1_file(sha1, type, &size); + if (!buffer) + usage("unable to read sha1 file"); + if (strcmp(type, "tree")) + usage("expected a 'tree' node"); + while (size) { + int len = strlen(buffer)+1; + unsigned char *sha1 = buffer + len; + char *path = strchr(buffer, ' ')+1; + unsigned int mode; + if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1) + usage("corrupt 'tree' file"); + buffer = sha1 + 20; + size -= len + 20; + printf("%o %s (%s)\n", mode, path, sha1_to_hex(sha1)); + } + return 0; +} + +int main(int argc, char **argv) +{ + int fd; + unsigned char sha1[20]; + + if (argc != 2) + usage("read-tree "); + if (get_sha1_hex(argv[1], sha1) < 0) + usage("read-tree "); + sha1_file_directory = getenv(DB_ENVIRONMENT); + if (!sha1_file_directory) + sha1_file_directory = DEFAULT_DB_ENVIRONMENT; + if (unpack(sha1) < 0) + usage("unpack failed"); + return 0; +} +"##, + br##"/* + * GIT - The information manager from hell + * + * Copyright (C) Linus Torvalds, 2005 + */ +#include "#cache.h" + +static void create_directories(const char *path) +{ + int len = strlen(path); + char *buf = malloc(len + 1); + const char *slash = path; + + while ((slash = strchr(slash+1, '/')) != NULL) { + len = slash - path; + memcpy(buf, path, len); + buf[len] = 0; + mkdir(buf, 0700); + } +} + +static int create_file(const char *path) +{ + int fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600); + if (fd < 0) { + if (errno == ENOENT) { + create_directories(path); + fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600); + } + } + return fd; +} + +static int unpack(unsigned char *sha1) +{ + void *buffer; + unsigned long size; + char type[20]; + + buffer = read_sha1_file(sha1, type, &size); + if (!buffer) + usage("unable to read sha1 file"); + if (strcmp(type, "tree")) + usage("expected a 'tree' node"); + while (size) { + int len = strlen(buffer)+1; + unsigned char *sha1 = buffer + len; + char *path = strchr(buffer, ' ')+1; + char *data; + unsigned long filesize; + unsigned int mode; + int fd; + + if (size < len + 20 || sscanf(buffer, "%o", &mode) != 1) + usage("corrupt 'tree' file"); + buffer = sha1 + 20; + size -= len + 20; + data = read_sha1_file(sha1, type, &filesize); + if (!data || strcmp(type, "blob")) + usage("tree file refers to bad file data"); + fd = create_file(path); + if (fd < 0) + usage("unable to create file"); + if (write(fd, data, filesize) != filesize) + usage("unable to write file"); + fchmod(fd, mode); + close(fd); + free(data); + } + return 0; +} + +int main(int argc, char **argv) +{ + int fd; + unsigned char sha1[20]; + + if (argc != 2) + usage("read-tree "); + if (get_sha1_hex(argv[1], sha1) < 0) + usage("read-tree "); + sha1_file_directory = getenv(DB_ENVIRONMENT); + if (!sha1_file_directory) + sha1_file_directory = DEFAULT_DB_ENVIRONMENT; + if (unpack(sha1) < 0) + usage("unpack failed"); + return 0; +} +"##, + ), + // TODO: It would be better to break before the initial "static" (at the newline) + // TODO: Move matching whitespace at ends of replaced section out into unchanged section + vec![ + SliceDiff::Unchanged(b"/*\n * GIT - The information manager from hell\n *\n * Copyright (C) Linus Torvalds, 2005\n */\n#include \"#cache.h\"\n\nstatic"), + SliceDiff::Replaced(b" int unpack(unsigned char *sha1)\n{\n\t", b" "), + SliceDiff::Unchanged(b"void"), + SliceDiff::Replaced(b" *buffer;\n\t", b" 
create_directories(const char *path)\n{\n\tint len = strlen(path);\n\tchar *buf = malloc(len + 1);\n\tconst char *slash = path;\n\n\twhile ((slash = strchr(slash+1, \'/\')) != NULL) {\n\t\tlen = slash - path;\n\t\tmemcpy(buf, path, len);\n\t\tbuf[len] = 0;\n\t\tmkdir(buf, 0700);\n\t}\n}\n\nstatic int create_file(const char *path)\n{\n\tint fd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\tif (fd < 0) {\n\t\tif (errno == ENOENT) {\n\t\t\tcreate_directories(path);\n\t\t\tfd = open(path, O_WRONLY | O_TRUNC | O_CREAT, 0600);\n\t\t}\n\t}\n\treturn fd;\n}\n\nstatic int unpack("), + SliceDiff::Unchanged(b"unsigned"), + SliceDiff::Replaced(b" ", b" char *sha1)\n{\n\tvoid *buffer;\n\tunsigned "), + SliceDiff::Unchanged(b"long size;\n\tchar type[20];\n\n\tbuffer = read_sha1_file(sha1, type, &size);\n\tif (!buffer)\n\t\tusage(\"unable to read sha1 file\");\n\tif (strcmp(type, \"tree\"))\n\t\tusage(\"expected a \'tree\' node\");\n\twhile (size) {\n\t\tint len = strlen(buffer)+1;\n\t\tunsigned char *sha1 = buffer + len;\n\t\tchar *path = strchr(buffer, \' \')+1"), + SliceDiff::Replaced(b";\n\t\t", b";\n\t\tchar *data;\n\t\t"), + SliceDiff::Unchanged(b"unsigned"), + SliceDiff::Replaced(b" ", b" long filesize;\n\t\tunsigned "), + SliceDiff::Unchanged(b"int mode"), + SliceDiff::Replaced(b";\n\t\t", b";\n\t\tint fd;\n\n\t\t"), + SliceDiff::Unchanged(b"if (size < len + 20 || sscanf(buffer, \"%o\", &mode) != 1)\n\t\t\tusage(\"corrupt \'tree\' file\");\n\t\tbuffer = sha1 + 20;\n\t\tsize -= len + 20"), + SliceDiff::Replaced(b";\n\t\tprintf(\"%o %s (%s)\\n\", ", b";\n\t\tdata = read_sha1_file(sha1, type, &filesize);\n\t\tif (!data || strcmp(type, \"blob\"))\n\t\t\tusage(\"tree file refers to bad file data\");\n\t\tfd = create_file(path);\n\t\tif (fd < 0)\n\t\t\tusage(\"unable to create file\");\n\t\tif (write(fd, data, filesize) != filesize)\n\t\t\tusage(\"unable to write file\");\n\t\tfchmod(fd, "), + SliceDiff::Unchanged(b"mode"), + SliceDiff::Replaced(b", path, sha1_to_hex(sha1));\n\t}\n\t", b");\n\t\tclose(fd);\n\t\tfree(data);\n\t}\n\t"), + SliceDiff::Unchanged(b"return 0;\n}\n\nint main(int argc, char **argv)\n{\n\tint fd;\n\tunsigned char sha1[20];\n\n\tif (argc != 2)\n\t\tusage(\"read-tree \");\n\tif (get_sha1_hex(argv[1], sha1) < 0)\n\t\tusage(\"read-tree \");\n\tsha1_file_directory = getenv(DB_ENVIRONMENT);\n\tif (!sha1_file_directory)\n\t\tsha1_file_directory = DEFAULT_DB_ENVIRONMENT;\n\tif (unpack(sha1) < 0)\n\t\tusage(\"unpack failed\");\n\treturn 0;\n}\n") + ] + ); + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index b739e0d42..bc8793c12 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -23,6 +23,7 @@ pub mod commit; pub mod commit_builder; pub mod conflicts; pub mod dag_walk; +pub mod diff; pub mod evolution; pub mod files; pub mod git;
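
Not part of the patch above: a minimal sketch of how a caller might consume the new public API. It assumes only what the patch and the benchmark file show (crate `jujube_lib`, module `diff`, hunks returned as `SliceDiff::Unchanged`/`SliceDiff::Replaced`); the `main` function, the sample inputs, and the printing format are illustrative, not how jujube itself renders diffs.

    // Sketch only: walk the hunks returned by diff::diff and print each one.
    use jujube_lib::diff::{diff, SliceDiff};

    fn main() {
        let left = b"int foo(int x)\n";
        let right = b"int foo(int x, int y)\n";
        for hunk in diff(left, right) {
            match hunk {
                // Bytes that are identical on both sides.
                SliceDiff::Unchanged(data) => {
                    println!("unchanged: {:?}", String::from_utf8_lossy(data));
                }
                // Bytes on the left that were replaced by the bytes on the right.
                SliceDiff::Replaced(before, after) => {
                    println!(
                        "replaced:  {:?} -> {:?}",
                        String::from_utf8_lossy(before),
                        String::from_utf8_lossy(after)
                    );
                }
            }
        }
    }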
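
The trickiest step in `unchanged_ranges` is the occurrence-numbering trick described in the "a b a b" / "a b c c b a b" comment. The standalone sketch below re-derives the `[0, 1, 3, 2]` mapping from that comment; the `shared` word list is hard-coded where the real code takes the least common words from the histograms, so this is an illustration of the idea, not the patch's implementation.

    // Sketch only: pair up shared words by occurrence number, then map each shared
    // word in the right input to the index of its partner in the left input.
    use std::collections::HashMap;

    fn main() {
        let left = ["a", "b", "a", "b"];
        let right = ["a", "b", "c", "c", "b", "a", "b"];
        // In the real code these come from the word histograms; hard-coded here.
        let shared = ["a", "b"];

        // (position, word, occurrence #) for each shared occurrence, as in the patch.
        let mut left_positions = vec![];
        let mut right_positions = vec![];
        for &word in &shared {
            let lefts: Vec<usize> = left
                .iter()
                .enumerate()
                .filter(|(_, w)| **w == word)
                .map(|(i, _)| i)
                .collect();
            let rights: Vec<usize> = right
                .iter()
                .enumerate()
                .filter(|(_, w)| **w == word)
                .map(|(i, _)| i)
                .collect();
            for occurrence in 0..lefts.len().min(rights.len()) {
                left_positions.push((lefts[occurrence], word, occurrence));
                right_positions.push((rights[occurrence], word, occurrence));
            }
        }
        left_positions.sort();
        right_positions.sort();

        // Index of each (word, occurrence) pair within the sorted left positions.
        let left_index: HashMap<(&str, usize), usize> = left_positions
            .iter()
            .enumerate()
            .map(|(i, (_, word, occurrence))| ((*word, *occurrence), i))
            .collect();
        // For every shared word in the right input (in order), the index of its
        // partner occurrence in the left input.
        let left_index_by_right_index: Vec<usize> = right_positions
            .iter()
            .map(|(_, word, occurrence)| left_index[&(*word, *occurrence)])
            .collect();

        // Prints [0, 1, 3, 2], matching the worked example in the comment.
        println!("{:?}", left_index_by_right_index);
    }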