diff: implement some ignore-space rules

The added comparison functions correspond to --ignore-all-space and --ignore-space-change. --ignore-space-at-eol can be combined with the other flags, so it will have to be implemented as a preprocessing function. --ignore-blank-lines will also require some change in the tokenizer function.
2024-09-22 23:05:34 +09:00 · 2024-09-22 23:05:34 +09:00 · de137c8f9a
commit de137c8f9a
parent f672c92509
1 changed files with 109 additions and 0 deletions
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@ -75,6 +75,38 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
        .collect()
 }
 /// Yields the bytes of `text` with every ASCII whitespace byte removed.
 ///
 /// Whitespace is decided by [`u8::is_ascii_whitespace`]; all other bytes are
 /// passed through unchanged and in their original order.
 fn bytes_ignore_all_whitespace(text: &[u8]) -> impl Iterator<Item = u8> + '_ {
    text.iter()
        .filter(|byte| !byte.is_ascii_whitespace())
        .copied()
 }
 /// Yields the bytes of `text` with each run of ASCII whitespace collapsed
 /// into a single `b' '`.
 ///
 /// Non-whitespace bytes are passed through unchanged. The first byte of a
 /// whitespace run is replaced by a space regardless of which whitespace byte
 /// it was, and the remaining bytes of that run are dropped.
 fn bytes_ignore_whitespace_amount(text: &[u8]) -> impl Iterator<Item = u8> + '_ {
    // Tracks whether the previously seen byte belonged to a whitespace run.
    let mut inside_run = false;
    text.iter().copied().filter_map(move |byte| {
        if !byte.is_ascii_whitespace() {
            inside_run = false;
            return Some(byte);
        }
        // Only the byte that opens a run produces output.
        let opens_run = !inside_run;
        inside_run = true;
        opens_run.then_some(b' ')
    })
 }
 /// Feeds every item of `data` into `state`, then writes the item count.
 ///
 /// The count is written last, via `Hasher::write_usize`, because it is
 /// accumulated while `data` is being consumed.
 fn hash_with_length_suffix<I, H>(data: I, state: &mut H)
 where
    I: IntoIterator,
    I::Item: Hash,
    H: Hasher,
 {
    let count = data.into_iter().fold(0_usize, |seen, item| {
        item.hash(state);
        seen + 1
    });
    state.write_usize(count);
 }
 /// Compares byte sequences based on a certain equivalence property.
 ///
 /// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison
@ -122,6 +154,34 @@ impl CompareBytes for CompareBytesExactly {
    }
 }
 /// Compares byte sequences as if every ASCII whitespace byte were absent.
 ///
 /// Two inputs are equal when their non-whitespace bytes match in order, and
 /// hashing is fed the same filtered bytes so that equal inputs hash equally.
 #[derive(Clone, Debug, Default)]
 pub struct CompareBytesIgnoreAllWhitespace;
 impl CompareBytes for CompareBytesIgnoreAllWhitespace {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        let lhs = bytes_ignore_all_whitespace(left);
        let rhs = bytes_ignore_all_whitespace(right);
        Iterator::eq(lhs, rhs)
    }
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        let significant = bytes_ignore_all_whitespace(text);
        hash_with_length_suffix(significant, state);
    }
 }
 /// Compares byte sequences treating any run of ASCII whitespace as one space.
 ///
 /// Only the amount and kind of whitespace is ignored: a position with
 /// whitespace still differs from one without. Hashing is fed the same
 /// collapsed bytes so that equal inputs hash equally.
 #[derive(Clone, Debug, Default)]
 pub struct CompareBytesIgnoreWhitespaceAmount;
 impl CompareBytes for CompareBytesIgnoreWhitespaceAmount {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        let lhs = bytes_ignore_whitespace_amount(left);
        let rhs = bytes_ignore_whitespace_amount(right);
        Iterator::eq(lhs, rhs)
    }
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        let collapsed = bytes_ignore_whitespace_amount(text);
        hash_with_length_suffix(collapsed, state);
    }
 }
 /// Compares words (or tokens) under a certain hasher configuration.
 #[derive(Clone, Debug, Default)]
 struct WordComparator<C, S> {
@ -891,6 +951,55 @@ mod tests {
        );
    }
    #[test]
    fn test_compare_bytes_ignore_all_whitespace() {
        let comp = WordComparator::new(CompareBytesIgnoreAllWhitespace);
        let hash = |data: &[u8]| comp.hash_one(data);

        // Pairs that compare equal once all whitespace is dropped.
        let equal_pairs: &[(&[u8], &[u8])] = &[
            (b"", b""),
            (b"", b" "),
            (b"\t", b"\r"),
            (b"ab", b" a  b\t"),
        ];
        for &(left, right) in equal_pairs {
            assert!(comp.eq(left, right));
        }

        // Pairs that must produce the same hash.
        let same_hash_pairs: &[(&[u8], &[u8])] = &[
            (b"", b""),
            (b"", b" "),
            (b"", b"\t"),
            (b"", b"\r"),
            (b"ab", b" a  b\t"),
        ];
        for &(left, right) in same_hash_pairs {
            assert_eq!(hash(left), hash(right));
        }

        // Pairs that differ in their non-whitespace content.
        let unequal_pairs: &[(&[u8], &[u8])] = &[
            (b"a", b""),
            (b"a", b" "),
            (b"a", b"ab"),
            (b"ab", b"ba"),
        ];
        for &(left, right) in unequal_pairs {
            assert!(!comp.eq(left, right));
        }
    }
    #[test]
    fn test_compare_bytes_ignore_whitespace_amount() {
        let comp = WordComparator::new(CompareBytesIgnoreWhitespaceAmount);
        let hash = |data: &[u8]| comp.hash_one(data);

        // Pairs that compare equal once whitespace runs are collapsed.
        let equal_pairs: &[(&[u8], &[u8])] = &[
            (b"", b""),
            (b"\n", b" \n"),
            (b"\t", b"\r"),
            (b"a b c\n", b"a  b\tc\r\n"),
        ];
        for &(left, right) in equal_pairs {
            assert!(comp.eq(left, right));
        }

        // Pairs that must produce the same hash.
        let same_hash_pairs: &[(&[u8], &[u8])] = &[
            (b"", b""),
            (b" ", b"\n"),
            (b" ", b" \n"),
            (b" ", b"\t"),
            (b" ", b"\r"),
            (b"a b c\n", b"a  b\tc\r\n"),
        ];
        for &(left, right) in same_hash_pairs {
            assert_eq!(hash(left), hash(right));
        }

        // Presence versus absence of whitespace is still a difference.
        let unequal_pairs: &[(&[u8], &[u8])] = &[
            (b"", b" "),
            (b"a", b""),
            (b"a", b" "),
            (b"a", b"a "),
            (b"a", b" a"),
            (b"a", b"ab"),
            (b"ab", b"ba"),
            (b"ab", b"a b"),
        ];
        for &(left, right) in unequal_pairs {
            assert!(!comp.eq(left, right));
        }
    }
    fn unchanged_ranges(
        left: &DiffSource,
        right: &DiffSource,