From f672c9250942e1f68297b39c119c192e2247ed9f Mon Sep 17 00:00:00 2001
From: Yuya Nishihara <yuya@tcha.org>
Date: Sun, 22 Sep 2024 23:05:34 +0900
Subject: [PATCH] diff: add trait for bytes comparison

This could be implemented as a newtype `Wrapper<'a>(&'a [u8])`, but a lifetime
of the wrap function couldn't be specified correctly:

  fn diff(left: &[u8], right: &[u8], wrap_fn: F, ..)
  where
    F: for<'a> Fn(&'a [u8]) -> W<'a>, // F::Output<'a> can't be specified
    W: Copy + Eq + Hash

If the wrapper were of `&Wrapper([u8])` type, `Fn(&[u8]) -> &W` works. However,
it means we can no longer set comparison parameter (such as Regex) dynamically.

Another idea is to add some filter function of `Fn(&[u8]) -> Cow<'_, [u8]>`
type, but I don't think we would want to pay the allocation cost in
hashing/comparison code. `Fn(&[u8]) -> impl Iterator<Item = &[u8]>` might work,
but it would be equally complex.
---
 lib/src/diff.rs | 82 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)
diff --git a/lib/src/diff.rs b/lib/src/diff.rs
index 5eab27eff..a80a7f6cc 100644
--- a/lib/src/diff.rs
+++ b/lib/src/diff.rs
@@ -17,6 +17,10 @@
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
 use std::collections::HashMap;
+use std::hash::BuildHasher;
+use std::hash::Hash;
+use std::hash::Hasher;
+use std::hash::RandomState;
 use std::iter;
 use std::ops::Range;
 use std::slice;
@@ -71,6 +75,84 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
         .collect()
 }
 
+/// Compares byte sequences based on a certain equivalence property.
+///
+/// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison
+/// object for the following reasons:
+///
+/// a. If it were newtype, a generic `wrap` function would be needed. It
+///    couldn't be expressed as a simple closure:
+///    `for<'a> Fn(&'a [u8]) -> ???<'a>`
+/// b. Dynamic comparison object can be implemented intuitively. For example,
+///    `pattern: &Regex` would have to be copied to all newtype instances if it
+///    were newtype.
+/// c. Hash values can be cached if hashing is controlled externally.
+pub trait CompareBytes {
+    /// Returns true if `left` and `right` are equivalent.
+    fn eq(&self, left: &[u8], right: &[u8]) -> bool;
+
+    /// Generates hash which respects the following property:
+    /// `eq(left, right) => hash(left) == hash(right)`
+    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
+}
+
+// An instance might have e.g. Regex pattern, which can't be trivially copied.
+// Such comparison object can be passed by reference.
+impl<C: CompareBytes + ?Sized> CompareBytes for &C {
+    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
+        <C as CompareBytes>::eq(self, left, right)
+    }
+
+    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
+        <C as CompareBytes>::hash(self, text, state);
+    }
+}
+
+/// Compares byte sequences literally.
+#[derive(Clone, Debug, Default)]
+pub struct CompareBytesExactly;
+
+impl CompareBytes for CompareBytesExactly {
+    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
+        left == right
+    }
+
+    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
+        text.hash(state);
+    }
+}
+
+/// Compares words (or tokens) under a certain hasher configuration.
+#[derive(Clone, Debug, Default)]
+struct WordComparator<C, S> {
+    compare: C,
+    hash_builder: S,
+}
+
+#[allow(unused)] // TODO
+impl<C: CompareBytes> WordComparator<C, RandomState> {
+    fn new(compare: C) -> Self {
+        WordComparator {
+            compare,
+            // TODO: switch to ahash for better performance?
+            hash_builder: RandomState::new(),
+        }
+    }
+}
+
+#[allow(unused)] // TODO
+impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
+    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
+        self.compare.eq(left, right)
+    }
+
+    fn hash_one(&self, text: &[u8]) -> u64 {
+        let mut state = self.hash_builder.build_hasher();
+        self.compare.hash(text, &mut state);
+        state.finish()
+    }
+}
+
 /// Index in a list of word (or token) ranges.
 #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
 struct WordPosition(usize);