From f672c9250942e1f68297b39c119c192e2247ed9f Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Sun, 22 Sep 2024 23:05:34 +0900 Subject: [PATCH] diff: add trait for bytes comparison This could be implemented as a newtype `Wrapper<'a>(&'a [u8])`, but a lifetime of the wrap function couldn't be specified correctly: fn diff<F, W>(left: &[u8], right: &[u8], wrap_fn: F, ..) where F: for<'a> Fn(&'a [u8]) -> W<'a>, // F::Output<'a> can't be specified W: Copy + Eq + Hash If the wrapper were of `&Wrapper([u8])` type, `Fn(&[u8]) -> &W` works. However, it means we can no longer set comparison parameter (such as Regex) dynamically. Another idea is to add some filter function of `Fn(&[u8]) -> Cow<'_, [u8]>` type, but I don't think we would want to pay the allocation cost in hashing/comparison code. `Fn(&[u8]) -> impl Iterator` might work, but it would be equally complex. --- lib/src/diff.rs | 82 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/lib/src/diff.rs b/lib/src/diff.rs index 5eab27eff..a80a7f6cc 100644 --- a/lib/src/diff.rs +++ b/lib/src/diff.rs @@ -17,6 +17,10 @@ use std::cmp::Ordering; use std::collections::BTreeMap; use std::collections::HashMap; +use std::hash::BuildHasher; +use std::hash::Hash; +use std::hash::Hasher; +use std::hash::RandomState; use std::iter; use std::ops::Range; use std::slice; @@ -71,6 +75,84 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> { .collect() } +/// Compares byte sequences based on a certain equivalence property. +/// +/// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison +/// object for the following reasons: +/// +/// a. If it were newtype, a generic `wrap` function would be needed. It +/// couldn't be expressed as a simple closure: +/// `for<'a> Fn(&'a [u8]) -> ???<'a>` +/// b. Dynamic comparison object can be implemented intuitively. For example, +/// `pattern: &Regex` would have to be copied to all newtype instances if it +/// were newtype. +/// c. 
///    Hash values can be cached if hashing is controlled externally.
///
/// Implementors must keep the two methods consistent: inputs that `eq`
/// reports as equivalent have to feed identical data to the hasher.
pub trait CompareBytes {
    /// Returns true if `left` and `right` are equivalent.
    fn eq(&self, left: &[u8], right: &[u8]) -> bool;

    /// Generates hash which respects the following property:
    /// `eq(left, right) => hash(left) == hash(right)`
    ///
    /// The hash is written into `state`; the caller chooses the hasher.
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
}

// An instance might have e.g. Regex pattern, which can't be trivially copied.
// Such comparison object can be passed by reference.
impl<C: CompareBytes + ?Sized> CompareBytes for &C {
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        // Fully qualified so the call dispatches to `C`'s implementation
        // rather than resolving back to this impl on `&C`.
        <C as CompareBytes>::eq(self, left, right)
    }

    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        <C as CompareBytes>::hash(self, text, state);
    }
}

/// Compares byte sequences literally.
#[derive(Clone, Debug, Default)]
pub struct CompareBytesExactly;

impl CompareBytes for CompareBytesExactly {
    /// Two sequences are equivalent only if they are byte-for-byte equal.
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        left == right
    }

    /// Hashes the slice through its standard `Hash` implementation.
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
        text.hash(state);
    }
}

/// Compares words (or tokens) under a certain hasher configuration.
///
/// `C` is the comparison object and `S` builds the hashers that
/// [`WordComparator::hash_one`] feeds.
#[derive(Clone, Debug, Default)]
struct WordComparator<C, S> {
    compare: C,
    hash_builder: S,
}

#[allow(unused)] // TODO
impl<C: CompareBytes> WordComparator<C, RandomState> {
    /// Creates a comparator around `compare` with a fresh `RandomState`.
    fn new(compare: C) -> Self {
        WordComparator {
            compare,
            // TODO: switch to ahash for better performance?
            hash_builder: RandomState::new(),
        }
    }
}

#[allow(unused)] // TODO
impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
    /// Returns true if `left` and `right` are equivalent according to the
    /// wrapped comparison object.
    fn eq(&self, left: &[u8], right: &[u8]) -> bool {
        self.compare.eq(left, right)
    }

    /// Computes the hash of `text` as defined by the wrapped comparison
    /// object, using a new hasher obtained from `hash_builder`.
    fn hash_one(&self, text: &[u8]) -> u64 {
        let mut state = self.hash_builder.build_hasher();
        self.compare.hash(text, &mut state);
        state.finish()
    }
}

/// Index in a list of word (or token) ranges.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct WordPosition(usize);