forked from mirrors/jj
diff: add trait for bytes comparison
This could be implemented as a newtype `Wrapper<'a>(&'a [u8])`, but the lifetime of the wrap function couldn't be specified correctly:

    fn diff(left: &[u8], right: &[u8], wrap_fn: F, ..)
    where
        F: for<'a> Fn(&'a [u8]) -> W<'a>, // F::Output<'a> can't be specified
        W: Copy + Eq + Hash

If the wrapper were of `&Wrapper([u8])` type, `Fn(&[u8]) -> &W` would work. However, it would mean we could no longer set a comparison parameter (such as a Regex) dynamically.

Another idea is to add a filter function of `Fn(&[u8]) -> Cow<'_, [u8]>` type, but I don't think we would want to pay the allocation cost in hashing/comparison code. `Fn(&[u8]) -> impl Iterator<Item = &[u8]>` might work, but it would be equally complex.
This commit is contained in:
parent
dfaa52c88a
commit
f672c92509
1 changed files with 82 additions and 0 deletions
|
@ -17,6 +17,10 @@
|
|||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::HashMap;
|
||||
use std::hash::BuildHasher;
|
||||
use std::hash::Hash;
|
||||
use std::hash::Hasher;
|
||||
use std::hash::RandomState;
|
||||
use std::iter;
|
||||
use std::ops::Range;
|
||||
use std::slice;
|
||||
|
@ -71,6 +75,84 @@ pub fn find_nonword_ranges(text: &[u8]) -> Vec<Range<usize>> {
|
|||
.collect()
|
||||
}
|
||||
|
||||
/// Compares byte sequences based on a certain equivalence property.
///
/// This isn't a newtype `Wrapper<'a>(&'a [u8])` but an external comparison
/// object for the following reasons:
///
/// a. If it were a newtype, a generic `wrap` function would be needed. It
///    couldn't be expressed as a simple closure:
///    `for<'a> Fn(&'a [u8]) -> ???<'a>`
/// b. A dynamic comparison object can be implemented intuitively. For
///    example, `pattern: &Regex` would have to be copied to all newtype
///    instances if it were a newtype.
/// c. Hash values can be cached if hashing is controlled externally.
pub trait CompareBytes {
    /// Returns true if `left` and `right` are equivalent.
    fn eq(&self, left: &[u8], right: &[u8]) -> bool;

    /// Feeds `text` into `state` so that the resulting hash respects the
    /// following property:
    /// `eq(left, right) => hash(left) == hash(right)`
    fn hash<H: Hasher>(&self, text: &[u8], state: &mut H);
}
|
||||
|
||||
// An instance might have e.g. Regex pattern, which can't be trivially copied.
|
||||
// Such comparison object can be passed by reference.
|
||||
impl<C: CompareBytes + ?Sized> CompareBytes for &C {
|
||||
fn eq(&self, left: &[u8], right: &[u8]) -> bool {
|
||||
<C as CompareBytes>::eq(self, left, right)
|
||||
}
|
||||
|
||||
fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
|
||||
<C as CompareBytes>::hash(self, text, state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares byte sequences literally.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct CompareBytesExactly;
|
||||
|
||||
impl CompareBytes for CompareBytesExactly {
|
||||
fn eq(&self, left: &[u8], right: &[u8]) -> bool {
|
||||
left == right
|
||||
}
|
||||
|
||||
fn hash<H: Hasher>(&self, text: &[u8], state: &mut H) {
|
||||
text.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
/// Compares words (or tokens) under a certain hasher configuration.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct WordComparator<C, S> {
|
||||
compare: C,
|
||||
hash_builder: S,
|
||||
}
|
||||
|
||||
#[allow(unused)] // TODO
|
||||
impl<C: CompareBytes> WordComparator<C, RandomState> {
|
||||
fn new(compare: C) -> Self {
|
||||
WordComparator {
|
||||
compare,
|
||||
// TODO: switch to ahash for better performance?
|
||||
hash_builder: RandomState::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)] // TODO
|
||||
impl<C: CompareBytes, S: BuildHasher> WordComparator<C, S> {
|
||||
fn eq(&self, left: &[u8], right: &[u8]) -> bool {
|
||||
self.compare.eq(left, right)
|
||||
}
|
||||
|
||||
fn hash_one(&self, text: &[u8]) -> u64 {
|
||||
let mut state = self.hash_builder.build_hasher();
|
||||
self.compare.hash(text, &mut state);
|
||||
state.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Index in a list of word (or token) ranges.
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
struct WordPosition(usize);
|
||||
|
|
Loading…
Reference in a new issue