feat: make capacity adjustable

This commit is contained in:
Zixuan Chen 2023-07-14 00:47:02 +08:00
parent f6ebf6783d
commit 92434ccdfc
2 changed files with 36 additions and 22 deletions

View file

@ -3,7 +3,6 @@ use criterion::{black_box, criterion_group, criterion_main, Criterion};
pub fn entry(c: &mut Criterion) { pub fn entry(c: &mut Criterion) {
let data = include_str!("./permuted.mht"); let data = include_str!("./permuted.mht");
let data_x4 = data.repeat(4);
c.bench_function("compact-bytes", |b| { c.bench_function("compact-bytes", |b| {
b.iter(|| { b.iter(|| {
let mut bytes = CompactBytes::new(); let mut bytes = CompactBytes::new();
@ -11,11 +10,22 @@ pub fn entry(c: &mut Criterion) {
}); });
}); });
c.bench_function("compact-bytes x4", |b| { c.bench_function("compact-bytes x4", |b| {
let data_x4 = data.repeat(4);
b.iter(|| { b.iter(|| {
let mut bytes = CompactBytes::new(); let mut bytes = CompactBytes::new();
bytes.alloc_advance(black_box(data_x4.as_bytes())); bytes.alloc_advance(black_box(data_x4.as_bytes()));
}); });
}); });
let mut b = c.benchmark_group("slower");
b.sample_size(10);
b.bench_function("compact-bytes x100", |b| {
let data_x100 = data.repeat(100);
b.iter(|| {
let mut bytes = CompactBytes::new();
bytes.alloc_advance(black_box(data_x100.as_bytes()));
});
});
} }
criterion_group!(benches, entry); criterion_group!(benches, entry);

View file

@ -5,21 +5,15 @@ use fxhash::FxHasher32;
use std::{hash::Hasher, num::NonZeroU32, ops::Range}; use std::{hash::Hasher, num::NonZeroU32, ops::Range};
/// it must be a power of 2 /// it must be a power of 2
const DEFAULT_CAPACITY: usize = 1 << 17; const DEFAULT_CAPACITY: usize = 1 << 16;
const MASK: usize = DEFAULT_CAPACITY - 1;
const MAX_TRIED: usize = 4; const MAX_TRIED: usize = 4;
/// # Memory Usage /// # Memory Usage
/// ///
/// One entry in the hash table will take 36 bytes. And we need one entry for every position in the document. /// The memory usage is capacity * 12 bytes.
/// So the size of the hash table will be (36 ~ 72) * document_size. /// The default capacity is 65536 (2^16), so the default memory usage is 0.75MB
///
/// However, you can set the maximum size of the hashtable to reduce the memory usage.
/// It will drop the old entries when the size of the hashtable reaches the maximum size.
///
/// By default the maximum size of the hash table is 2 * 1024, which means the memory usage will be 72 * 2 * 1024 = 144KB.
/// It can fit in the L2 cache of most CPUs. This behavior is subject to change in the future as we do more optimization.
/// ///
/// You can set the capacity by calling `with_capacity`. The requested value is raised to at least 1024 and rounded up to the next power of 2.
pub struct CompactBytes { pub struct CompactBytes {
bytes: AppendOnlyBytes, bytes: AppendOnlyBytes,
map: Box<[Option<NonZeroU32>]>, map: Box<[Option<NonZeroU32>]>,
@ -27,6 +21,7 @@ pub struct CompactBytes {
/// next write index for pos_and_next /// next write index for pos_and_next
index: usize, index: usize,
capacity: usize, capacity: usize,
mask: usize,
} }
#[derive(Debug, Default, Clone, Copy)] #[derive(Debug, Default, Clone, Copy)]
@ -45,16 +40,24 @@ impl CompactBytes {
pos_and_next: vec![Default::default(); DEFAULT_CAPACITY].into_boxed_slice(), pos_and_next: vec![Default::default(); DEFAULT_CAPACITY].into_boxed_slice(),
index: 1, index: 1,
capacity: DEFAULT_CAPACITY, capacity: DEFAULT_CAPACITY,
mask: DEFAULT_CAPACITY - 1,
} }
} }
/// Set the maximum size of the hash table /// `cap` is raised to at least 1024 and rounded up to the next power of 2
/// When the size of the hash table reaches the maximum size, it will drop the old entries. pub fn with_capacity(cap: usize) -> Self {
/// When it's zero, it will never drop the old entries. let cap = cap.max(1024).next_power_of_two();
pub fn set_capacity(&mut self, capacity: usize) { CompactBytes {
self.capacity = capacity; bytes: AppendOnlyBytes::with_capacity(cap),
map: vec![None; cap].into_boxed_slice(),
pos_and_next: vec![Default::default(); cap].into_boxed_slice(),
index: 1,
capacity: cap,
mask: cap - 1,
}
} }
#[inline]
pub fn capacity(&self) -> usize { pub fn capacity(&self) -> usize {
self.capacity self.capacity
} }
@ -74,6 +77,7 @@ impl CompactBytes {
self.append(bytes) self.append(bytes)
} }
#[inline]
pub fn as_bytes(&self) -> &[u8] { pub fn as_bytes(&self) -> &[u8] {
self.bytes.as_bytes() self.bytes.as_bytes()
} }
@ -125,7 +129,7 @@ impl CompactBytes {
// if old doc = "0123", append "x", then we need to add "123x" entry to the map // if old doc = "0123", append "x", then we need to add "123x" entry to the map
// if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map // if old doc = "0123", append "xyz", then we need to add "123x", "23xy", "3xyz" entries to the map
for i in old_len.saturating_sub(3)..self.bytes.len().saturating_sub(3) { for i in old_len.saturating_sub(3)..self.bytes.len().saturating_sub(3) {
let key = hash(self.bytes.as_bytes(), i); let key = hash(self.bytes.as_bytes(), i, self.mask);
// Override the min position in entry with the current position // Override the min position in entry with the current position
let old = self.map[key]; let old = self.map[key];
self.pos_and_next[self.index] = PosLinkList { self.pos_and_next[self.index] = PosLinkList {
@ -133,7 +137,7 @@ impl CompactBytes {
next: old, next: old,
}; };
self.map[key] = Some(NonZeroU32::new(self.index as u32).unwrap()); self.map[key] = Some(NonZeroU32::new(self.index as u32).unwrap());
self.index = (self.index + 1) & MASK; self.index = (self.index + 1) & self.mask;
if self.index == 0 { if self.index == 0 {
self.index = 1; self.index = 1;
} }
@ -149,7 +153,7 @@ impl CompactBytes {
return None; return None;
} }
let key = hash(bytes, 0); let key = hash(bytes, 0, self.mask);
match self.map[key] { match self.map[key] {
Some(pointer) => { Some(pointer) => {
let mut node = self.pos_and_next[pointer.get() as usize]; let mut node = self.pos_and_next[pointer.get() as usize];
@ -195,14 +199,14 @@ impl Default for CompactBytes {
} }
} }
#[inline] #[inline(always)]
fn hash(bytes: &[u8], n: usize) -> usize { fn hash(bytes: &[u8], n: usize, mask: usize) -> usize {
let mut hasher = FxHasher32::default(); let mut hasher = FxHasher32::default();
hasher.write_u8(bytes[n]); hasher.write_u8(bytes[n]);
hasher.write_u8(bytes[n + 1]); hasher.write_u8(bytes[n + 1]);
hasher.write_u8(bytes[n + 2]); hasher.write_u8(bytes[n + 2]);
hasher.write_u8(bytes[n + 3]); hasher.write_u8(bytes[n + 3]);
hasher.finish() as usize & MASK hasher.finish() as usize & mask
} }
#[cfg(test)] #[cfg(test)]