index: make IdIndex store raw bytes, not hex bytes

This helps us to migrate commit_id index to ReadonlyIndex. For large
repositories, this also reduces initialization cost, but that's not the main
intent of this change.

https://github.com/martinvonz/jj/pull/1041#issuecomment-1399225876

common_hex_len() and iter_half_bytes() are added to backend.rs since more
call sites will be added to index.rs, and I feel index.rs isn't a good place
to host this kind of utility functions.
This commit is contained in:
Yuya Nishihara 2023-01-18 20:18:57 +09:00
parent 65a659347e
commit 1a4b5c5ee6
3 changed files with 75 additions and 26 deletions

View file

@ -330,6 +330,26 @@ impl Tree {
}
}
/// Calculates common prefix length of two bytes. The length to be returned is
/// a number of hexadecimal digits.
pub fn common_hex_len(bytes_a: &[u8], bytes_b: &[u8]) -> usize {
iter_half_bytes(bytes_a)
.zip(iter_half_bytes(bytes_b))
.take_while(|(a, b)| a == b)
.count()
}
fn iter_half_bytes(bytes: &[u8]) -> impl ExactSizeIterator<Item = u8> + '_ {
(0..bytes.len() * 2).map(|i| {
let v = bytes[i / 2];
if i & 1 == 0 {
v >> 4
} else {
v & 0xf
}
})
}
pub fn root_change_id() -> &'static ChangeId {
static ROOT_CHANGE_ID: Lazy<ChangeId> =
Lazy::new(|| ChangeId::new(vec![0; CHANGE_ID_HASH_LENGTH]));

View file

@ -250,7 +250,7 @@ impl ReadonlyRepo {
// We need to account for rewritten commits as well
let mut id_index = IdIndex::new();
for entry in self.as_repo_ref().index().iter() {
id_index.insert(entry.commit_id().hex().as_bytes(), ());
id_index.insert(entry.commit_id().as_bytes(), ());
}
id_index
})
@ -263,22 +263,22 @@ impl ReadonlyRepo {
.unwrap();
let mut id_index = IdIndex::new();
for entry in all_visible_revisions.iter() {
id_index.insert(entry.change_id().hex().as_bytes(), ());
id_index.insert(entry.change_id().as_bytes(), ());
}
id_index
})
}
pub fn shortest_unique_prefix_length(&self, target_id_hex: &str) -> usize {
pub fn shortest_unique_prefix_length(&self, target_id_bytes: &[u8]) -> usize {
// For `len = index.shortest(id)`, a prefix of length `len` will disambiguate
// `id` from all other ids in the index. This will be just as true for
// `max(len, anything_else)`, so a max of such lengths will disambiguate in all
// indices.
cmp::max(
self.commit_id_index()
.shortest_unique_prefix_len(target_id_hex.as_bytes()),
.shortest_unique_prefix_len(target_id_bytes),
self.change_id_index()
.shortest_unique_prefix_len(target_id_hex.as_bytes()),
.shortest_unique_prefix_len(target_id_bytes),
)
}
@ -1194,6 +1194,7 @@ impl IdIndex {
Self::default()
}
/// Inserts a bytes key to the index.
pub fn insert(&mut self, key: &[u8], value: IdIndexValue) -> Option<IdIndexValue> {
self.0.insert(key.to_vec(), value)
}
@ -1201,6 +1202,9 @@ impl IdIndex {
/// This function returns the shortest length of a prefix of `key` that
/// disambiguates it from every other key in the index.
///
/// The given `key` must be provided as bytes, not as ASCII hexadecimal
/// digits. The length to be returned is a number of hexadecimal digits.
///
/// This has some properties that we do not currently make much use of:
///
/// - The algorithm works even if `key` itself is not in the index.
@ -1217,10 +1221,7 @@ impl IdIndex {
.next_back();
let right = self.0.range::<[u8], _>((Excluded(key), Unbounded)).next();
itertools::chain(left, right)
.map(|(neighbor, _value)| {
let common_len = key.iter().zip(neighbor).take_while(|(a, b)| a == b).count();
common_len + 1
})
.map(|(neighbor, _value)| backend::common_hex_len(key, neighbor) + 1)
.max()
.unwrap_or(0)
}
@ -1233,23 +1234,44 @@ mod tests {
#[test]
fn test_id_index() {
let mut id_index = IdIndex::new();
id_index.insert(b"ab", ());
id_index.insert(b"acd0", ());
assert_eq!(id_index.shortest_unique_prefix_len(b"acd0"), 2);
assert_eq!(id_index.shortest_unique_prefix_len(b"ac"), 3);
id_index.insert(&hex::decode("ab").unwrap(), ());
id_index.insert(&hex::decode("acd0").unwrap(), ());
assert_eq!(
id_index.shortest_unique_prefix_len(&hex::decode("acd0").unwrap()),
2
);
assert_eq!(
id_index.shortest_unique_prefix_len(&hex::decode("ac").unwrap()),
3
);
let mut id_index = IdIndex::new();
id_index.insert(b"ab", ());
id_index.insert(b"acd0", ());
id_index.insert(b"acf0", ());
id_index.insert(b"a0", ());
id_index.insert(b"ba", ());
id_index.insert(&hex::decode("ab").unwrap(), ());
id_index.insert(&hex::decode("acd0").unwrap(), ());
id_index.insert(&hex::decode("acf0").unwrap(), ());
id_index.insert(&hex::decode("a0").unwrap(), ());
id_index.insert(&hex::decode("ba").unwrap(), ());
assert_eq!(id_index.shortest_unique_prefix_len(b"a0"), 2);
assert_eq!(id_index.shortest_unique_prefix_len(b"ba"), 1);
assert_eq!(id_index.shortest_unique_prefix_len(b"ab"), 2);
assert_eq!(id_index.shortest_unique_prefix_len(b"acd0"), 3);
assert_eq!(
id_index.shortest_unique_prefix_len(&hex::decode("a0").unwrap()),
2
);
assert_eq!(
id_index.shortest_unique_prefix_len(&hex::decode("ba").unwrap()),
1
);
assert_eq!(
id_index.shortest_unique_prefix_len(&hex::decode("ab").unwrap()),
2
);
assert_eq!(
id_index.shortest_unique_prefix_len(&hex::decode("acd0").unwrap()),
3
);
// If it were there, the length would be 1.
assert_eq!(id_index.shortest_unique_prefix_len(b"c0"), 1);
assert_eq!(
id_index.shortest_unique_prefix_len(&hex::decode("c0").unwrap()),
1
);
}
}

View file

@ -412,6 +412,10 @@ impl<'a, C, I, O> TemplateProperty<C, O> for TemplateFunction<'a, C, I, O> {
pub struct CommitOrChangeId(Vec<u8>);
impl CommitOrChangeId {
pub fn as_bytes(&self) -> &[u8] {
&self.0
}
pub fn hex(&self) -> String {
hex::encode(&self.0)
}
@ -423,12 +427,15 @@ impl CommitOrChangeId {
}
pub fn short_prefix_and_brackets(&self, repo: RepoRef) -> String {
highlight_shortest_prefix(self.hex(), 12, repo)
highlight_shortest_prefix(self, 12, repo)
}
}
fn highlight_shortest_prefix(mut hex: String, total_len: usize, repo: RepoRef) -> String {
let prefix_len = repo.base_repo().shortest_unique_prefix_length(&hex);
fn highlight_shortest_prefix(id: &CommitOrChangeId, total_len: usize, repo: RepoRef) -> String {
let prefix_len = repo
.base_repo()
.shortest_unique_prefix_length(id.as_bytes());
let mut hex = id.hex();
if prefix_len < total_len - 2 {
format!(
"{}[{}]",