index: when writing incremental index, squash into parent file if smaller

We currently write a new incremental index file every time. That means
that the stack of index files quickly gets deep, which makes it slow
to read the index. This commit makes it so that we squash the new
index segment into its parent if the parent has fewer commits. That
means we'll limit the number of files to O(log n). Writes time will
also be O(log n) on average.
This commit is contained in:
Martin von Zweigbergk 2021-02-16 11:50:28 -08:00
parent a51543b752
commit 62ce5782b5
2 changed files with 119 additions and 8 deletions

View file

@ -466,6 +466,46 @@ impl MutableIndex {
buf
}
/// If the MutableIndex has more commits than its parent ReadonlyIndex,
/// return MutableIndex with the commits from both. This is done
/// recursively, so the stack of index files has O(log n) files.
fn maybe_squash_with_ancestors(self) -> MutableIndex {
if self.parent_file.is_none() {
return self;
}
let mut num_new_commits = self.segment_num_commits();
let mut parent_file = self.parent_file.as_ref().unwrap().clone();
let mut squashed;
loop {
// TODO: We should probably also squash if the parent file has less than N
// commits, regardless of how many (few) are in `self`.
if num_new_commits < parent_file.segment_num_commits() {
squashed = MutableIndex::incremental(parent_file);
break;
}
if parent_file.parent_file.is_none() {
squashed = MutableIndex::full(self.dir.clone(), self.hash_length);
break;
}
num_new_commits += parent_file.segment_num_commits();
parent_file = parent_file.parent_file.as_ref().unwrap().clone();
}
// TODO: This can be made more efficient by walking the parent files in order
// and not looking up via `self`.
for pos in squashed.num_parent_commits..self.num_commits() {
let entry = self.entry_by_pos(pos);
let parent_ids: Vec<_> = entry
.parents_positions()
.iter()
.map(|pos| self.entry_by_pos(*pos).commit_id())
.collect();
squashed.add_commit_data(entry.commit_id(), parent_ids);
}
squashed
}
pub fn save(self) -> io::Result<Arc<ReadonlyIndex>> {
if self.segment_num_commits() == 0 && self.parent_file.is_some() {
return Ok(self.parent_file.unwrap());
@ -473,8 +513,8 @@ impl MutableIndex {
let hash_length = self.hash_length;
let dir = self.dir.clone();
let buf = self.serialize();
let buf = self.maybe_squash_with_ancestors().serialize();
let mut hasher = Blake2b::new();
hasher.update(&buf);
let index_file_id_hex = hex::encode(&hasher.finalize());
@ -509,6 +549,10 @@ impl MutableIndex {
CompositeIndex(self).entry_by_id(commit_id)
}
pub fn entry_by_pos(&self, pos: u32) -> IndexEntry {
CompositeIndex(self).entry_by_pos(pos)
}
pub fn has_id(&self, commit_id: &CommitId) -> bool {
CompositeIndex(self).has_id(commit_id)
}
@ -1235,6 +1279,10 @@ impl ReadonlyIndex {
CompositeIndex(self).entry_by_id(commit_id)
}
pub fn entry_by_pos(&self, pos: u32) -> IndexEntry {
CompositeIndex(self).entry_by_pos(pos)
}
pub fn has_id(&self, commit_id: &CommitId) -> bool {
CompositeIndex(self).has_id(commit_id)
}

View file

@ -19,6 +19,7 @@ use jujube_lib::repo::ReadonlyRepo;
use jujube_lib::settings::UserSettings;
use jujube_lib::store::CommitId;
use jujube_lib::testutils;
use jujube_lib::testutils::create_random_commit;
use std::sync::Arc;
use test_case::test_case;
@ -337,13 +338,8 @@ fn test_index_commits_incremental(use_git: bool) {
assert_eq!(stats.num_commits, 2 + 3);
assert_eq!(stats.num_merges, 0);
assert_eq!(stats.max_generation_number, 3);
assert_eq!(stats.levels.len(), 3);
assert_eq!(stats.levels[0].num_commits, 2);
assert_eq!(stats.levels[1].num_commits, 1);
assert_ne!(stats.levels[1].name, stats.levels[0].name);
assert_eq!(stats.levels[2].num_commits, 2);
assert_ne!(stats.levels[2].name, stats.levels[0].name);
assert_ne!(stats.levels[2].name, stats.levels[1].name);
assert_eq!(stats.levels.len(), 1);
assert_eq!(stats.levels[0].num_commits, 5);
assert_eq!(generation_number(index.clone(), root_commit.id()), 0);
assert_eq!(generation_number(index.clone(), commit_a.id()), 1);
@ -389,6 +385,7 @@ fn test_index_commits_incremental_empty_transaction(use_git: bool) {
assert_eq!(stats.levels.len(), 2);
assert_eq!(stats.levels[0].num_commits, 2);
assert_eq!(stats.levels[1].num_commits, 1);
assert_ne!(stats.levels[1].name, stats.levels[0].name);
assert_eq!(generation_number(index.clone(), root_commit.id()), 0);
assert_eq!(generation_number(index.clone(), commit_a.id()), 1);
@ -419,3 +416,69 @@ fn test_index_commits_incremental_already_indexed(use_git: bool) {
assert_eq!(tx.index().num_commits(), 2 + 1);
tx.discard();
}
fn create_n_commits(settings: &UserSettings, repo: &mut Arc<ReadonlyRepo>, num_commits: i32) {
let mut tx = repo.start_transaction("test");
for _ in 0..num_commits {
create_random_commit(settings, repo).write_to_transaction(&mut tx);
}
tx.commit();
Arc::get_mut(repo).unwrap().reload();
}
fn commits_by_level(repo: &ReadonlyRepo) -> Vec<u32> {
repo.index()
.stats()
.levels
.iter()
.map(|level| level.num_commits)
.collect()
}
#[test_case(false ; "local store")]
#[test_case(true ; "git store")]
fn test_index_commits_incremental_squashed(use_git: bool) {
let settings = testutils::user_settings();
let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git);
create_n_commits(&settings, &mut repo, 1);
assert_eq!(commits_by_level(&repo), vec![2, 1]);
create_n_commits(&settings, &mut repo, 1);
assert_eq!(commits_by_level(&repo), vec![4]);
let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git);
create_n_commits(&settings, &mut repo, 2);
assert_eq!(commits_by_level(&repo), vec![4]);
let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git);
create_n_commits(&settings, &mut repo, 100);
assert_eq!(commits_by_level(&repo), vec![102]);
let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git);
create_n_commits(&settings, &mut repo, 2);
create_n_commits(&settings, &mut repo, 4);
create_n_commits(&settings, &mut repo, 8);
create_n_commits(&settings, &mut repo, 16);
create_n_commits(&settings, &mut repo, 32);
assert_eq!(commits_by_level(&repo), vec![64]);
let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git);
create_n_commits(&settings, &mut repo, 32);
create_n_commits(&settings, &mut repo, 16);
create_n_commits(&settings, &mut repo, 8);
create_n_commits(&settings, &mut repo, 4);
create_n_commits(&settings, &mut repo, 2);
assert_eq!(commits_by_level(&repo), vec![34, 16, 8, 4, 2]);
let (_temp_dir, mut repo) = testutils::init_repo(&settings, use_git);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
create_n_commits(&settings, &mut repo, 10);
assert_eq!(commits_by_level(&repo), vec![72, 20]);
}