From 3d0b3d57d82c5fe77527704d008256b7d995209c Mon Sep 17 00:00:00 2001
From: Yuya Nishihara
Date: Tue, 9 Jan 2024 17:31:40 +0900
Subject: [PATCH] git_backend: on gc(), remove unreachable no-gc refs and compact them

With my jj repo, the number of jj/keep refs went down from 87887 to 27733.
The .git directory size is halved, but we'll need to clean up extra and index
files to save disk space. "git gc --prune=now && jj debug reindex" passed, so
the repo wouldn't be corrupted.

#12
---
 CHANGELOG.md                  |   6 +-
 lib/src/git_backend.rs        |  95 ++++++++++++++++-
 lib/tests/test_git_backend.rs | 189 ++++++++++++++++++++++++++++++++++
 3 files changed, 286 insertions(+), 4 deletions(-)
 create mode 100644 lib/tests/test_git_backend.rs

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5eb0b0a8b..ad5f76671 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,10 +15,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### New features
 
-* New `jj op abandon` command is added to clean up the operation history. If GC
-  is implemented, Git refs and commit objects can be compacted.
+* New `jj op abandon` command is added to clean up the operation history. Git
+  refs and commit objects can be further compacted by `jj util gc`.
 
-* `jj util gc` now removes unreachable operation and view objects.
+* `jj util gc` now removes unreachable operation, view, and Git objects.
 
 * `jj branch rename` will now warn if the renamed branch has a remote branch,
   since those will have to be manually renamed outside of `jj`.
diff --git a/lib/src/git_backend.rs b/lib/src/git_backend.rs
index d5abb3d65..a4adad159 100644
--- a/lib/src/git_backend.rs
+++ b/lib/src/git_backend.rs
@@ -607,6 +607,88 @@ fn to_no_gc_ref_update(id: &CommitId) -> gix::refs::transaction::RefEdit {
     }
 }
 
+fn to_ref_deletion(git_ref: gix::refs::Reference) -> gix::refs::transaction::RefEdit {
+    let expected = gix::refs::transaction::PreviousValue::ExistingMustMatch(git_ref.target);
+    gix::refs::transaction::RefEdit {
+        change: gix::refs::transaction::Change::Delete {
+            expected,
+            log: gix::refs::transaction::RefLog::AndReference,
+        },
+        name: git_ref.name,
+        deref: false,
+    }
+}
+
+/// Recreates `refs/jj/keep` refs for the `new_heads`, and removes the other
+/// unreachable and non-head refs.
+fn recreate_no_gc_refs(
+    git_repo: &gix::Repository,
+    new_heads: impl IntoIterator<Item = CommitId>,
+    keep_newer: SystemTime,
+) -> Result<(), BackendError> {
+    // Calculate diff between existing no-gc refs and new heads.
+    let new_heads: HashSet<CommitId> = new_heads.into_iter().collect();
+    let mut no_gc_refs_to_keep_count: usize = 0;
+    let mut no_gc_refs_to_delete: Vec<gix::refs::Reference> = Vec::new();
+    let git_references = git_repo
+        .references()
+        .map_err(|err| BackendError::Other(err.into()))?;
+    let no_gc_refs_iter = git_references
+        .prefixed(NO_GC_REF_NAMESPACE)
+        .map_err(|err| BackendError::Other(err.into()))?;
+    for git_ref in no_gc_refs_iter {
+        let git_ref = git_ref.map_err(BackendError::Other)?.detach();
+        let oid = git_ref.target.try_id().ok_or_else(|| {
+            let name = git_ref.name.as_bstr();
+            BackendError::Other(format!("Symbolic no-gc ref found: {name}").into())
+        })?;
+        let id = CommitId::from_bytes(oid.as_bytes());
+        let name_good = git_ref.name.as_bstr()[NO_GC_REF_NAMESPACE.len()..] == id.hex();
+        if new_heads.contains(&id) && name_good {
+            no_gc_refs_to_keep_count += 1;
+            continue;
+        }
+        // Check timestamp of loose ref, but this is still racy on re-import
+        // because:
+        // - existing packed ref won't be demoted to loose ref
+        // - existing loose ref won't be touched
+        //
+        // TODO: might be better to switch to a dummy merge, where new no-gc ref
+        // will always have a unique name. Doing that with the current
+        // ref-per-head strategy would increase the number of the no-gc refs.
+        // https://github.com/martinvonz/jj/pull/2659#issuecomment-1837057782
+        let loose_ref_path = git_repo.path().join(git_ref.name.to_path());
+        if let Ok(metadata) = loose_ref_path.metadata() {
+            let mtime = metadata.modified().expect("unsupported platform?");
+            if mtime > keep_newer {
+                tracing::trace!(?git_ref, "not deleting new");
+                no_gc_refs_to_keep_count += 1;
+                continue;
+            }
+        }
+        // Also deletes no-gc ref of random name created by old jj.
+        tracing::trace!(?git_ref, ?name_good, "will delete");
+        no_gc_refs_to_delete.push(git_ref);
+    }
+    tracing::info!(
+        new_heads_count = new_heads.len(),
+        no_gc_refs_to_keep_count,
+        no_gc_refs_to_delete_count = no_gc_refs_to_delete.len(),
+        "collected reachable refs"
+    );
+
+    // It's slow to delete packed refs one by one, so update refs all at once.
+    let ref_edits = itertools::chain(
+        no_gc_refs_to_delete.into_iter().map(to_ref_deletion),
+        new_heads.iter().map(to_no_gc_ref_update),
+    );
+    git_repo
+        .edit_references(ref_edits)
+        .map_err(|err| BackendError::Other(err.into()))?;
+
+    Ok(())
+}
+
 fn run_git_gc(git_dir: &Path) -> Result<(), GitGcError> {
     let mut git = Command::new("git");
     git.arg("--git-dir=."); // turn off discovery
@@ -1083,7 +1165,18 @@ impl Backend for GitBackend {
         Ok((id, contents))
     }
 
-    fn gc(&self, _index: &dyn Index, _keep_newer: SystemTime) -> BackendResult<()> {
+    #[tracing::instrument(skip(self, index))]
+    fn gc(&self, index: &dyn Index, keep_newer: SystemTime) -> BackendResult<()> {
+        let git_repo = self.lock_git_repo();
+        let new_heads = index
+            .all_heads_for_gc()
+            .map_err(|err| BackendError::Other(err.into()))?
+            .filter(|id| *id != self.root_commit_id);
+        recreate_no_gc_refs(&git_repo, new_heads, keep_newer)?;
+        // TODO: remove unreachable entries from extras table if segment file
+        // mtime <= keep_newer? (it won't be consistent with no-gc refs
+        // preserved by the keep_newer timestamp though)
+        // TODO: remove unreachable extras table segments
         // TODO: pass in keep_newer to "git gc" command
         run_git_gc(self.git_repo_path()).map_err(|err| BackendError::Other(err.into()))
     }
diff --git a/lib/tests/test_git_backend.rs b/lib/tests/test_git_backend.rs
new file mode 100644
index 000000000..1d072d8d4
--- /dev/null
+++ b/lib/tests/test_git_backend.rs
@@ -0,0 +1,189 @@
+// Copyright 2024 The Jujutsu Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::HashSet;
+use std::process::Command;
+use std::sync::Arc;
+use std::time::SystemTime;
+
+use jj_lib::backend::CommitId;
+use jj_lib::git_backend::GitBackend;
+use jj_lib::repo::{ReadonlyRepo, Repo};
+use maplit::hashset;
+use testutils::{create_random_commit, CommitGraphBuilder, TestRepo, TestRepoBackend};
+
+fn get_git_backend(repo: &Arc<ReadonlyRepo>) -> &GitBackend {
+    repo.store()
+        .backend_impl()
+        .downcast_ref::<GitBackend>()
+        .unwrap()
+}
+
+fn get_git_repo(repo: &Arc<ReadonlyRepo>) -> gix::Repository {
+    get_git_backend(repo).git_repo()
+}
+
+fn collect_no_gc_refs(git_repo: &gix::Repository) -> HashSet<CommitId> {
+    let git_refs = git_repo.references().unwrap();
+    let no_gc_refs_iter = git_refs.prefixed("refs/jj/keep/").unwrap();
+    no_gc_refs_iter
+        .map(|git_ref| CommitId::from_bytes(git_ref.unwrap().id().as_bytes()))
+        .collect()
+}
+
+#[test]
+fn test_gc() {
+    // TODO: Better way to disable the test if git command couldn't be executed
+    if Command::new("git").arg("--version").status().is_err() {
+        eprintln!("Skipping because git command might fail to run");
+        return;
+    }
+
+    let settings = testutils::user_settings();
+    let test_repo = TestRepo::init_with_backend(TestRepoBackend::Git);
+    let repo = test_repo.repo;
+    let git_repo = get_git_repo(&repo);
+    let base_index = repo.readonly_index();
+
+    // Set up commits:
+    //
+    //     H (predecessor: D)
+    //   G |
+    //   |\|
+    //   | F
+    //   E |
+    // D | |
+    // C |/
+    // |/
+    // B
+    // A
+    let mut tx = repo.start_transaction(&settings);
+    let mut graph_builder = CommitGraphBuilder::new(&settings, tx.mut_repo());
+    let commit_a = graph_builder.initial_commit();
+    let commit_b = graph_builder.commit_with_parents(&[&commit_a]);
+    let commit_c = graph_builder.commit_with_parents(&[&commit_b]);
+    let commit_d = graph_builder.commit_with_parents(&[&commit_c]);
+    let commit_e = graph_builder.commit_with_parents(&[&commit_b]);
+    let commit_f = graph_builder.commit_with_parents(&[&commit_b]);
+    let commit_g = graph_builder.commit_with_parents(&[&commit_e, &commit_f]);
+    let commit_h = create_random_commit(tx.mut_repo(), &settings)
+        .set_parents(vec![commit_f.id().clone()])
+        .set_predecessors(vec![commit_d.id().clone()])
+        .write()
+        .unwrap();
+    let repo = tx.commit("test");
+    assert_eq!(
+        *repo.view().heads(),
+        hashset! {
+            commit_d.id().clone(),
+            commit_g.id().clone(),
+            commit_h.id().clone(),
+        },
+    );
+
+    // At first, all commits have no-gc refs
+    assert_eq!(
+        collect_no_gc_refs(&git_repo),
+        hashset! {
+            commit_a.id().clone(),
+            commit_b.id().clone(),
+            commit_c.id().clone(),
+            commit_d.id().clone(),
+            commit_e.id().clone(),
+            commit_f.id().clone(),
+            commit_g.id().clone(),
+            commit_h.id().clone(),
+        },
+    );
+
+    // Empty index, but all kept by file modification time
+    // (Beware that this invokes "git gc" and refs will be packed.)
+    repo.store()
+        .gc(base_index.as_index(), SystemTime::UNIX_EPOCH)
+        .unwrap();
+    assert_eq!(
+        collect_no_gc_refs(&git_repo),
+        hashset! {
+            commit_a.id().clone(),
+            commit_b.id().clone(),
+            commit_c.id().clone(),
+            commit_d.id().clone(),
+            commit_e.id().clone(),
+            commit_f.id().clone(),
+            commit_g.id().clone(),
+            commit_h.id().clone(),
+        },
+    );
+
+    // All reachable: redundant no-gc refs will be removed
+    let now = SystemTime::now();
+    repo.store().gc(repo.index(), now).unwrap();
+    assert_eq!(
+        collect_no_gc_refs(&git_repo),
+        hashset! {
+            commit_d.id().clone(),
+            commit_g.id().clone(),
+            commit_h.id().clone(),
+        },
+    );
+
+    // G is no longer reachable
+    let mut mut_index = base_index.start_modification();
+    mut_index.add_commit(&commit_a);
+    mut_index.add_commit(&commit_b);
+    mut_index.add_commit(&commit_c);
+    mut_index.add_commit(&commit_d);
+    mut_index.add_commit(&commit_e);
+    mut_index.add_commit(&commit_f);
+    mut_index.add_commit(&commit_h);
+    repo.store().gc(mut_index.as_index(), now).unwrap();
+    assert_eq!(
+        collect_no_gc_refs(&git_repo),
+        hashset! {
+            commit_d.id().clone(),
+            commit_e.id().clone(),
+            commit_h.id().clone(),
+        },
+    );
+
+    // D|E|H are no longer reachable
+    let mut mut_index = base_index.start_modification();
+    mut_index.add_commit(&commit_a);
+    mut_index.add_commit(&commit_b);
+    mut_index.add_commit(&commit_c);
+    mut_index.add_commit(&commit_f);
+    repo.store().gc(mut_index.as_index(), now).unwrap();
+    assert_eq!(
+        collect_no_gc_refs(&git_repo),
+        hashset! {
+            commit_c.id().clone(),
+            commit_f.id().clone(),
+        },
+    );
+
+    // B|C|F are no longer reachable
+    let mut mut_index = base_index.start_modification();
+    mut_index.add_commit(&commit_a);
+    repo.store().gc(mut_index.as_index(), now).unwrap();
+    assert_eq!(
+        collect_no_gc_refs(&git_repo),
+        hashset! {
+            commit_a.id().clone(),
+        },
+    );
+
+    // All unreachable
+    repo.store().gc(base_index.as_index(), now).unwrap();
+    assert_eq!(collect_no_gc_refs(&git_repo), hashset! {});
+}