From 042d26049ce4c6f1fa7bfb35d54e6d7a6f587ac5 Mon Sep 17 00:00:00 2001 From: Yuya Nishihara Date: Thu, 16 Nov 2023 12:45:05 +0900 Subject: [PATCH] working_copy: lazily construct file_states BTreeMap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While it got faster to build a large BTreeMap, there's still a measurable cost. Let's eliminate it if watchman is enabled and the working copy is clean. Perhaps, we should introduce new serialization format that supports instant loading and lookup, but this hack works for the moment. I'm not sure if the new tree_state format should be flat (RepoPath, _) list, or tree like the backend storage btw. In my "linux" repo (watchman enabled): % hyperfine --sort command --warmup 3 --runs 10 -L bin jj-0,jj-1 \ "target/release-with-debug/{bin} -R ~/mirrors/linux status" Benchmark 1: target/release-with-debug/jj-0 -R ~/mirrors/linux status Time (mean ± σ): 768.9 ms ± 14.2 ms [User: 630.7 ms, System: 131.2 ms] Range (min … max): 742.3 ms … 783.1 ms 10 runs Benchmark 2: target/release-with-debug/jj-1 -R ~/mirrors/linux status Time (mean ± σ): 713.0 ms ± 16.8 ms [User: 587.9 ms, System: 116.2 ms] Range (min … max): 681.5 ms … 731.1 ms 10 runs Relative speed comparison 1.08 ± 0.03 target/release-with-debug/jj-0 -R ~/mirrors/linux status 1.00 target/release-with-debug/jj-1 -R ~/mirrors/linux status --- lib/src/local_working_copy.rs | 110 +++++++++++++++++++++++++++------- 1 file changed, 89 insertions(+), 21 deletions(-) diff --git a/lib/src/local_working_copy.rs b/lib/src/local_working_copy.rs index 5783433fc..977ecf1ea 100644 --- a/lib/src/local_working_copy.rs +++ b/lib/src/local_working_copy.rs @@ -27,7 +27,7 @@ use std::os::unix::fs::symlink; use std::os::unix::fs::PermissionsExt; use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Sender}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use std::time::UNIX_EPOCH; use futures::StreamExt; @@ -137,12 +137,62 @@ impl FileState { } } +/// Lazily constructs file states map from proto data. +/// +/// If fsmonitor is enabled and the working-copy is clean, we don't need to +/// build a loaded `BTreeMap` at all. +#[derive(Clone, Debug)] +struct LazyFileStatesMap { + loaded: OnceLock>, + proto: Option>, +} + +impl LazyFileStatesMap { + fn new() -> Self { + LazyFileStatesMap { + loaded: OnceLock::from(BTreeMap::new()), + proto: None, + } + } + + fn from_proto(proto: Vec) -> Self { + LazyFileStatesMap { + loaded: OnceLock::new(), + proto: Some(proto), + } + } + + fn to_proto(&self) -> Vec { + if let Some(proto) = self.proto.as_ref() { + proto.clone() + } else { + // Just return new proto data. There would be no point to cache it + // since we've already paid the cost to build a loaded BTreeMap. + let loaded = self.loaded.get().expect("loaded or proto must exist"); + file_states_to_proto(loaded) + } + } + + fn get_or_load(&self) -> &BTreeMap { + self.loaded.get_or_init(|| { + let proto = self.proto.as_ref().expect("loaded or proto must exist"); + file_states_from_proto(proto) + }) + } + + fn make_mut(&mut self) -> &mut BTreeMap { + self.get_or_load(); + self.proto.take(); // mark dirty + self.loaded.get_mut().unwrap() + } +} + pub struct TreeState { store: Arc, working_copy_path: PathBuf, state_path: PathBuf, tree_id: MergedTreeId, - file_states: BTreeMap, + file_states: LazyFileStatesMap, // Currently only path prefixes sparse_patterns: Vec, own_mtime: MillisSinceEpoch, @@ -194,9 +244,11 @@ fn file_state_to_proto(file_state: &FileState) -> crate::protos::working_copy::F proto } +#[instrument(skip(proto))] fn file_states_from_proto( proto: &[crate::protos::working_copy::FileStateEntry], ) -> BTreeMap { + tracing::debug!("loading file states from proto"); proto .iter() .map(|entry| { @@ -370,7 +422,7 @@ impl TreeState { } pub fn file_states(&self) -> &BTreeMap { - &self.file_states + self.file_states.get_or_load() } pub fn sparse_patterns(&self) -> &Vec { @@ -400,7 +452,7 @@ impl TreeState { working_copy_path: working_copy_path.canonicalize().unwrap(), state_path, tree_id, - file_states: BTreeMap::new(), + file_states: LazyFileStatesMap::new(), sparse_patterns: vec![RepoPath::root()], own_mtime: MillisSinceEpoch(0), watchman_clock: None, @@ -463,7 +515,7 @@ impl TreeState { .collect(); self.tree_id = MergedTreeId::Merge(tree_ids_builder.build()); } - self.file_states = file_states_from_proto(&proto.file_states); + self.file_states = LazyFileStatesMap::from_proto(proto.file_states); self.sparse_patterns = sparse_patterns_from_proto(proto.sparse_patterns.as_ref()); self.watchman_clock = proto.watchman_clock; Ok(()) @@ -480,7 +532,7 @@ impl TreeState { } } - proto.file_states = file_states_to_proto(&self.file_states); + proto.file_states = self.file_states.to_proto(); let mut sparse_patterns = crate::protos::working_copy::SparsePatterns::default(); for path in &self.sparse_patterns { sparse_patterns @@ -593,12 +645,18 @@ impl TreeState { Some(fsmonitor_matcher) => fsmonitor_matcher.as_ref(), }; + let matcher = IntersectionMatcher::new(sparse_matcher.as_ref(), fsmonitor_matcher); + if matcher.visit(&RepoPath::root()).is_nothing() { + // No need to load file states + self.watchman_clock = watchman_clock; + return Ok(is_dirty); + } + let (tree_entries_tx, tree_entries_rx) = channel(); let (file_states_tx, file_states_rx) = channel(); let (present_files_tx, present_files_rx) = channel(); trace_span!("traverse filesystem").in_scope(|| -> Result<(), SnapshotError> { - let matcher = IntersectionMatcher::new(sparse_matcher.as_ref(), fsmonitor_matcher); let current_tree = self.current_tree()?; let directory_to_visit = DirectoryToVisit { dir: RepoPath::root(), @@ -617,10 +675,13 @@ impl TreeState { ) })?; + let file_states = self.file_states.make_mut(); let mut tree_builder = MergedTreeBuilder::new(self.tree_id.clone()); let mut deleted_files: HashSet<_> = trace_span!("collecting existing files").in_scope(|| { - self.file_states + // Since file_states shouldn't contain files excluded by the sparse patterns, + // fsmonitor_matcher here is identical to the intersected matcher. + file_states .iter() .filter(|&(path, state)| { fsmonitor_matcher.matches(path) && state.file_type != FileType::GitSubmodule @@ -637,7 +698,7 @@ impl TreeState { trace_span!("process file states").in_scope(|| { while let Ok((path, file_state)) = file_states_rx.recv() { is_dirty = true; - self.file_states.insert(path, file_state); + file_states.insert(path, file_state); } }); trace_span!("process present files").in_scope(|| { @@ -648,7 +709,7 @@ impl TreeState { trace_span!("process deleted files").in_scope(|| { for file in &deleted_files { is_dirty = true; - self.file_states.remove(file); + file_states.remove(file); tree_builder.set_or_remove(file.clone(), Merge::absent()); } }); @@ -663,7 +724,8 @@ impl TreeState { .entries_matching(sparse_matcher.as_ref()) .map(|(path, _)| path) .collect(); - let state_paths: HashSet<_> = self.file_states.keys().cloned().collect(); + let file_states = self.file_states.get_or_load(); + let state_paths: HashSet<_> = file_states.keys().cloned().collect(); assert_eq!(state_paths, tree_paths); } self.watchman_clock = watchman_clock; @@ -691,6 +753,9 @@ impl TreeState { if matcher.visit(&dir).is_nothing() { return Ok(()); } + + // Don't try to load file states by multiple worker threads. + let file_states = self.file_states.get_or_load(); let git_ignore = git_ignore.chain_with_file(&dir.to_internal_dir_string(), disk_dir.join(".gitignore")); let dir_entries = disk_dir @@ -719,7 +784,7 @@ impl TreeState { return Ok(()); } let path = dir.join(&RepoPathComponent::from(name)); - if let Some(file_state) = self.file_states.get(&path) { + if let Some(file_state) = file_states.get(&path) { if file_state.file_type == FileType::GitSubmodule { return Ok(()); } @@ -729,8 +794,7 @@ impl TreeState { if git_ignore.matches(&path.to_internal_dir_string()) { // If the whole directory is ignored, visit only paths we're already // tracking. - let tracked_paths = self - .file_states + let tracked_paths = file_states .range((Bound::Excluded(&path), Bound::Unbounded)) .take_while(|(sub_path, _)| path.contains(sub_path)) .map(|(sub_path, file_state)| (sub_path.clone(), file_state.clone())) @@ -795,7 +859,7 @@ impl TreeState { if let Some(progress) = progress { progress(&path); } - let maybe_current_file_state = self.file_states.get(&path); + let maybe_current_file_state = file_states.get(&path); if maybe_current_file_state.is_none() && git_ignore.matches(&path.to_internal_file_string()) { @@ -1160,14 +1224,18 @@ impl TreeState { fs::remove_file(&disk_path).ok(); } if before.is_absent() && disk_path.exists() { - self.file_states.insert(path, FileState::placeholder()); + self.file_states + .make_mut() + .insert(path, FileState::placeholder()); stats.skipped_files += 1; continue; } if after.is_present() { let skip = create_parent_dirs(&self.working_copy_path, &path)?; if skip { - self.file_states.insert(path, FileState::placeholder()); + self.file_states + .make_mut() + .insert(path, FileState::placeholder()); stats.skipped_files += 1; continue; } @@ -1183,7 +1251,7 @@ impl TreeState { } parent_dir = parent_dir.parent().unwrap(); } - self.file_states.remove(&path); + self.file_states.make_mut().remove(&path); continue; } MaterializedTreeValue::File { @@ -1205,7 +1273,7 @@ impl TreeState { self.write_conflict(&disk_path, contents)? } }; - self.file_states.insert(path, file_state); + self.file_states.make_mut().insert(path, file_state); } Ok(stats) } @@ -1223,7 +1291,7 @@ impl TreeState { while let Some((path, diff)) = diff_stream.next().await { let (_before, after) = diff?; if after.is_absent() { - self.file_states.remove(&path); + self.file_states.make_mut().remove(&path); } else { let file_type = match after.into_resolved() { Ok(value) => match value.unwrap() { @@ -1255,7 +1323,7 @@ impl TreeState { mtime: MillisSinceEpoch(0), size: 0, }; - self.file_states.insert(path.clone(), file_state); + self.file_states.make_mut().insert(path.clone(), file_state); } } self.tree_id = new_tree.id();