project search: Increase perf up to 10x by batching git status calls (#16936)

***Update**: after rebasing on top of
https://github.com/zed-industries/zed/pull/16915/ (that also changed how
search worked), the results are still good, but not 10x. Instead of
going from 10s to 500ms, it goes from 10s to 3s.*

This improves the performance of project-wide search by an order of
magnitude.

After digging in, @as-cii and I found that opening buffers was the
bottleneck for project-wide search (since Zed opens, parses, ... buffers
when finding them, which is something VS Code doesn't do, for example).
So this PR improves the performance of opening multiple buffers at once.

It does this in two ways:

- It batches scan requests in the worktree. When we search, we search
files in chunks of 64. Previously we'd handle all 64 scan requests
separately. The new code checks whether pending scan requests can be
batched and, if so, merges them into one (see the first sketch after
this list).
- It batches `git status` calls when reloading the project entries for
the opened buffers. Instead of calling `git status` once per file, it
calls `git status` once for a batch of files and then extracts each
file's status from the result (see the second sketch after this list).
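
To illustrate the first change, here is a minimal sketch of the scan-request batching, mirroring the `next_scan_request` method added to the background scanner in the diff below. The `ScanRequest` here is simplified; the real one also carries completion barriers that get merged the same way:

```rust
use smol::channel::Receiver;
use std::path::PathBuf;

struct ScanRequest {
    relative_paths: Vec<PathBuf>,
    // The real struct also holds barrier senders used to signal completion.
}

// Wait for one scan request, then drain anything else already queued and merge
// it in, so a burst of per-chunk requests is handled by a single scan.
async fn next_scan_request(rx: &Receiver<ScanRequest>) -> Option<ScanRequest> {
    let mut request = rx.recv().await.ok()?;
    while let Ok(next) = rx.try_recv() {
        request.relative_paths.extend(next.relative_paths);
    }
    Some(request)
}
```

For the second change, a rough sketch of what the batched invocation looks like: one `git status` process per repository for a whole batch of paths, instead of one process per file. The `batched_git_status` name is made up for illustration, and `--porcelain` is assumed; only `-z` and `--untracked-files=all` are visible in the diff below.

```rust
use std::path::{Path, PathBuf};
use std::process::{Command, Output, Stdio};

// Build a single `git status` invocation covering a whole batch of paths.
fn batched_git_status(working_directory: &Path, paths: &[PathBuf]) -> std::io::Result<Output> {
    Command::new("git")
        .current_dir(working_directory)
        .args(["status", "--porcelain", "--untracked-files=all", "-z", "--"])
        .args(paths.iter().map(|path| {
            // An empty path means "the whole repository", so fall back to ".",
            // as the updated `GitStatus::new` does.
            if path.as_os_str().is_empty() {
                Path::new(".")
            } else {
                path.as_path()
            }
        }))
        .stdin(Stdio::null())
        .output()
}
```

Mapping each file back to its status then happens once over the parsed output (see `RepoPaths::into_git_file_statuses` in the worktree diff) rather than once per spawned `git status` process.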

(It has to be said that I think the slow performance on `main` has been
a regression introduced over the last few months with the changes made
to project/worktree/git. I don't think it was this slow ~5 months ago.
But I also don't think it was this fast ~5 months ago.)

## Benchmarks

| Search | Before | After (without https://github.com/zed-industries/zed/pull/16915) | After (with https://github.com/zed-industries/zed/pull/16915) |
|--------|--------|-------|------|
| `zed.dev` at `2b2a501192e78e`, searching for `<` (`4484` results) | 3.0s<br>2.9s<br>2.89s | 489ms<br>517ms<br>476ms | n/a |
| `zed.dev` at `2b2a501192e78e`, searching for `:` (`25886+` results) | 3.9s<br>3.9s<br>3.8s | 70ms<br>66ms<br>72ms | n/a |
| `zed` at `55dda0e6af`, searching for `<` (`10937+` results) | 10s<br>11s<br>12s | 500ms<br>499ms<br>543ms | 3.4s<br>3.1s |

(All results recorded after doing a warm-up run that would start
language servers etc.)

Release Notes:

- Performance of project-wide search has been improved by up to 10x.

---------

Co-authored-by: Antonio <antonio@zed.dev>
Thorsten Ball 2024-08-27 11:59:59 +02:00 committed by GitHub
parent d1dceef945
commit 8ec680cecb
5 changed files with 106 additions and 41 deletions

Cargo.lock (generated)

@ -13856,6 +13856,7 @@ dependencies = [
"serde",
"serde_json",
"settings",
"smallvec",
"smol",
"sum_tree",
"text",


@ -36,11 +36,7 @@ pub trait GitRepository: Send + Sync {
/// Returns the SHA of the current HEAD.
fn head_sha(&self) -> Option<String>;
fn statuses(&self, path_prefix: &Path) -> Result<GitStatus>;
fn status(&self, path: &Path) -> Option<GitFileStatus> {
Some(self.statuses(path).ok()?.entries.first()?.1)
}
fn status(&self, path_prefixes: &[PathBuf]) -> Result<GitStatus>;
fn branches(&self) -> Result<Vec<Branch>>;
fn change_branch(&self, _: &str) -> Result<()>;
@ -126,14 +122,14 @@ impl GitRepository for RealGitRepository {
Some(self.repository.lock().head().ok()?.target()?.to_string())
}
fn statuses(&self, path_prefix: &Path) -> Result<GitStatus> {
fn status(&self, path_prefixes: &[PathBuf]) -> Result<GitStatus> {
let working_directory = self
.repository
.lock()
.workdir()
.context("failed to read git work directory")?
.to_path_buf();
GitStatus::new(&self.git_binary_path, &working_directory, path_prefix)
GitStatus::new(&self.git_binary_path, &working_directory, path_prefixes)
}
fn branches(&self) -> Result<Vec<Branch>> {
@ -245,13 +241,16 @@ impl GitRepository for FakeGitRepository {
None
}
fn statuses(&self, path_prefix: &Path) -> Result<GitStatus> {
fn status(&self, path_prefixes: &[PathBuf]) -> Result<GitStatus> {
let state = self.state.lock();
let mut entries = state
.worktree_statuses
.iter()
.filter_map(|(repo_path, status)| {
if repo_path.0.starts_with(path_prefix) {
if path_prefixes
.iter()
.any(|path_prefix| repo_path.0.starts_with(path_prefix))
{
Some((repo_path.to_owned(), *status))
} else {
None


@ -15,14 +15,10 @@ impl GitStatus {
pub(crate) fn new(
git_binary: &Path,
working_directory: &Path,
mut path_prefix: &Path,
path_prefixes: &[PathBuf],
) -> Result<Self> {
let mut child = Command::new(git_binary);
if path_prefix == Path::new("") {
path_prefix = Path::new(".");
}
child
.current_dir(working_directory)
.args([
@ -32,7 +28,13 @@ impl GitStatus {
"--untracked-files=all",
"-z",
])
.arg(path_prefix)
.args(path_prefixes.iter().map(|path_prefix| {
if *path_prefix == Path::new("") {
Path::new(".")
} else {
path_prefix
}
}))
.stdin(Stdio::null())
.stdout(Stdio::piped())
.stderr(Stdio::piped());


@ -41,6 +41,7 @@ schemars.workspace = true
serde.workspace = true
serde_json.workspace = true
settings.workspace = true
smallvec.workspace = true
smol.workspace = true
sum_tree.workspace = true
text.workspace = true


@ -38,6 +38,7 @@ use postage::{
};
use rpc::proto::{self, AnyProtoClient};
use settings::{Settings, SettingsLocation, SettingsStore};
use smallvec::{smallvec, SmallVec};
use smol::channel::{self, Sender};
use std::{
any::Any,
@ -124,7 +125,7 @@ pub struct LocalWorktree {
struct ScanRequest {
relative_paths: Vec<Arc<Path>>,
done: barrier::Sender,
done: SmallVec<[barrier::Sender; 1]>,
}
pub struct RemoteWorktree {
@ -339,7 +340,7 @@ enum ScanState {
Updated {
snapshot: LocalSnapshot,
changes: UpdatedEntriesSet,
barrier: Option<barrier::Sender>,
barrier: SmallVec<[barrier::Sender; 1]>,
scanning: bool,
},
}
@ -1659,7 +1660,7 @@ impl LocalWorktree {
self.scan_requests_tx
.try_send(ScanRequest {
relative_paths: paths,
done: tx,
done: smallvec![tx],
})
.ok();
rx
@ -2699,7 +2700,7 @@ impl BackgroundScannerState {
work_directory: workdir_path,
statuses: repo
.repo_ptr
.statuses(&repo_path)
.status(&[repo_path.0])
.log_err()
.unwrap_or_default(),
});
@ -3568,7 +3569,7 @@ impl BackgroundScanner {
state.snapshot.completed_scan_id = state.snapshot.scan_id;
}
self.send_status_update(false, None);
self.send_status_update(false, SmallVec::new());
// Process any FS events that occurred while performing the initial scan.
// For these events, update events cannot be as precise, because we didn't
@ -3588,7 +3589,7 @@ impl BackgroundScanner {
select_biased! {
// Process any path refresh requests from the worktree. Prioritize
// these before handling changes reported by the filesystem.
request = self.scan_requests_rx.recv().fuse() => {
request = self.next_scan_request().fuse() => {
let Ok(request) = request else { break };
if !self.process_scan_request(request, false).await {
return;
@ -3669,7 +3670,7 @@ impl BackgroundScanner {
)
.await;
self.send_status_update(scanning, Some(request.done))
self.send_status_update(scanning, request.done)
}
async fn process_events(&mut self, mut abs_paths: Vec<PathBuf>) {
@ -3777,7 +3778,7 @@ impl BackgroundScanner {
#[cfg(test)]
self.state.lock().snapshot.check_git_invariants();
self.send_status_update(false, None);
self.send_status_update(false, SmallVec::new());
}
async fn forcibly_load_paths(&self, paths: &[Arc<Path>]) -> bool {
@ -3834,7 +3835,7 @@ impl BackgroundScanner {
select_biased! {
// Process any path refresh requests before moving on to process
// the scan queue, so that user operations are prioritized.
request = self.scan_requests_rx.recv().fuse() => {
request = self.next_scan_request().fuse() => {
let Ok(request) = request else { break };
if !self.process_scan_request(request, true).await {
return;
@ -3853,7 +3854,7 @@ impl BackgroundScanner {
) {
Ok(_) => {
last_progress_update_count += 1;
self.send_status_update(true, None);
self.send_status_update(true, SmallVec::new());
}
Err(count) => {
last_progress_update_count = count;
@ -3879,7 +3880,7 @@ impl BackgroundScanner {
.await;
}
fn send_status_update(&self, scanning: bool, barrier: Option<barrier::Sender>) -> bool {
fn send_status_update(&self, scanning: bool, barrier: SmallVec<[barrier::Sender; 1]>) -> bool {
let mut state = self.state.lock();
if state.changed_paths.is_empty() && scanning {
return true;
@ -3954,7 +3955,7 @@ impl BackgroundScanner {
if let Some((work_directory, repository)) = repo {
let t0 = Instant::now();
let statuses = repository
.statuses(Path::new(""))
.status(&[PathBuf::from("")])
.log_err()
.unwrap_or_default();
log::trace!("computed git status in {:?}", t0.elapsed());
@ -4167,6 +4168,32 @@ impl BackgroundScanner {
}
}
// Group all relative paths by their git repository.
let mut paths_by_git_repo = HashMap::default();
for relative_path in relative_paths.iter() {
if let Some((repo_entry, repo)) = state.snapshot.repo_for_path(relative_path) {
if let Ok(repo_path) = repo_entry.relativize(&state.snapshot, relative_path) {
paths_by_git_repo
.entry(repo.git_dir_path.clone())
.or_insert_with(|| RepoPaths {
repo: repo.repo_ptr.clone(),
repo_paths: Vec::new(),
relative_paths: Vec::new(),
})
.add_paths(&relative_path, repo_path);
}
}
}
// Now call `git status` once per repository and collect each file's git status.
let mut git_statuses_by_relative_path =
paths_by_git_repo
.into_values()
.fold(HashMap::default(), |mut map, repo_paths| {
map.extend(repo_paths.into_git_file_statuses());
map
});
for (path, metadata) in relative_paths.iter().zip(metadata.into_iter()) {
let abs_path: Arc<Path> = root_abs_path.join(&path).into();
match metadata {
@ -4192,15 +4219,7 @@ impl BackgroundScanner {
fs_entry.is_external = is_external;
fs_entry.is_private = self.is_path_private(path);
if !is_dir && !fs_entry.is_ignored && !fs_entry.is_external {
if let Some((repo_entry, repo)) = state.snapshot.repo_for_path(path) {
if let Ok(repo_path) = repo_entry.relativize(&state.snapshot, path) {
fs_entry.git_status = repo.repo_ptr.status(&repo_path);
}
}
}
if let (Some(scan_queue_tx), true) = (&scan_queue_tx, fs_entry.is_dir()) {
if let (Some(scan_queue_tx), true) = (&scan_queue_tx, is_dir) {
if state.should_scan_directory(&fs_entry)
|| (fs_entry.path.as_os_str().is_empty()
&& abs_path.file_name() == Some(*DOT_GIT))
@ -4211,7 +4230,11 @@ impl BackgroundScanner {
}
}
state.insert_entry(fs_entry, self.fs.as_ref());
if !is_dir && !fs_entry.is_ignored && !fs_entry.is_external {
fs_entry.git_status = git_statuses_by_relative_path.remove(path);
}
state.insert_entry(fs_entry.clone(), self.fs.as_ref());
}
Ok(None) => {
self.remove_repo_path(path, &mut state.snapshot);
@ -4310,7 +4333,7 @@ impl BackgroundScanner {
select_biased! {
// Process any path refresh requests before moving on to process
// the queue of ignore statuses.
request = self.scan_requests_rx.recv().fuse() => {
request = self.next_scan_request().fuse() => {
let Ok(request) = request else { break };
if !self.process_scan_request(request, true).await {
return;
@ -4380,7 +4403,12 @@ impl BackgroundScanner {
if !entry.is_dir() && !entry.is_ignored && !entry.is_external {
if let Some((ref repo_entry, local_repo)) = repo {
if let Ok(repo_path) = repo_entry.relativize(&snapshot, &entry.path) {
entry.git_status = local_repo.repo_ptr.status(&repo_path);
let status = local_repo
.repo_ptr
.status(&[repo_path.0.clone()])
.ok()
.and_then(|status| status.get(&repo_path));
entry.git_status = status;
}
}
}
@ -4519,7 +4547,7 @@ impl BackgroundScanner {
select_biased! {
// Process any path refresh requests before moving on to process
// the queue of git statuses.
request = self.scan_requests_rx.recv().fuse() => {
request = self.next_scan_request().fuse() => {
let Ok(request) = request else { break };
if !self.process_scan_request(request, true).await {
return;
@ -4537,7 +4565,7 @@ impl BackgroundScanner {
fn update_git_statuses(&self, job: UpdateGitStatusesJob) {
log::trace!("updating git statuses for repo {:?}", job.work_directory.0);
let t0 = Instant::now();
let Some(statuses) = job.repository.statuses(Path::new("")).log_err() else {
let Some(statuses) = job.repository.status(&[PathBuf::from("")]).log_err() else {
return;
};
log::trace!(
@ -4726,6 +4754,15 @@ impl BackgroundScanner {
fn is_path_private(&self, path: &Path) -> bool {
!self.share_private_files && self.settings.is_path_private(path)
}
async fn next_scan_request(&self) -> Result<ScanRequest> {
let mut request = self.scan_requests_rx.recv().await?;
while let Ok(next_request) = self.scan_requests_rx.try_recv() {
request.relative_paths.extend(next_request.relative_paths);
request.done.extend(next_request.done);
}
Ok(request)
}
}
fn swap_to_front(child_paths: &mut Vec<PathBuf>, file: &OsStr) {
@ -4748,6 +4785,31 @@ fn char_bag_for_path(root_char_bag: CharBag, path: &Path) -> CharBag {
result
}
struct RepoPaths {
repo: Arc<dyn GitRepository>,
relative_paths: Vec<Arc<Path>>,
repo_paths: Vec<PathBuf>,
}
impl RepoPaths {
fn add_paths(&mut self, relative_path: &Arc<Path>, repo_path: RepoPath) {
self.relative_paths.push(relative_path.clone());
self.repo_paths.push(repo_path.0);
}
fn into_git_file_statuses(self) -> HashMap<Arc<Path>, GitFileStatus> {
let mut statuses = HashMap::default();
if let Ok(status) = self.repo.status(&self.repo_paths) {
for (repo_path, relative_path) in self.repo_paths.into_iter().zip(self.relative_paths) {
if let Some(path_status) = status.get(&repo_path) {
statuses.insert(relative_path, path_status);
}
}
}
statuses
}
}
struct ScanJob {
abs_path: Arc<Path>,
path: Arc<Path>,