mirror of
https://github.com/zed-industries/zed.git
synced 2025-02-05 18:25:57 +00:00
Make evals handle failures more gracefully (#18082)
Now, when an individual project eval fails, instead of panicking we add it to a list of failures that we collect and report at the end (and make the exit code nonzero).

Release Notes:

- N/A
This commit is contained in:
parent
d6c184b494
commit
5f1046b3cd
1 changed file with 195 additions and 120 deletions
|
@ -12,13 +12,16 @@ use language::LanguageRegistry;
|
||||||
use node_runtime::FakeNodeRuntime;
|
use node_runtime::FakeNodeRuntime;
|
||||||
use open_ai::OpenAiEmbeddingModel;
|
use open_ai::OpenAiEmbeddingModel;
|
||||||
use project::Project;
|
use project::Project;
|
||||||
use semantic_index::{OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status};
|
use semantic_index::{
|
||||||
|
EmbeddingProvider, OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status,
|
||||||
|
};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use settings::SettingsStore;
|
use settings::SettingsStore;
|
||||||
use smol::channel::bounded;
|
use smol::channel::bounded;
|
||||||
use smol::io::AsyncReadExt;
|
use smol::io::AsyncReadExt;
|
||||||
use smol::Timer;
|
use smol::Timer;
|
||||||
use std::ops::RangeInclusive;
|
use std::ops::RangeInclusive;
|
||||||
|
use std::path::PathBuf;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use std::{
|
use std::{
|
||||||
fs,
|
fs,
|
||||||
|
@ -237,6 +240,14 @@ async fn fetch_code_search_net_resources(http_client: &dyn HttpClient) -> Result
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Default, Debug)]
|
||||||
|
struct Counts {
|
||||||
|
covered_results: usize,
|
||||||
|
overlapped_results: usize,
|
||||||
|
covered_files: usize,
|
||||||
|
total_results: usize,
|
||||||
|
}
|
||||||
|
|
||||||
async fn run_evaluation(
|
async fn run_evaluation(
|
||||||
only_repo: Option<String>,
|
only_repo: Option<String>,
|
||||||
executor: &BackgroundExecutor,
|
executor: &BackgroundExecutor,
|
||||||
|
@ -297,12 +308,11 @@ async fn run_evaluation(
|
||||||
cx.update(|cx| languages::init(language_registry.clone(), node_runtime.clone(), cx))
|
cx.update(|cx| languages::init(language_registry.clone(), node_runtime.clone(), cx))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let mut covered_result_count = 0;
|
let mut counts = Counts::default();
|
||||||
let mut overlapped_result_count = 0;
|
|
||||||
let mut covered_file_count = 0;
|
|
||||||
let mut total_result_count = 0;
|
|
||||||
eprint!("Running evals.");
|
eprint!("Running evals.");
|
||||||
|
|
||||||
|
let mut failures = Vec::new();
|
||||||
|
|
||||||
for evaluation_project in evaluations {
|
for evaluation_project in evaluations {
|
||||||
if only_repo
|
if only_repo
|
||||||
.as_ref()
|
.as_ref()
|
||||||
|
@ -314,27 +324,24 @@ async fn run_evaluation(
|
||||||
eprint!("\r\x1B[2K");
|
eprint!("\r\x1B[2K");
|
||||||
eprint!(
|
eprint!(
|
||||||
"Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. Project: {}...",
|
"Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. Project: {}...",
|
||||||
covered_result_count,
|
counts.covered_results,
|
||||||
total_result_count,
|
counts.total_results,
|
||||||
overlapped_result_count,
|
counts.overlapped_results,
|
||||||
total_result_count,
|
counts.total_results,
|
||||||
covered_file_count,
|
counts.covered_files,
|
||||||
total_result_count,
|
counts.total_results,
|
||||||
evaluation_project.repo
|
evaluation_project.repo
|
||||||
);
|
);
|
||||||
|
|
||||||
let repo_db_path =
|
|
||||||
db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_")));
|
|
||||||
let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider.clone(), cx)
|
|
||||||
.await
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let repo_dir = repos_dir.join(&evaluation_project.repo);
|
let repo_dir = repos_dir.join(&evaluation_project.repo);
|
||||||
if !repo_dir.exists() || repo_dir.join(SKIP_EVAL_PATH).exists() {
|
if !repo_dir.exists() || repo_dir.join(SKIP_EVAL_PATH).exists() {
|
||||||
eprintln!("Skipping {}: directory not found", evaluation_project.repo);
|
eprintln!("Skipping {}: directory not found", evaluation_project.repo);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let repo_db_path =
|
||||||
|
db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_")));
|
||||||
|
|
||||||
let project = cx
|
let project = cx
|
||||||
.update(|cx| {
|
.update(|cx| {
|
||||||
Project::local(
|
Project::local(
|
||||||
|
@ -349,6 +356,64 @@ async fn run_evaluation(
|
||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
let repo = evaluation_project.repo.clone();
|
||||||
|
if let Err(err) = run_eval_project(
|
||||||
|
evaluation_project,
|
||||||
|
&user_store,
|
||||||
|
repo_db_path,
|
||||||
|
&repo_dir,
|
||||||
|
&mut counts,
|
||||||
|
project,
|
||||||
|
embedding_provider.clone(),
|
||||||
|
fs.clone(),
|
||||||
|
cx,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
eprintln!("{repo} eval failed with error: {:?}", err);
|
||||||
|
|
||||||
|
failures.push((repo, err));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
eprintln!(
|
||||||
|
"Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. {} failed.",
|
||||||
|
counts.covered_results,
|
||||||
|
counts.total_results,
|
||||||
|
counts.overlapped_results,
|
||||||
|
counts.total_results,
|
||||||
|
counts.covered_files,
|
||||||
|
counts.total_results,
|
||||||
|
failures.len(),
|
||||||
|
);
|
||||||
|
|
||||||
|
if failures.is_empty() {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
eprintln!("Failures:\n");
|
||||||
|
|
||||||
|
for (index, (repo, failure)) in failures.iter().enumerate() {
|
||||||
|
eprintln!("Failure #{} - {repo}\n{:?}", index + 1, failure);
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(anyhow::anyhow!("Some evals failed."))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(clippy::too_many_arguments)]
|
||||||
|
async fn run_eval_project(
|
||||||
|
evaluation_project: EvaluationProject,
|
||||||
|
user_store: &Model<UserStore>,
|
||||||
|
repo_db_path: PathBuf,
|
||||||
|
repo_dir: &Path,
|
||||||
|
counts: &mut Counts,
|
||||||
|
project: Model<Project>,
|
||||||
|
embedding_provider: Arc<dyn EmbeddingProvider>,
|
||||||
|
fs: Arc<dyn Fs>,
|
||||||
|
cx: &mut AsyncAppContext,
|
||||||
|
) -> Result<(), anyhow::Error> {
|
||||||
|
let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider, cx).await?;
|
||||||
|
|
||||||
let (worktree, _) = project
|
let (worktree, _) = project
|
||||||
.update(cx, |project, cx| {
|
.update(cx, |project, cx| {
|
||||||
project.find_or_create_worktree(repo_dir, true, cx)
|
project.find_or_create_worktree(repo_dir, true, cx)
|
||||||
|
@ -358,28 +423,54 @@ async fn run_evaluation(
|
||||||
worktree
|
worktree
|
||||||
.update(cx, |worktree, _| {
|
.update(cx, |worktree, _| {
|
||||||
worktree.as_local().unwrap().scan_complete()
|
worktree.as_local().unwrap().scan_complete()
|
||||||
})
|
})?
|
||||||
.unwrap()
|
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
let project_index = cx
|
let project_index = cx.update(|cx| semantic_index.create_project_index(project.clone(), cx))?;
|
||||||
.update(|cx| semantic_index.create_project_index(project.clone(), cx))
|
|
||||||
.unwrap();
|
|
||||||
wait_for_indexing_complete(&project_index, cx, Some(Duration::from_secs(120))).await;
|
wait_for_indexing_complete(&project_index, cx, Some(Duration::from_secs(120))).await;
|
||||||
|
|
||||||
for query in evaluation_project.queries {
|
for query in evaluation_project.queries {
|
||||||
let results = cx
|
let results = {
|
||||||
.update(|cx| {
|
// Retry search up to 3 times in case of timeout, network failure, etc.
|
||||||
|
let mut retries_remaining = 3;
|
||||||
|
let mut result;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match cx.update(|cx| {
|
||||||
let project_index = project_index.read(cx);
|
let project_index = project_index.read(cx);
|
||||||
project_index.search(query.query.clone(), SEARCH_RESULT_LIMIT, cx)
|
project_index.search(query.query.clone(), SEARCH_RESULT_LIMIT, cx)
|
||||||
})
|
}) {
|
||||||
.unwrap()
|
Ok(task) => match task.await {
|
||||||
.await
|
Ok(answer) => {
|
||||||
.unwrap();
|
result = Ok(answer);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
Err(err) => {
|
||||||
|
result = Err(err);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(err) => {
|
||||||
|
result = Err(err);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
let results = SemanticDb::load_results(results, &fs.clone(), &cx)
|
if retries_remaining > 0 {
|
||||||
.await
|
eprintln!(
|
||||||
.unwrap();
|
"Retrying search after it failed on query {:?} with {:?}",
|
||||||
|
query, result
|
||||||
|
);
|
||||||
|
retries_remaining -= 1;
|
||||||
|
} else {
|
||||||
|
eprintln!(
|
||||||
|
"Ran out of retries; giving up on search which failed on query {:?} with {:?}",
|
||||||
|
query, result
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
SemanticDb::load_results(result?, &fs.clone(), &cx).await?
|
||||||
|
};
|
||||||
|
|
||||||
let mut project_covered_result_count = 0;
|
let mut project_covered_result_count = 0;
|
||||||
let mut project_overlapped_result_count = 0;
|
let mut project_overlapped_result_count = 0;
|
||||||
|
@ -393,8 +484,7 @@ async fn run_evaluation(
|
||||||
for (ix, result) in results.iter().enumerate() {
|
for (ix, result) in results.iter().enumerate() {
|
||||||
if result.path.as_ref() == Path::new(&expected_result.file) {
|
if result.path.as_ref() == Path::new(&expected_result.file) {
|
||||||
file_matched = true;
|
file_matched = true;
|
||||||
let start_matched =
|
let start_matched = result.row_range.contains(&expected_result.lines.start());
|
||||||
result.row_range.contains(&expected_result.lines.start());
|
|
||||||
let end_matched = result.row_range.contains(&expected_result.lines.end());
|
let end_matched = result.row_range.contains(&expected_result.lines.end());
|
||||||
|
|
||||||
if start_matched || end_matched {
|
if start_matched || end_matched {
|
||||||
|
@ -439,35 +529,20 @@ async fn run_evaluation(
|
||||||
covered_result_indices,
|
covered_result_indices,
|
||||||
};
|
};
|
||||||
|
|
||||||
overlapped_result_count += query_results.overlapped_result_count;
|
counts.overlapped_results += query_results.overlapped_result_count;
|
||||||
covered_result_count += query_results.covered_result_count;
|
counts.covered_results += query_results.covered_result_count;
|
||||||
covered_file_count += query_results.covered_file_count;
|
counts.covered_files += query_results.covered_file_count;
|
||||||
total_result_count += query_results.total_result_count;
|
counts.total_results += query_results.total_result_count;
|
||||||
|
|
||||||
println!("{}", serde_json::to_string(&query_results).unwrap());
|
println!("{}", serde_json::to_string(&query_results)?);
|
||||||
}
|
}
|
||||||
|
|
||||||
user_store
|
user_store.update(cx, |_, _| {
|
||||||
.update(cx, |_, _| {
|
|
||||||
drop(semantic_index);
|
drop(semantic_index);
|
||||||
drop(project);
|
drop(project);
|
||||||
drop(worktree);
|
drop(worktree);
|
||||||
drop(project_index);
|
drop(project_index);
|
||||||
})
|
})
|
||||||
.unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
eprint!(
|
|
||||||
"Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured.",
|
|
||||||
covered_result_count,
|
|
||||||
total_result_count,
|
|
||||||
overlapped_result_count,
|
|
||||||
total_result_count,
|
|
||||||
covered_file_count,
|
|
||||||
total_result_count,
|
|
||||||
);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn wait_for_indexing_complete(
|
async fn wait_for_indexing_complete(
|
||||||
|
@ -524,7 +599,7 @@ async fn fetch_eval_repos(
|
||||||
let evaluations = fs::read(&evaluations_path).expect("failed to read evaluations.json");
|
let evaluations = fs::read(&evaluations_path).expect("failed to read evaluations.json");
|
||||||
let evaluations: Vec<EvaluationProject> = serde_json::from_slice(&evaluations).unwrap();
|
let evaluations: Vec<EvaluationProject> = serde_json::from_slice(&evaluations).unwrap();
|
||||||
|
|
||||||
eprint!("Fetching evaluation repositories...");
|
eprintln!("Fetching evaluation repositories...");
|
||||||
|
|
||||||
executor
|
executor
|
||||||
.scoped(move |scope| {
|
.scoped(move |scope| {
|
||||||
|
|
Loading…
Reference in a new issue