Make evals handle failures more gracefully (#18082)

Now, when an individual project eval fails, we no longer panic; instead we
add the failure to a list that we collect and report at the end (and we
make the exit code nonzero).
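
A minimal sketch of that pattern in isolation (names like `run_one` and the
repo list are illustrative stand-ins, not the actual eval API; it assumes the
`anyhow` crate, which the eval code already uses):

use anyhow::anyhow;

// Stand-in for running a single project eval; fails on an empty repo name.
fn run_one(repo: &str) -> anyhow::Result<()> {
    if repo.is_empty() {
        return Err(anyhow!("repo name was empty"));
    }
    Ok(())
}

fn run_all(repos: &[&str]) -> anyhow::Result<()> {
    let mut failures = Vec::new();

    for repo in repos {
        // On failure, record the error and keep going instead of panicking.
        if let Err(err) = run_one(repo) {
            eprintln!("{repo} eval failed with error: {err:?}");
            failures.push((repo.to_string(), err));
        }
    }

    if failures.is_empty() {
        Ok(())
    } else {
        for (index, (repo, err)) in failures.iter().enumerate() {
            eprintln!("Failure #{} - {repo}\n{err:?}", index + 1);
        }
        Err(anyhow!("Some evals failed."))
    }
}

fn main() -> anyhow::Result<()> {
    // Returning Err from main is what makes the process exit nonzero.
    run_all(&["zed-industries/zed", ""])
}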

Release Notes:

- N/A
Richard Feldman 2024-09-20 10:28:22 -04:00 committed by GitHub
parent d6c184b494
commit 5f1046b3cd


@@ -12,13 +12,16 @@ use language::LanguageRegistry;
 use node_runtime::FakeNodeRuntime;
 use open_ai::OpenAiEmbeddingModel;
 use project::Project;
-use semantic_index::{OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status};
+use semantic_index::{
+    EmbeddingProvider, OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status,
+};
 use serde::{Deserialize, Serialize};
 use settings::SettingsStore;
 use smol::channel::bounded;
 use smol::io::AsyncReadExt;
 use smol::Timer;
 use std::ops::RangeInclusive;
+use std::path::PathBuf;
 use std::time::Duration;
 use std::{
     fs,
@@ -237,6 +240,14 @@ async fn fetch_code_search_net_resources(http_client: &dyn HttpClient) -> Result
     Ok(())
 }
 
+#[derive(Default, Debug)]
+struct Counts {
+    covered_results: usize,
+    overlapped_results: usize,
+    covered_files: usize,
+    total_results: usize,
+}
+
 async fn run_evaluation(
     only_repo: Option<String>,
     executor: &BackgroundExecutor,
@@ -297,12 +308,11 @@ async fn run_evaluation(
     cx.update(|cx| languages::init(language_registry.clone(), node_runtime.clone(), cx))
         .unwrap();
 
-    let mut covered_result_count = 0;
-    let mut overlapped_result_count = 0;
-    let mut covered_file_count = 0;
-    let mut total_result_count = 0;
+    let mut counts = Counts::default();
 
     eprint!("Running evals.");
+    let mut failures = Vec::new();
+
     for evaluation_project in evaluations {
         if only_repo
             .as_ref()
@@ -314,27 +324,24 @@ async fn run_evaluation(
         eprint!("\r\x1B[2K");
         eprint!(
             "Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. Project: {}...",
-            covered_result_count,
-            total_result_count,
-            overlapped_result_count,
-            total_result_count,
-            covered_file_count,
-            total_result_count,
+            counts.covered_results,
+            counts.total_results,
+            counts.overlapped_results,
+            counts.total_results,
+            counts.covered_files,
+            counts.total_results,
             evaluation_project.repo
         );
 
-        let repo_db_path =
-            db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_")));
-        let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider.clone(), cx)
-            .await
-            .unwrap();
-
         let repo_dir = repos_dir.join(&evaluation_project.repo);
         if !repo_dir.exists() || repo_dir.join(SKIP_EVAL_PATH).exists() {
             eprintln!("Skipping {}: directory not found", evaluation_project.repo);
             continue;
         }
 
+        let repo_db_path =
+            db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_")));
+
         let project = cx
             .update(|cx| {
                 Project::local(
@@ -349,6 +356,64 @@ async fn run_evaluation(
             })
             .unwrap();
 
+        let repo = evaluation_project.repo.clone();
+        if let Err(err) = run_eval_project(
+            evaluation_project,
+            &user_store,
+            repo_db_path,
+            &repo_dir,
+            &mut counts,
+            project,
+            embedding_provider.clone(),
+            fs.clone(),
+            cx,
+        )
+        .await
+        {
+            eprintln!("{repo} eval failed with error: {:?}", err);
+            failures.push((repo, err));
+        }
+    }
+
+    eprintln!(
+        "Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. {} failed.",
+        counts.covered_results,
+        counts.total_results,
+        counts.overlapped_results,
+        counts.total_results,
+        counts.covered_files,
+        counts.total_results,
+        failures.len(),
+    );
+
+    if failures.is_empty() {
+        Ok(())
+    } else {
+        eprintln!("Failures:\n");
+
+        for (index, (repo, failure)) in failures.iter().enumerate() {
+            eprintln!("Failure #{} - {repo}\n{:?}", index + 1, failure);
+        }
+
+        Err(anyhow::anyhow!("Some evals failed."))
+    }
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn run_eval_project(
+    evaluation_project: EvaluationProject,
+    user_store: &Model<UserStore>,
+    repo_db_path: PathBuf,
+    repo_dir: &Path,
+    counts: &mut Counts,
+    project: Model<Project>,
+    embedding_provider: Arc<dyn EmbeddingProvider>,
+    fs: Arc<dyn Fs>,
+    cx: &mut AsyncAppContext,
+) -> Result<(), anyhow::Error> {
+    let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider, cx).await?;
+
     let (worktree, _) = project
         .update(cx, |project, cx| {
             project.find_or_create_worktree(repo_dir, true, cx)
@@ -358,28 +423,54 @@ async fn run_evaluation(
     worktree
         .update(cx, |worktree, _| {
             worktree.as_local().unwrap().scan_complete()
-        })
-        .unwrap()
+        })?
         .await;
 
-    let project_index = cx
-        .update(|cx| semantic_index.create_project_index(project.clone(), cx))
-        .unwrap();
+    let project_index = cx.update(|cx| semantic_index.create_project_index(project.clone(), cx))?;
 
     wait_for_indexing_complete(&project_index, cx, Some(Duration::from_secs(120))).await;
 
     for query in evaluation_project.queries {
-        let results = cx
-            .update(|cx| {
-                let project_index = project_index.read(cx);
-                project_index.search(query.query.clone(), SEARCH_RESULT_LIMIT, cx)
-            })
-            .unwrap()
-            .await
-            .unwrap();
-
-        let results = SemanticDb::load_results(results, &fs.clone(), &cx)
-            .await
-            .unwrap();
+        let results = {
+            // Retry search up to 3 times in case of timeout, network failure, etc.
+            let mut retries_remaining = 3;
+            let mut result;
+
+            loop {
+                match cx.update(|cx| {
+                    let project_index = project_index.read(cx);
+                    project_index.search(query.query.clone(), SEARCH_RESULT_LIMIT, cx)
+                }) {
+                    Ok(task) => match task.await {
+                        Ok(answer) => {
+                            result = Ok(answer);
+                            break;
+                        }
+                        Err(err) => {
+                            result = Err(err);
+                        }
+                    },
+                    Err(err) => {
+                        result = Err(err);
+                    }
+                }
+
+                if retries_remaining > 0 {
+                    eprintln!(
+                        "Retrying search after it failed on query {:?} with {:?}",
+                        query, result
+                    );
+                    retries_remaining -= 1;
+                } else {
+                    eprintln!(
+                        "Ran out of retries; giving up on search which failed on query {:?} with {:?}",
+                        query, result
+                    );
+                    break;
+                }
+            }
+
+            SemanticDb::load_results(result?, &fs.clone(), &cx).await?
+        };
 
         let mut project_covered_result_count = 0;
         let mut project_overlapped_result_count = 0;
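
An editorial aside on the retry block added above: it is a standard
bounded-retry shape, written inline so that `result?` can feed straight into
`SemanticDb::load_results`. The same idea as a self-contained sketch (the
`with_retries` helper and its closure are hypothetical, not part of this
commit):

// Retry a fallible operation a bounded number of times, logging each failure.
fn with_retries<T, E: std::fmt::Debug>(
    mut retries_remaining: u32,
    mut attempt: impl FnMut() -> Result<T, E>,
) -> Result<T, E> {
    loop {
        match attempt() {
            Ok(value) => return Ok(value),
            Err(err) if retries_remaining > 0 => {
                eprintln!("Retrying after failure: {err:?}");
                retries_remaining -= 1;
            }
            Err(err) => {
                eprintln!("Ran out of retries; giving up: {err:?}");
                return Err(err);
            }
        }
    }
}

fn main() {
    let mut calls = 0;
    // Fails twice, then succeeds; with 3 retries this returns Ok(42).
    let result = with_retries(3, || {
        calls += 1;
        if calls < 3 { Err("transient failure") } else { Ok(42) }
    });
    assert_eq!(result, Ok(42));
}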
@@ -393,8 +484,7 @@ async fn run_evaluation(
         for (ix, result) in results.iter().enumerate() {
             if result.path.as_ref() == Path::new(&expected_result.file) {
                 file_matched = true;
-                    let start_matched =
-                        result.row_range.contains(&expected_result.lines.start());
+                let start_matched = result.row_range.contains(&expected_result.lines.start());
                 let end_matched = result.row_range.contains(&expected_result.lines.end());
 
                 if start_matched || end_matched {
@@ -439,35 +529,20 @@ async fn run_evaluation(
             covered_result_indices,
         };
 
-            overlapped_result_count += query_results.overlapped_result_count;
-            covered_result_count += query_results.covered_result_count;
-            covered_file_count += query_results.covered_file_count;
-            total_result_count += query_results.total_result_count;
+        counts.overlapped_results += query_results.overlapped_result_count;
+        counts.covered_results += query_results.covered_result_count;
+        counts.covered_files += query_results.covered_file_count;
+        counts.total_results += query_results.total_result_count;
 
-            println!("{}", serde_json::to_string(&query_results).unwrap());
+        println!("{}", serde_json::to_string(&query_results)?);
     }
 
-        user_store
-            .update(cx, |_, _| {
-                drop(semantic_index);
-                drop(project);
-                drop(worktree);
-                drop(project_index);
-            })
-            .unwrap();
-    }
-
-    eprint!(
-        "Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured.",
-        covered_result_count,
-        total_result_count,
-        overlapped_result_count,
-        total_result_count,
-        covered_file_count,
-        total_result_count,
-    );
-
-    Ok(())
+    user_store.update(cx, |_, _| {
+        drop(semantic_index);
+        drop(project);
+        drop(worktree);
+        drop(project_index);
+    })
 }
 
 async fn wait_for_indexing_complete(
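
Worth noting: extracting `run_eval_project` into a function that returns
`Result` is what lets the hunks above swap `.unwrap()` calls for `?`, so a
failure propagates to the caller's `failures` list instead of killing the
run. A tiny illustrative contrast (the `step` function is hypothetical;
assumes the `anyhow` crate):

use anyhow::{anyhow, Result};

fn step() -> Result<u32> {
    Err(anyhow!("simulated failure"))
}

// Before the refactor: .unwrap() panics and aborts the entire eval run.
#[allow(dead_code)]
fn eval_before() -> u32 {
    step().unwrap()
}

// After: `?` hands the error back to the caller, which can record it
// and move on to the next project.
fn eval_after() -> Result<u32> {
    Ok(step()?)
}

fn main() {
    match eval_after() {
        Ok(value) => println!("got {value}"),
        Err(err) => eprintln!("recorded failure: {err:?}"),
    }
}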
@@ -524,7 +599,7 @@ async fn fetch_eval_repos(
     let evaluations = fs::read(&evaluations_path).expect("failed to read evaluations.json");
     let evaluations: Vec<EvaluationProject> = serde_json::from_slice(&evaluations).unwrap();
 
-    eprint!("Fetching evaluation repositories...");
+    eprintln!("Fetching evaluation repositories...");
 
     executor
         .scoped(move |scope| {