From 5f1046b3cd5290112f6dd464e49bc58661fd2179 Mon Sep 17 00:00:00 2001 From: Richard Feldman Date: Fri, 20 Sep 2024 10:28:22 -0400 Subject: [PATCH] Make evals handle failures more gracefully (#18082) Now when an individual project eval fails, instead of panicking we add it to a list of failures that we collect and report at the end (and make the exit code nonzero). Release Notes: - N/A --- crates/evals/src/eval.rs | 315 ++++++++++++++++++++++++--------------- 1 file changed, 195 insertions(+), 120 deletions(-) diff --git a/crates/evals/src/eval.rs b/crates/evals/src/eval.rs index 708cfa7511..0580053373 100644 --- a/crates/evals/src/eval.rs +++ b/crates/evals/src/eval.rs @@ -12,13 +12,16 @@ use language::LanguageRegistry; use node_runtime::FakeNodeRuntime; use open_ai::OpenAiEmbeddingModel; use project::Project; -use semantic_index::{OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status}; +use semantic_index::{ + EmbeddingProvider, OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status, +}; use serde::{Deserialize, Serialize}; use settings::SettingsStore; use smol::channel::bounded; use smol::io::AsyncReadExt; use smol::Timer; use std::ops::RangeInclusive; +use std::path::PathBuf; use std::time::Duration; use std::{ fs, @@ -237,6 +240,14 @@ async fn fetch_code_search_net_resources(http_client: &dyn HttpClient) -> Result Ok(()) } +#[derive(Default, Debug)] +struct Counts { + covered_results: usize, + overlapped_results: usize, + covered_files: usize, + total_results: usize, +} + async fn run_evaluation( only_repo: Option, executor: &BackgroundExecutor, @@ -297,12 +308,11 @@ async fn run_evaluation( cx.update(|cx| languages::init(language_registry.clone(), node_runtime.clone(), cx)) .unwrap(); - let mut covered_result_count = 0; - let mut overlapped_result_count = 0; - let mut covered_file_count = 0; - let mut total_result_count = 0; + let mut counts = Counts::default(); eprint!("Running evals."); + let mut failures = Vec::new(); + for evaluation_project in evaluations { if only_repo .as_ref() @@ -314,27 +324,24 @@ async fn run_evaluation( eprint!("\r\x1B[2K"); eprint!( "Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. Project: {}...", - covered_result_count, - total_result_count, - overlapped_result_count, - total_result_count, - covered_file_count, - total_result_count, + counts.covered_results, + counts.total_results, + counts.overlapped_results, + counts.total_results, + counts.covered_files, + counts.total_results, evaluation_project.repo ); - let repo_db_path = - db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_"))); - let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider.clone(), cx) - .await - .unwrap(); - let repo_dir = repos_dir.join(&evaluation_project.repo); if !repo_dir.exists() || repo_dir.join(SKIP_EVAL_PATH).exists() { eprintln!("Skipping {}: directory not found", evaluation_project.repo); continue; } + let repo_db_path = + db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_"))); + let project = cx .update(|cx| { Project::local( @@ -349,125 +356,193 @@ async fn run_evaluation( }) .unwrap(); - let (worktree, _) = project - .update(cx, |project, cx| { - project.find_or_create_worktree(repo_dir, true, cx) - })? - .await?; + let repo = evaluation_project.repo.clone(); + if let Err(err) = run_eval_project( + evaluation_project, + &user_store, + repo_db_path, + &repo_dir, + &mut counts, + project, + embedding_provider.clone(), + fs.clone(), + cx, + ) + .await + { + eprintln!("{repo} eval failed with error: {:?}", err); - worktree - .update(cx, |worktree, _| { - worktree.as_local().unwrap().scan_complete() - }) - .unwrap() - .await; + failures.push((repo, err)); + } + } - let project_index = cx - .update(|cx| semantic_index.create_project_index(project.clone(), cx)) - .unwrap(); - wait_for_indexing_complete(&project_index, cx, Some(Duration::from_secs(120))).await; + eprintln!( + "Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. {} failed.", + counts.covered_results, + counts.total_results, + counts.overlapped_results, + counts.total_results, + counts.covered_files, + counts.total_results, + failures.len(), + ); - for query in evaluation_project.queries { - let results = cx - .update(|cx| { + if failures.is_empty() { + Ok(()) + } else { + eprintln!("Failures:\n"); + + for (index, (repo, failure)) in failures.iter().enumerate() { + eprintln!("Failure #{} - {repo}\n{:?}", index + 1, failure); + } + + Err(anyhow::anyhow!("Some evals failed.")) + } +} + +#[allow(clippy::too_many_arguments)] +async fn run_eval_project( + evaluation_project: EvaluationProject, + user_store: &Model, + repo_db_path: PathBuf, + repo_dir: &Path, + counts: &mut Counts, + project: Model, + embedding_provider: Arc, + fs: Arc, + cx: &mut AsyncAppContext, +) -> Result<(), anyhow::Error> { + let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider, cx).await?; + + let (worktree, _) = project + .update(cx, |project, cx| { + project.find_or_create_worktree(repo_dir, true, cx) + })? + .await?; + + worktree + .update(cx, |worktree, _| { + worktree.as_local().unwrap().scan_complete() + })? + .await; + + let project_index = cx.update(|cx| semantic_index.create_project_index(project.clone(), cx))?; + wait_for_indexing_complete(&project_index, cx, Some(Duration::from_secs(120))).await; + + for query in evaluation_project.queries { + let results = { + // Retry search up to 3 times in case of timeout, network failure, etc. + let mut retries_remaining = 3; + let mut result; + + loop { + match cx.update(|cx| { let project_index = project_index.read(cx); project_index.search(query.query.clone(), SEARCH_RESULT_LIMIT, cx) - }) - .unwrap() - .await - .unwrap(); - - let results = SemanticDb::load_results(results, &fs.clone(), &cx) - .await - .unwrap(); - - let mut project_covered_result_count = 0; - let mut project_overlapped_result_count = 0; - let mut project_covered_file_count = 0; - let mut covered_result_indices = Vec::new(); - for expected_result in &query.expected_results { - let mut file_matched = false; - let mut range_overlapped = false; - let mut range_covered = false; - - for (ix, result) in results.iter().enumerate() { - if result.path.as_ref() == Path::new(&expected_result.file) { - file_matched = true; - let start_matched = - result.row_range.contains(&expected_result.lines.start()); - let end_matched = result.row_range.contains(&expected_result.lines.end()); - - if start_matched || end_matched { - range_overlapped = true; - } - - if start_matched && end_matched { - range_covered = true; - covered_result_indices.push(ix); + }) { + Ok(task) => match task.await { + Ok(answer) => { + result = Ok(answer); break; } + Err(err) => { + result = Err(err); + } + }, + Err(err) => { + result = Err(err); } } - if range_covered { - project_covered_result_count += 1 - }; - if range_overlapped { - project_overlapped_result_count += 1 - }; - if file_matched { - project_covered_file_count += 1 - }; + if retries_remaining > 0 { + eprintln!( + "Retrying search after it failed on query {:?} with {:?}", + query, result + ); + retries_remaining -= 1; + } else { + eprintln!( + "Ran out of retries; giving up on search which failed on query {:?} with {:?}", + query, result + ); + break; + } } - let outcome_repo = evaluation_project.repo.clone(); - let query_results = EvaluationQueryOutcome { - repo: outcome_repo, - query: query.query, - total_result_count: query.expected_results.len(), - covered_result_count: project_covered_result_count, - overlapped_result_count: project_overlapped_result_count, - covered_file_count: project_covered_file_count, - expected_results: query.expected_results, - actual_results: results - .iter() - .map(|result| EvaluationSearchResult { - file: result.path.to_string_lossy().to_string(), - lines: result.row_range.clone(), - }) - .collect(), - covered_result_indices, + SemanticDb::load_results(result?, &fs.clone(), &cx).await? + }; + + let mut project_covered_result_count = 0; + let mut project_overlapped_result_count = 0; + let mut project_covered_file_count = 0; + let mut covered_result_indices = Vec::new(); + for expected_result in &query.expected_results { + let mut file_matched = false; + let mut range_overlapped = false; + let mut range_covered = false; + + for (ix, result) in results.iter().enumerate() { + if result.path.as_ref() == Path::new(&expected_result.file) { + file_matched = true; + let start_matched = result.row_range.contains(&expected_result.lines.start()); + let end_matched = result.row_range.contains(&expected_result.lines.end()); + + if start_matched || end_matched { + range_overlapped = true; + } + + if start_matched && end_matched { + range_covered = true; + covered_result_indices.push(ix); + break; + } + } + } + + if range_covered { + project_covered_result_count += 1 + }; + if range_overlapped { + project_overlapped_result_count += 1 + }; + if file_matched { + project_covered_file_count += 1 }; - - overlapped_result_count += query_results.overlapped_result_count; - covered_result_count += query_results.covered_result_count; - covered_file_count += query_results.covered_file_count; - total_result_count += query_results.total_result_count; - - println!("{}", serde_json::to_string(&query_results).unwrap()); } + let outcome_repo = evaluation_project.repo.clone(); - user_store - .update(cx, |_, _| { - drop(semantic_index); - drop(project); - drop(worktree); - drop(project_index); - }) - .unwrap(); + let query_results = EvaluationQueryOutcome { + repo: outcome_repo, + query: query.query, + total_result_count: query.expected_results.len(), + covered_result_count: project_covered_result_count, + overlapped_result_count: project_overlapped_result_count, + covered_file_count: project_covered_file_count, + expected_results: query.expected_results, + actual_results: results + .iter() + .map(|result| EvaluationSearchResult { + file: result.path.to_string_lossy().to_string(), + lines: result.row_range.clone(), + }) + .collect(), + covered_result_indices, + }; + + counts.overlapped_results += query_results.overlapped_result_count; + counts.covered_results += query_results.covered_result_count; + counts.covered_files += query_results.covered_file_count; + counts.total_results += query_results.total_result_count; + + println!("{}", serde_json::to_string(&query_results)?); } - eprint!( - "Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured.", - covered_result_count, - total_result_count, - overlapped_result_count, - total_result_count, - covered_file_count, - total_result_count, - ); - - Ok(()) + user_store.update(cx, |_, _| { + drop(semantic_index); + drop(project); + drop(worktree); + drop(project_index); + }) } async fn wait_for_indexing_complete( @@ -524,7 +599,7 @@ async fn fetch_eval_repos( let evaluations = fs::read(&evaluations_path).expect("failed to read evaluations.json"); let evaluations: Vec = serde_json::from_slice(&evaluations).unwrap(); - eprint!("Fetching evaluation repositories..."); + eprintln!("Fetching evaluation repositories..."); executor .scoped(move |scope| {