Optimize file insertion in the vector database

Co-authored-by: Max <max@zed.dev>
This commit is contained in:
KCaverly 2023-09-01 13:01:37 -04:00
parent 524533cfb2
commit e86964eb5d
2 changed files with 33 additions and 34 deletions

View file

@@ -162,6 +162,11 @@ impl VectorDatabase {
[], [],
)?; )?;
db.execute(
"CREATE UNIQUE INDEX files_worktree_id_and_relative_path ON files (worktree_id, relative_path)",
[],
)?;
db.execute( db.execute(
"CREATE TABLE documents ( "CREATE TABLE documents (
id INTEGER PRIMARY KEY AUTOINCREMENT, id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -206,43 +211,37 @@ impl VectorDatabase {
// Return the existing ID, if both the file and mtime match // Return the existing ID, if both the file and mtime match
let mtime = Timestamp::from(mtime); let mtime = Timestamp::from(mtime);
let mut existing_id_query = db.prepare("SELECT id FROM files WHERE worktree_id = ?1 AND relative_path = ?2 AND mtime_seconds = ?3 AND mtime_nanos = ?4")?; db.execute(
let existing_id = existing_id_query "
.query_row( REPLACE INTO files
params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos], (worktree_id, relative_path, mtime_seconds, mtime_nanos)
|row| Ok(row.get::<_, i64>(0)?), VALUES (?1, ?2, ?3, ?4)
); ",
params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
)?;
let file_id = if existing_id.is_ok() { let file_id = db.last_insert_rowid();
// If already exists, just return the existing id
existing_id? let mut query = db.prepare(
} else { "
// Delete Existing Row INSERT INTO documents
db.execute( (file_id, start_byte, end_byte, name, embedding, digest)
"DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;", VALUES (?1, ?2, ?3, ?4, ?5, ?6)
params![worktree_id, path.to_str()], ",
)?; )?;
db.execute("INSERT INTO files (worktree_id, relative_path, mtime_seconds, mtime_nanos) VALUES (?1, ?2, ?3, ?4);", params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos])?;
db.last_insert_rowid()
};
// Currently inserting at approximately 3400 documents a second
// I imagine we can speed this up with a bulk insert of some kind.
for document in documents { for document in documents {
db.execute( query.execute(params![
"INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)", file_id,
params![ document.range.start.to_string(),
file_id, document.range.end.to_string(),
document.range.start.to_string(), document.name,
document.range.end.to_string(), document.embedding,
document.name, document.digest
document.embedding, ])?;
document.digest }
],
)?;
}
Ok(()) Ok(())
}) })
} }

View file

@@ -38,7 +38,7 @@ use util::{
}; };
use workspace::WorkspaceCreated; use workspace::WorkspaceCreated;
const SEMANTIC_INDEX_VERSION: usize = 8; const SEMANTIC_INDEX_VERSION: usize = 9;
const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600); const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);
const EMBEDDING_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_millis(250); const EMBEDDING_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_millis(250);