Optimize file insertion in the vector database

Co-authored-by: Max <max@zed.dev>
2025-01-30 14:17:02 +00:00 · 2023-09-01 13:01:37 -04:00 · 2023-09-01 13:01:37 -04:00 · e86964eb5d
commit e86964eb5d
parent 524533cfb2
2 changed files with 33 additions and 34 deletions
--- a/crates/semantic_index/src/db.rs
+++ b/crates/semantic_index/src/db.rs
@ -162,6 +162,11 @@ impl VectorDatabase {
                [],
            )?;
            db.execute(
                "CREATE UNIQUE INDEX files_worktree_id_and_relative_path ON files (worktree_id, relative_path)",
                [],
            )?;
            db.execute(
                "CREATE TABLE documents (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
@ -206,43 +211,37 @@ impl VectorDatabase {
            // Insert or replace the row for this (worktree_id, relative_path) and use the resulting row ID
            let mtime = Timestamp::from(mtime);
-            let mut existing_id_query = db.prepare("SELECT id FROM files WHERE worktree_id = ?1 AND relative_path = ?2 AND mtime_seconds = ?3 AND mtime_nanos = ?4")?;
+            db.execute(
-            let existing_id = existing_id_query
+                "
-                .query_row(
+                REPLACE INTO files
-                    params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
+                (worktree_id, relative_path, mtime_seconds, mtime_nanos)
-                    |row| Ok(row.get::<_, i64>(0)?),
+                VALUES (?1, ?2, ?3, ?4)
-                );
+                ",
                params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos],
            )?;
-            let file_id = if existing_id.is_ok() {
+            let file_id = db.last_insert_rowid();
-                // If already exists, just return the existing id
+
-                existing_id?
+            let mut query = db.prepare(
-            } else {
+                "
-                // Delete Existing Row
+                INSERT INTO documents
-                db.execute(
+                (file_id, start_byte, end_byte, name, embedding, digest)
-                    "DELETE FROM files WHERE worktree_id = ?1 AND relative_path = ?2;",
+                VALUES (?1, ?2, ?3, ?4, ?5, ?6)
-                    params![worktree_id, path.to_str()],
+                ",
-                )?;
+            )?;
                db.execute("INSERT INTO files (worktree_id, relative_path, mtime_seconds, mtime_nanos) VALUES (?1, ?2, ?3, ?4);", params![worktree_id, path.to_str(), mtime.seconds, mtime.nanos])?;
                db.last_insert_rowid()
            };
            // Currently inserting at approximately 3400 documents a second
            // I imagine we can speed this up with a bulk insert of some kind.
            for document in documents {
-                db.execute(
+                query.execute(params![
-                    "INSERT INTO documents (file_id, start_byte, end_byte, name, embedding, digest) VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
+                    file_id,
-                    params![
+                    document.range.start.to_string(),
-                        file_id,
+                    document.range.end.to_string(),
-                        document.range.start.to_string(),
+                    document.name,
-                        document.range.end.to_string(),
+                    document.embedding,
-                        document.name,
+                    document.digest
-                        document.embedding,
+                ])?;
-                        document.digest
+            }
                    ],
                )?;
           }
-           Ok(())
+            Ok(())
        })
    }
--- a/crates/semantic_index/src/semantic_index.rs
+++ b/crates/semantic_index/src/semantic_index.rs
@ -38,7 +38,7 @@ use util::{
 };
 use workspace::WorkspaceCreated;
-const SEMANTIC_INDEX_VERSION: usize = 8;
+const SEMANTIC_INDEX_VERSION: usize = 9;
 const BACKGROUND_INDEXING_DELAY: Duration = Duration::from_secs(600);
 const EMBEDDING_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_millis(250);