From 648035b194dd1b8c658a6411b9f9fe19467ccc10 Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Fri, 13 Mar 2026 10:18:31 +0300 Subject: Fix: slow startup from silly migrations --- storage/migrate.go | 11 +++ storage/migrations/004_populate_fts.up.sql | 30 +------- .../migrations/005_drop_unused_embeddings.down.sql | 87 ++++++++++++++++++++++ .../migrations/005_drop_unused_embeddings.up.sql | 32 ++++++++ storage/vector.go | 76 ++++--------------- 5 files changed, 147 insertions(+), 89 deletions(-) create mode 100644 storage/migrations/005_drop_unused_embeddings.down.sql create mode 100644 storage/migrations/005_drop_unused_embeddings.up.sql (limited to 'storage') diff --git a/storage/migrate.go b/storage/migrate.go index 38f9854..b6fed37 100644 --- a/storage/migrate.go +++ b/storage/migrate.go @@ -23,9 +23,20 @@ func (p *ProviderSQL) Migrate() error { p.logger.Error("Failed to read migrations directory;", "error", err) return fmt.Errorf("failed to read migrations directory: %w", err) } + + // Check if FTS already has data - skip populate migration if so + var ftsCount int + _ = p.db.QueryRow("SELECT COUNT(*) FROM fts_embeddings").Scan(&ftsCount) + skipFTSMigration := ftsCount > 0 + // Execute each .up.sql file for _, file := range files { if strings.HasSuffix(file.Name(), ".up.sql") { + // Skip FTS populate migration if already populated + if skipFTSMigration && strings.Contains(file.Name(), "004_populate_fts") { + p.logger.Debug("Skipping FTS migration - already populated", "file", file.Name()) + continue + } err := p.executeMigration(migrationsDir, file.Name()) if err != nil { p.logger.Error("Failed to execute migration %s: %v", file.Name(), err) diff --git a/storage/migrations/004_populate_fts.up.sql b/storage/migrations/004_populate_fts.up.sql index 1d1b16a..1068bf7 100644 --- a/storage/migrations/004_populate_fts.up.sql +++ b/storage/migrations/004_populate_fts.up.sql @@ -1,26 +1,4 @@ --- Populate FTS table with existing embeddings -DELETE FROM fts_embeddings; 
- -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 384 FROM embeddings_384; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 768 FROM embeddings_768; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 1024 FROM embeddings_1024; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 1536 FROM embeddings_1536; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 2048 FROM embeddings_2048; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 3072 FROM embeddings_3072; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 4096 FROM embeddings_4096; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 5120 FROM embeddings_5120; \ No newline at end of file +-- Populate FTS table with existing embeddings (incremental - only inserts missing rows) +-- Only the 768-dimension embeddings table is read, since it is the only size still in use +INSERT OR IGNORE INTO fts_embeddings (slug, raw_text, filename, embedding_size) +SELECT slug, raw_text, filename, 768 FROM embeddings_768; \ No newline at end of file diff --git a/storage/migrations/005_drop_unused_embeddings.down.sql b/storage/migrations/005_drop_unused_embeddings.down.sql new file mode 100644 index 0000000..063cb88 --- /dev/null +++ b/storage/migrations/005_drop_unused_embeddings.down.sql @@ -0,0 +1,87 @@ +-- Recreate unused embedding tables (for rollback) +CREATE TABLE IF NOT EXISTS embeddings_384 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS 
embeddings_1024 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_1536 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_2048 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_3072 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_4096 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_5120 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_embeddings_384_filename ON embeddings_384(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_1024_filename ON embeddings_1024(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_1536_filename ON embeddings_1536(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_2048_filename ON embeddings_2048(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_3072_filename ON embeddings_3072(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_4096_filename ON embeddings_4096(filename); +CREATE INDEX IF NOT EXISTS 
idx_embeddings_5120_filename ON embeddings_5120(filename); + +CREATE INDEX IF NOT EXISTS idx_embeddings_384_slug ON embeddings_384(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_1024_slug ON embeddings_1024(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_1536_slug ON embeddings_1536(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_2048_slug ON embeddings_2048(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_3072_slug ON embeddings_3072(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_4096_slug ON embeddings_4096(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_5120_slug ON embeddings_5120(slug); + +CREATE INDEX IF NOT EXISTS idx_embeddings_384_created_at ON embeddings_384(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_1024_created_at ON embeddings_1024(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_1536_created_at ON embeddings_1536(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_2048_created_at ON embeddings_2048(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_3072_created_at ON embeddings_3072(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_4096_created_at ON embeddings_4096(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_5120_created_at ON embeddings_5120(created_at); diff --git a/storage/migrations/005_drop_unused_embeddings.up.sql b/storage/migrations/005_drop_unused_embeddings.up.sql new file mode 100644 index 0000000..f26e30f --- /dev/null +++ b/storage/migrations/005_drop_unused_embeddings.up.sql @@ -0,0 +1,32 @@ +-- Drop unused embedding tables (we only use 768) +DROP INDEX IF EXISTS idx_embeddings_384_filename; +DROP INDEX IF EXISTS idx_embeddings_1024_filename; +DROP INDEX IF EXISTS idx_embeddings_1536_filename; +DROP INDEX IF EXISTS idx_embeddings_2048_filename; +DROP INDEX IF EXISTS idx_embeddings_3072_filename; +DROP INDEX IF EXISTS idx_embeddings_4096_filename; +DROP INDEX IF EXISTS idx_embeddings_5120_filename; + +DROP INDEX IF EXISTS idx_embeddings_384_slug; +DROP INDEX IF EXISTS 
idx_embeddings_1024_slug; +DROP INDEX IF EXISTS idx_embeddings_1536_slug; +DROP INDEX IF EXISTS idx_embeddings_2048_slug; +DROP INDEX IF EXISTS idx_embeddings_3072_slug; +DROP INDEX IF EXISTS idx_embeddings_4096_slug; +DROP INDEX IF EXISTS idx_embeddings_5120_slug; + +DROP INDEX IF EXISTS idx_embeddings_384_created_at; +DROP INDEX IF EXISTS idx_embeddings_1024_created_at; +DROP INDEX IF EXISTS idx_embeddings_1536_created_at; +DROP INDEX IF EXISTS idx_embeddings_2048_created_at; +DROP INDEX IF EXISTS idx_embeddings_3072_created_at; +DROP INDEX IF EXISTS idx_embeddings_4096_created_at; +DROP INDEX IF EXISTS idx_embeddings_5120_created_at; + +DROP TABLE IF EXISTS embeddings_384; +DROP TABLE IF EXISTS embeddings_1024; +DROP TABLE IF EXISTS embeddings_1536; +DROP TABLE IF EXISTS embeddings_2048; +DROP TABLE IF EXISTS embeddings_3072; +DROP TABLE IF EXISTS embeddings_4096; +DROP TABLE IF EXISTS embeddings_5120; diff --git a/storage/vector.go b/storage/vector.go index e3bbb89..e8ecb52 100644 --- a/storage/vector.go +++ b/storage/vector.go @@ -48,22 +48,8 @@ func mathBitsToFloat32(b uint32) float32 { func fetchTableName(emb []float32) (string, error) { switch len(emb) { - case 384: - return "embeddings_384", nil case 768: return "embeddings_768", nil - case 1024: - return "embeddings_1024", nil - case 1536: - return "embeddings_1536", nil - case 2048: - return "embeddings_2048", nil - case 3072: - return "embeddings_3072", nil - case 4096: - return "embeddings_4096", nil - case 5120: - return "embeddings_5120", nil default: return "", fmt.Errorf("no table for the size of %d", len(emb)) } @@ -170,62 +156,26 @@ func sqrt(f float32) float32 { } func (p ProviderSQL) ListFiles() ([]string, error) { - fileLists := make([][]string, 0) - - // Query all supported tables and combine results - tableNames := []string{ - "embeddings_384", "embeddings_768", "embeddings_1024", "embeddings_1536", - "embeddings_2048", "embeddings_3072", "embeddings_4096", "embeddings_5120", - } - for _, 
table := range tableNames { - query := "SELECT DISTINCT filename FROM " + table - rows, err := p.db.Query(query) - if err != nil { - // Continue if one table doesn't exist - continue - } - - var files []string - for rows.Next() { - var filename string - if err := rows.Scan(&filename); err != nil { - continue - } - files = append(files, filename) - } - rows.Close() - - fileLists = append(fileLists, files) + query := "SELECT DISTINCT filename FROM embeddings_768" + rows, err := p.db.Query(query) + if err != nil { + return nil, err } + defer rows.Close() - // Combine and deduplicate - fileSet := make(map[string]bool) var allFiles []string - for _, files := range fileLists { - for _, file := range files { - if !fileSet[file] { - fileSet[file] = true - allFiles = append(allFiles, file) - } + for rows.Next() { + var filename string + if err := rows.Scan(&filename); err != nil { + continue } + allFiles = append(allFiles, filename) } return allFiles, nil } func (p ProviderSQL) RemoveEmbByFileName(filename string) error { - var errors []string - tableNames := []string{ - "embeddings_384", "embeddings_768", "embeddings_1024", "embeddings_1536", - "embeddings_2048", "embeddings_3072", "embeddings_4096", "embeddings_5120", - } - for _, table := range tableNames { - query := fmt.Sprintf("DELETE FROM %s WHERE filename = ?", table) - if _, err := p.db.Exec(query, filename); err != nil { - errors = append(errors, err.Error()) - } - } - if len(errors) > 0 { - return fmt.Errorf("errors occurred: %v", errors) - } - return nil + query := "DELETE FROM embeddings_768 WHERE filename = ?" + _, err := p.db.Exec(query, filename) + return err } -- cgit v1.2.3