diff options
| -rw-r--r-- | bot.go | 28 | ||||
| -rw-r--r-- | rag/rag.go | 4 | ||||
| -rw-r--r-- | rag/rag_integration_test.go | 1 | ||||
| -rw-r--r-- | rag/rag_real_test.go | 2 | ||||
| -rw-r--r-- | rag/rag_test.go | 4 | ||||
| -rw-r--r-- | storage/migrate.go | 11 | ||||
| -rw-r--r-- | storage/migrations/004_populate_fts.up.sql | 30 | ||||
| -rw-r--r-- | storage/migrations/005_drop_unused_embeddings.down.sql | 87 | ||||
| -rw-r--r-- | storage/migrations/005_drop_unused_embeddings.up.sql | 32 | ||||
| -rw-r--r-- | storage/vector.go | 77 |
10 files changed, 162 insertions, 114 deletions
@@ -1529,21 +1529,23 @@ func init() { asr = NewSTT(logger, cfg) } if cfg.PlaywrightEnabled { - if err := checkPlaywright(); err != nil { - // slow, need a faster check if playwright install - if err := installPW(); err != nil { - logger.Error("failed to install playwright", "error", err) - cancel() - os.Exit(1) - return - } + go func() { if err := checkPlaywright(); err != nil { - logger.Error("failed to run playwright", "error", err) - cancel() - os.Exit(1) - return + // slow, need a faster check if playwright install + if err := installPW(); err != nil { + logger.Error("failed to install playwright", "error", err) + cancel() + os.Exit(1) + return + } + if err := checkPlaywright(); err != nil { + logger.Error("failed to run playwright", "error", err) + cancel() + os.Exit(1) + return + } } - } + }() } // atomic default values cachedModelColor.Store("orange") @@ -47,7 +47,6 @@ func isStopWord(word string) bool { func detectPhrases(query string) []string { words := strings.Fields(strings.ToLower(query)) var phrases []string - for i := 0; i < len(words)-1; i++ { word1 := strings.Trim(words[i], ".,!?;:'\"()[]{}") word2 := strings.Trim(words[i+1], ".,!?;:'\"()[]{}") @@ -70,7 +69,6 @@ func detectPhrases(query string) []string { } } } - return phrases } @@ -122,7 +120,6 @@ func areSlugsAdjacent(slug1, slug2 string) bool { if prefix1 != prefix2 { return false } - batch1, chunk1, ok1 := parseSlugIndices(slug1) batch2, chunk2, ok2 := parseSlugIndices(slug2) if !ok1 || !ok2 { @@ -843,7 +840,6 @@ func (r *RAG) GenerateQueryVariations(query string) []string { } } } - return variations } diff --git a/rag/rag_integration_test.go b/rag/rag_integration_test.go index f3405eb..6cd56c6 100644 --- a/rag/rag_integration_test.go +++ b/rag/rag_integration_test.go @@ -297,7 +297,6 @@ relative term as it was still a few kilometers away. The clan house was made of brick and conformed to an older style of architecture. Nearly everyone knew about this mansion and its clock tower. It stood tall over the neighboring mansions and rumor had it that you could see the whole capital from the top. It spoke to this clan’s renown and history that they were able to get away with building something that dwarfed the mansions of the nobility.` - chunks := []*models.VectorRow{ { Slug: "kjv_bible.epub_1786_0", diff --git a/rag/rag_real_test.go b/rag/rag_real_test.go index 87f6906..a4a2508 100644 --- a/rag/rag_real_test.go +++ b/rag/rag_real_test.go @@ -41,7 +41,6 @@ func TestRealBiblicalQuery(t *testing.T) { t.Fatalf("failed to create RAG instance: %v", err) } t.Cleanup(func() { rag.Destroy() }) - query := "bald prophet and two she bears" results, err := rag.Search(query, 30) if err != nil { @@ -95,7 +94,6 @@ func TestRealQueryVariations(t *testing.T) { t.Fatalf("failed to create RAG instance: %v", err) } t.Cleanup(func() { rag.Destroy() }) - tests := []struct { name string query string diff --git a/rag/rag_test.go b/rag/rag_test.go index 4944007..a2204ee 100644 --- a/rag/rag_test.go +++ b/rag/rag_test.go @@ -30,7 +30,6 @@ func TestDetectPhrases(t *testing.T) { expect: []string{}, }, } - for _, tt := range tests { got := detectPhrases(tt.query) if len(got) != len(tt.expect) { @@ -73,7 +72,6 @@ func TestCountPhraseMatches(t *testing.T) { expect: 2, // "she bears" and "bald prophet" }, } - for _, tt := range tests { got := countPhraseMatches(tt.text, tt.query) if got != tt.expect { @@ -119,7 +117,6 @@ func TestAreSlugsAdjacent(t *testing.T) { expect: true, // sequential batches with same chunk index are adjacent }, } - for _, tt := range tests { got := areSlugsAdjacent(tt.slug1, tt.slug2) if got != tt.expect { @@ -141,7 +138,6 @@ func TestParseSlugIndices(t *testing.T) { {"file_abc_def", 0, 0, false}, {"file_123_456_extra", 456, 0, false}, // regex matches last two numbers } - for _, tt := range tests { batch, chunk, ok := parseSlugIndices(tt.slug) if ok != tt.wantOk { diff --git a/storage/migrate.go b/storage/migrate.go index 38f9854..b6fed37 100644 --- a/storage/migrate.go +++ b/storage/migrate.go @@ -23,9 +23,20 @@ func (p *ProviderSQL) Migrate() error { p.logger.Error("Failed to read migrations directory;", "error", err) return fmt.Errorf("failed to read migrations directory: %w", err) } + + // Check if FTS already has data - skip populate migration if so + var ftsCount int + _ = p.db.QueryRow("SELECT COUNT(*) FROM fts_embeddings").Scan(&ftsCount) + skipFTSMigration := ftsCount > 0 + // Execute each .up.sql file for _, file := range files { if strings.HasSuffix(file.Name(), ".up.sql") { + // Skip FTS populate migration if already populated + if skipFTSMigration && strings.Contains(file.Name(), "004_populate_fts") { + p.logger.Debug("Skipping FTS migration - already populated", "file", file.Name()) + continue + } err := p.executeMigration(migrationsDir, file.Name()) if err != nil { p.logger.Error("Failed to execute migration %s: %v", file.Name(), err) diff --git a/storage/migrations/004_populate_fts.up.sql b/storage/migrations/004_populate_fts.up.sql index 1d1b16a..1068bf7 100644 --- a/storage/migrations/004_populate_fts.up.sql +++ b/storage/migrations/004_populate_fts.up.sql @@ -1,26 +1,4 @@ --- Populate FTS table with existing embeddings -DELETE FROM fts_embeddings; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 384 FROM embeddings_384; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 768 FROM embeddings_768; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 1024 FROM embeddings_1024; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 1536 FROM embeddings_1536; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 2048 FROM embeddings_2048; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 3072 FROM embeddings_3072; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 4096 FROM embeddings_4096; - -INSERT INTO fts_embeddings (slug, raw_text, filename, embedding_size) -SELECT slug, raw_text, filename, 5120 FROM embeddings_5120;
\ No newline at end of file +-- Populate FTS table with existing embeddings (incremental - only inserts missing rows) +-- Only use 768 embeddings as that's what we use +INSERT OR IGNORE INTO fts_embeddings (slug, raw_text, filename, embedding_size) +SELECT slug, raw_text, filename, 768 FROM embeddings_768;
\ No newline at end of file diff --git a/storage/migrations/005_drop_unused_embeddings.down.sql b/storage/migrations/005_drop_unused_embeddings.down.sql new file mode 100644 index 0000000..063cb88 --- /dev/null +++ b/storage/migrations/005_drop_unused_embeddings.down.sql @@ -0,0 +1,87 @@ +-- Recreate unused embedding tables (for rollback) +CREATE TABLE IF NOT EXISTS embeddings_384 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_1024 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_1536 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_2048 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_3072 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_4096 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE TABLE IF NOT EXISTS embeddings_5120 ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + embeddings BLOB NOT NULL, + slug TEXT NOT NULL, + raw_text TEXT NOT NULL, + filename TEXT NOT NULL, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +); + +CREATE INDEX IF NOT EXISTS idx_embeddings_384_filename ON embeddings_384(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_1024_filename ON embeddings_1024(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_1536_filename ON embeddings_1536(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_2048_filename ON embeddings_2048(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_3072_filename ON embeddings_3072(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_4096_filename ON embeddings_4096(filename); +CREATE INDEX IF NOT EXISTS idx_embeddings_5120_filename ON embeddings_5120(filename); + +CREATE INDEX IF NOT EXISTS idx_embeddings_384_slug ON embeddings_384(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_1024_slug ON embeddings_1024(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_1536_slug ON embeddings_1536(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_2048_slug ON embeddings_2048(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_3072_slug ON embeddings_3072(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_4096_slug ON embeddings_4096(slug); +CREATE INDEX IF NOT EXISTS idx_embeddings_5120_slug ON embeddings_5120(slug); + +CREATE INDEX IF NOT EXISTS idx_embeddings_384_created_at ON embeddings_384(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_1024_created_at ON embeddings_1024(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_1536_created_at ON embeddings_1536(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_2048_created_at ON embeddings_2048(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_3072_created_at ON embeddings_3072(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_4096_created_at ON embeddings_4096(created_at); +CREATE INDEX IF NOT EXISTS idx_embeddings_5120_created_at ON embeddings_5120(created_at); diff --git a/storage/migrations/005_drop_unused_embeddings.up.sql b/storage/migrations/005_drop_unused_embeddings.up.sql new file mode 100644 index 0000000..f26e30f --- /dev/null +++ b/storage/migrations/005_drop_unused_embeddings.up.sql @@ -0,0 +1,32 @@ +-- Drop unused embedding tables (we only use 768) +DROP INDEX IF EXISTS idx_embeddings_384_filename; +DROP INDEX IF EXISTS idx_embeddings_1024_filename; +DROP INDEX IF EXISTS idx_embeddings_1536_filename; +DROP INDEX IF EXISTS idx_embeddings_2048_filename; +DROP INDEX IF EXISTS idx_embeddings_3072_filename; +DROP INDEX IF EXISTS idx_embeddings_4096_filename; +DROP INDEX IF EXISTS idx_embeddings_5120_filename; + +DROP INDEX IF EXISTS idx_embeddings_384_slug; +DROP INDEX IF EXISTS idx_embeddings_1024_slug; +DROP INDEX IF EXISTS idx_embeddings_1536_slug; +DROP INDEX IF EXISTS idx_embeddings_2048_slug; +DROP INDEX IF EXISTS idx_embeddings_3072_slug; +DROP INDEX IF EXISTS idx_embeddings_4096_slug; +DROP INDEX IF EXISTS idx_embeddings_5120_slug; + +DROP INDEX IF EXISTS idx_embeddings_384_created_at; +DROP INDEX IF EXISTS idx_embeddings_1024_created_at; +DROP INDEX IF EXISTS idx_embeddings_1536_created_at; +DROP INDEX IF EXISTS idx_embeddings_2048_created_at; +DROP INDEX IF EXISTS idx_embeddings_3072_created_at; +DROP INDEX IF EXISTS idx_embeddings_4096_created_at; +DROP INDEX IF EXISTS idx_embeddings_5120_created_at; + +DROP TABLE IF EXISTS embeddings_384; +DROP TABLE IF EXISTS embeddings_1024; +DROP TABLE IF EXISTS embeddings_1536; +DROP TABLE IF EXISTS embeddings_2048; +DROP TABLE IF EXISTS embeddings_3072; +DROP TABLE IF EXISTS embeddings_4096; +DROP TABLE IF EXISTS embeddings_5120; diff --git a/storage/vector.go b/storage/vector.go index e3bbb89..fed78a9 100644 --- a/storage/vector.go +++ b/storage/vector.go @@ -48,22 +48,8 @@ func mathBitsToFloat32(b uint32) float32 { func fetchTableName(emb []float32) (string, error) { switch len(emb) { - case 384: - return "embeddings_384", nil case 768: return "embeddings_768", nil - case 1024: - return "embeddings_1024", nil - case 1536: - return "embeddings_1536", nil - case 2048: - return "embeddings_2048", nil - case 3072: - return "embeddings_3072", nil - case 4096: - return "embeddings_4096", nil - case 5120: - return "embeddings_5120", nil default: return "", fmt.Errorf("no table for the size of %d", len(emb)) } @@ -170,62 +156,25 @@ func sqrt(f float32) float32 { } func (p ProviderSQL) ListFiles() ([]string, error) { - fileLists := make([][]string, 0) - - // Query all supported tables and combine results - tableNames := []string{ - "embeddings_384", "embeddings_768", "embeddings_1024", "embeddings_1536", - "embeddings_2048", "embeddings_3072", "embeddings_4096", "embeddings_5120", - } - for _, table := range tableNames { - query := "SELECT DISTINCT filename FROM " + table - rows, err := p.db.Query(query) - if err != nil { - // Continue if one table doesn't exist - continue - } - - var files []string - for rows.Next() { - var filename string - if err := rows.Scan(&filename); err != nil { - continue - } - files = append(files, filename) - } - rows.Close() - - fileLists = append(fileLists, files) + query := "SELECT DISTINCT filename FROM embeddings_768" + rows, err := p.db.Query(query) + if err != nil { + return nil, err } - - // Combine and deduplicate - fileSet := make(map[string]bool) + defer rows.Close() var allFiles []string - for _, files := range fileLists { - for _, file := range files { - if !fileSet[file] { - fileSet[file] = true - allFiles = append(allFiles, file) - } + for rows.Next() { + var filename string + if err := rows.Scan(&filename); err != nil { + continue } + allFiles = append(allFiles, filename) } return allFiles, nil } func (p ProviderSQL) RemoveEmbByFileName(filename string) error { - var errors []string - tableNames := []string{ - "embeddings_384", "embeddings_768", "embeddings_1024", "embeddings_1536", - "embeddings_2048", "embeddings_3072", "embeddings_4096", "embeddings_5120", - } - for _, table := range tableNames { - query := fmt.Sprintf("DELETE FROM %s WHERE filename = ?", table) - if _, err := p.db.Exec(query, filename); err != nil { - errors = append(errors, err.Error()) - } - } - if len(errors) > 0 { - return fmt.Errorf("errors occurred: %v", errors) - } - return nil + query := "DELETE FROM embeddings_768 WHERE filename = ?" + _, err := p.db.Exec(query, filename) + return err } |
