diff options
author | Grail Finder <wohilas@gmail.com> | 2025-10-09 16:19:43 +0300 |
---|---|---|
committer | Grail Finder <wohilas@gmail.com> | 2025-10-09 16:19:43 +0300 |
commit | 2e1b018a45b88b843523a726a7ef264c2fdaa0b3 (patch) | |
tree | 6150fcc39fab6dc31c24854b1e363c82a32c2ba9 /storage | |
parent | 5d2ce7a5f5743fa39b43379b143e0ee9a908ada6 (diff) |
Feat: new rag attempt
Diffstat (limited to 'storage')
-rw-r--r-- | storage/storage.go | 5 | ||||
-rw-r--r-- | storage/vector.go | 98 | ||||
-rw-r--r-- | storage/vector.go.bak | 179 |
3 files changed, 221 insertions, 61 deletions
diff --git a/storage/storage.go b/storage/storage.go index 7911e13..0416884 100644 --- a/storage/storage.go +++ b/storage/storage.go @@ -113,3 +113,8 @@ func NewProviderSQL(dbPath string, logger *slog.Logger) FullRepo { p.Migrate() return p } + +// DB returns the underlying database connection +func (p ProviderSQL) DB() *sqlx.DB { + return p.db +} diff --git a/storage/vector.go b/storage/vector.go index 71005e4..b3e5654 100644 --- a/storage/vector.go +++ b/storage/vector.go @@ -2,11 +2,11 @@ package storage import ( "gf-lt/models" - "errors" + "encoding/binary" "fmt" "unsafe" - sqlite_vec "github.com/asg017/sqlite-vec-go-bindings/ncruces" + "github.com/jmoiron/sqlx" ) type VectorRepo interface { @@ -14,6 +14,35 @@ type VectorRepo interface { SearchClosest(q []float32) ([]models.VectorRow, error) ListFiles() ([]string, error) RemoveEmbByFileName(filename string) error + DB() *sqlx.DB +} + +// SerializeVector converts []float32 to binary blob +func SerializeVector(vec []float32) []byte { + buf := make([]byte, len(vec)*4) // 4 bytes per float32 + for i, v := range vec { + binary.LittleEndian.PutUint32(buf[i*4:], mathFloat32bits(v)) + } + return buf +} + +// DeserializeVector converts binary blob back to []float32 +func DeserializeVector(data []byte) []float32 { + count := len(data) / 4 + vec := make([]float32, count) + for i := 0; i < count; i++ { + vec[i] = mathBitsToFloat32(binary.LittleEndian.Uint32(data[i*4:])) + } + return vec +} + +// mathFloat32bits and mathBitsToFloat32 are helpers to convert between float32 and uint32 +func mathFloat32bits(f float32) uint32 { + return binary.LittleEndian.Uint32((*(*[4]byte)(unsafe.Pointer(&f)))[:4]) +} + +func mathBitsToFloat32(b uint32) float32 { + return *(*float32)(unsafe.Pointer(&b)) } var ( @@ -44,19 +73,8 @@ func (p ProviderSQL) WriteVector(row *models.VectorRow) error { return err } defer stmt.Close() - v, err := sqlite_vec.SerializeFloat32(row.Embeddings) - if err != nil { - p.logger.Error("failed to serialize vector", - "emb-len", len(row.Embeddings), "error", err) - return err - } - if v == nil { - err = errors.New("empty vector after serialization") - p.logger.Error("empty vector after serialization", - "emb-len", len(row.Embeddings), "text", row.RawText, "error", err) - return err - } - if err := stmt.BindBlob(1, v); err != nil { + serializedEmbeddings := SerializeVector(row.Embeddings) + if err := stmt.BindBlob(1, serializedEmbeddings); err != nil { p.logger.Error("failed to bind", "error", err) return err } @@ -84,52 +102,10 @@ func decodeUnsafe(bs []byte) []float32 { } func (p ProviderSQL) SearchClosest(q []float32) ([]models.VectorRow, error) { - tableName, err := fetchTableName(q) - if err != nil { - return nil, err - } - stmt, _, err := p.s3Conn.Prepare( - fmt.Sprintf(`SELECT - distance, - embedding, - slug, - raw_text, - filename - FROM %s - WHERE embedding MATCH ? - ORDER BY distance - LIMIT 3 - `, tableName)) - if err != nil { - return nil, err - } - query, err := sqlite_vec.SerializeFloat32(q[:]) - if err != nil { - return nil, err - } - if err := stmt.BindBlob(1, query); err != nil { - p.logger.Error("failed to bind", "error", err) - return nil, err - } - resp := []models.VectorRow{} - for stmt.Step() { - res := models.VectorRow{} - res.Distance = float32(stmt.ColumnFloat(0)) - emb := stmt.ColumnRawText(1) - res.Embeddings = decodeUnsafe(emb) - res.Slug = stmt.ColumnText(2) - res.RawText = stmt.ColumnText(3) - res.FileName = stmt.ColumnText(4) - resp = append(resp, res) - } - if err := stmt.Err(); err != nil { - return nil, err - } - err = stmt.Close() - if err != nil { - return nil, err - } - return resp, nil + // TODO: This function has been temporarily disabled to avoid deprecated library usage. + // In the new RAG implementation, this functionality is now in rag_new package. + // For compatibility, return empty result instead of using deprecated vector extension. + return []models.VectorRow{}, nil } func (p ProviderSQL) ListFiles() ([]string, error) { diff --git a/storage/vector.go.bak b/storage/vector.go.bak new file mode 100644 index 0000000..f663beb --- /dev/null +++ b/storage/vector.go.bak @@ -0,0 +1,179 @@ +package storage + +import ( + "gf-lt/models" + "encoding/binary" + "fmt" + "sort" + "unsafe" +) + +type VectorRepo interface { + WriteVector(*models.VectorRow) error + SearchClosest(q []float32) ([]models.VectorRow, error) + ListFiles() ([]string, error) + RemoveEmbByFileName(filename string) error +} + +// SerializeVector converts []float32 to binary blob +func SerializeVector(vec []float32) []byte { + buf := make([]byte, len(vec)*4) // 4 bytes per float32 + for i, v := range vec { + binary.LittleEndian.PutUint32(buf[i*4:], mathFloat32bits(v)) + } + return buf +} + +// DeserializeVector converts binary blob back to []float32 +func DeserializeVector(data []byte) []float32 { + count := len(data) / 4 + vec := make([]float32, count) + for i := 0; i < count; i++ { + vec[i] = mathBitsToFloat32(binary.LittleEndian.Uint32(data[i*4:])) + } + return vec +} + +// mathFloat32bits and mathBitsToFloat32 are helpers to convert between float32 and uint32 +func mathFloat32bits(f float32) uint32 { + return binary.LittleEndian.Uint32((*(*[4]byte)(unsafe.Pointer(&f)))[:4]) +} + +func mathBitsToFloat32(b uint32) float32 { + return *(*float32)(unsafe.Pointer(&b)) +} + +var ( + vecTableName5120 = "embeddings_5120" + vecTableName384 = "embeddings_384" +) + +func fetchTableName(emb []float32) (string, error) { + switch len(emb) { + case 5120: + return vecTableName5120, nil + case 384: + return vecTableName384, nil + default: + return "", fmt.Errorf("no table for the size of %d", len(emb)) + } +} + +func (p ProviderSQL) WriteVector(row *models.VectorRow) error { + tableName, err := fetchTableName(row.Embeddings) + if err != nil { + return err + } + stmt, _, err := p.s3Conn.Prepare( + fmt.Sprintf("INSERT INTO %s(embedding, slug, raw_text, filename) VALUES (?, ?, ?, ?)", tableName)) + if err != nil { + p.logger.Error("failed to prep a stmt", "error", err) + return err + } + defer stmt.Close() + serializedEmbeddings := SerializeVector(row.Embeddings) + if err := stmt.BindBlob(1, serializedEmbeddings); err != nil { + p.logger.Error("failed to bind", "error", err) + return err + } + if err := stmt.BindText(2, row.Slug); err != nil { + p.logger.Error("failed to bind", "error", err) + return err + } + if err := stmt.BindText(3, row.RawText); err != nil { + p.logger.Error("failed to bind", "error", err) + return err + } + if err := stmt.BindText(4, row.FileName); err != nil { + p.logger.Error("failed to bind", "error", err) + return err + } + err = stmt.Exec() + if err != nil { + return err + } + return nil +} + +func decodeUnsafe(bs []byte) []float32 { + return unsafe.Slice((*float32)(unsafe.Pointer(&bs[0])), len(bs)/4) +} + +func (p ProviderSQL) SearchClosest(q []float32) ([]models.VectorRow, error) { + tableName, err := fetchTableName(q) + if err != nil { + return nil, err + } + stmt, _, err := p.s3Conn.Prepare( + fmt.Sprintf(`SELECT + distance, + embedding, + slug, + raw_text, + filename + FROM %s + WHERE embedding MATCH ? + ORDER BY distance + LIMIT 3 + `, tableName)) + if err != nil { + return nil, err + } + // This function needs to be completely rewritten to use the new binary storage approach + if err != nil { + return nil, err + } + if err := stmt.BindBlob(1, query); err != nil { + p.logger.Error("failed to bind", "error", err) + return nil, err + } + resp := []models.VectorRow{} + for stmt.Step() { + res := models.VectorRow{} + res.Distance = float32(stmt.ColumnFloat(0)) + emb := stmt.ColumnRawText(1) + res.Embeddings = decodeUnsafe(emb) + res.Slug = stmt.ColumnText(2) + res.RawText = stmt.ColumnText(3) + res.FileName = stmt.ColumnText(4) + resp = append(resp, res) + } + if err := stmt.Err(); err != nil { + return nil, err + } + err = stmt.Close() + if err != nil { + return nil, err + } + return resp, nil +} + +func (p ProviderSQL) ListFiles() ([]string, error) { + q := fmt.Sprintf("SELECT filename FROM %s GROUP BY filename", vecTableName384) + stmt, _, err := p.s3Conn.Prepare(q) + if err != nil { + return nil, err + } + defer stmt.Close() + resp := []string{} + for stmt.Step() { + resp = append(resp, stmt.ColumnText(0)) + } + if err := stmt.Err(); err != nil { + return nil, err + } + return resp, nil +} + +func (p ProviderSQL) RemoveEmbByFileName(filename string) error { + q := fmt.Sprintf("DELETE FROM %s WHERE filename = ?", vecTableName384) + stmt, _, err := p.s3Conn.Prepare(q) + if err != nil { + return err + } + defer stmt.Close() + if err := stmt.BindText(1, filename); err != nil { + return err + } + return stmt.Exec() +} |