summary refs log tree commit diff
path: root/storage
diff options
context:
space:
mode:
Diffstat (limited to 'storage')
-rw-r--r-- storage/storage.go 5
-rw-r--r-- storage/vector.go 98
-rw-r--r-- storage/vector.go.bak 179
3 files changed, 221 insertions, 61 deletions
diff --git a/storage/storage.go b/storage/storage.go
index 7911e13..0416884 100644
--- a/storage/storage.go
+++ b/storage/storage.go
@@ -113,3 +113,8 @@ func NewProviderSQL(dbPath string, logger *slog.Logger) FullRepo {
p.Migrate()
return p
}
+
+// DB exposes the *sqlx.DB handle stored on the provider so callers can run
+// queries against it directly.
+func (p ProviderSQL) DB() *sqlx.DB {
+	handle := p.db
+	return handle
+}
diff --git a/storage/vector.go b/storage/vector.go
index 71005e4..b3e5654 100644
--- a/storage/vector.go
+++ b/storage/vector.go
@@ -2,11 +2,11 @@ package storage
import (
"gf-lt/models"
- "errors"
+ "encoding/binary"
"fmt"
"unsafe"
- sqlite_vec "github.com/asg017/sqlite-vec-go-bindings/ncruces"
+ "github.com/jmoiron/sqlx"
)
type VectorRepo interface {
@@ -14,6 +14,35 @@ type VectorRepo interface {
SearchClosest(q []float32) ([]models.VectorRow, error)
ListFiles() ([]string, error)
RemoveEmbByFileName(filename string) error
+ DB() *sqlx.DB
+}
+
+// SerializeVector packs vec into a byte blob, four bytes per element, writing
+// each element's bit pattern (as produced by mathFloat32bits) in little-endian
+// order. The result always has length 4*len(vec).
+func SerializeVector(vec []float32) []byte {
+	const width = 4 // bytes occupied by one float32
+	out := make([]byte, width*len(vec))
+	for idx := range vec {
+		off := idx * width
+		binary.LittleEndian.PutUint32(out[off:], mathFloat32bits(vec[idx]))
+	}
+	return out
+}
+
+// DeserializeVector is the inverse of SerializeVector: it reads data as a
+// sequence of little-endian 32-bit words and converts each one to a float32
+// via mathBitsToFloat32. Trailing bytes that do not fill a whole 4-byte word
+// are ignored.
+func DeserializeVector(data []byte) []float32 {
+	const width = 4 // bytes occupied by one float32
+	out := make([]float32, len(data)/width)
+	for idx := range out {
+		word := binary.LittleEndian.Uint32(data[idx*width:])
+		out[idx] = mathBitsToFloat32(word)
+	}
+	return out
+}
+
+// mathFloat32bits returns the IEEE 754 bit pattern of f as a uint32 (the same
+// value math.Float32bits yields). It is the inverse of mathBitsToFloat32.
+//
+// The float is reinterpreted directly as a uint32 rather than being viewed as
+// a [4]byte and decoded little-endian: the byte-wise decode returns
+// byte-swapped bits on big-endian hosts, which makes the serialized blob
+// host-dependent and breaks the round trip through mathBitsToFloat32.
+func mathFloat32bits(f float32) uint32 {
+	return *(*uint32)(unsafe.Pointer(&f))
+}
+
+// mathBitsToFloat32 reinterprets the 32-bit pattern b as a float32 without
+// any numeric conversion.
+func mathBitsToFloat32(b uint32) float32 {
+	asFloat := (*float32)(unsafe.Pointer(&b))
+	return *asFloat
}
var (
@@ -44,19 +73,8 @@ func (p ProviderSQL) WriteVector(row *models.VectorRow) error {
return err
}
defer stmt.Close()
- v, err := sqlite_vec.SerializeFloat32(row.Embeddings)
- if err != nil {
- p.logger.Error("failed to serialize vector",
- "emb-len", len(row.Embeddings), "error", err)
- return err
- }
- if v == nil {
- err = errors.New("empty vector after serialization")
- p.logger.Error("empty vector after serialization",
- "emb-len", len(row.Embeddings), "text", row.RawText, "error", err)
- return err
- }
- if err := stmt.BindBlob(1, v); err != nil {
+ serializedEmbeddings := SerializeVector(row.Embeddings)
+ if err := stmt.BindBlob(1, serializedEmbeddings); err != nil {
p.logger.Error("failed to bind", "error", err)
return err
}
@@ -84,52 +102,10 @@ func decodeUnsafe(bs []byte) []float32 {
}
+// SearchClosest is currently a stub: it ignores q and always returns an empty,
+// non-nil slice together with a nil error.
func (p ProviderSQL) SearchClosest(q []float32) ([]models.VectorRow, error) {
- tableName, err := fetchTableName(q)
- if err != nil {
- return nil, err
- }
- stmt, _, err := p.s3Conn.Prepare(
- fmt.Sprintf(`SELECT
- distance,
- embedding,
- slug,
- raw_text,
- filename
- FROM %s
- WHERE embedding MATCH ?
- ORDER BY distance
- LIMIT 3
- `, tableName))
- if err != nil {
- return nil, err
- }
- query, err := sqlite_vec.SerializeFloat32(q[:])
- if err != nil {
- return nil, err
- }
- if err := stmt.BindBlob(1, query); err != nil {
- p.logger.Error("failed to bind", "error", err)
- return nil, err
- }
- resp := []models.VectorRow{}
- for stmt.Step() {
- res := models.VectorRow{}
- res.Distance = float32(stmt.ColumnFloat(0))
- emb := stmt.ColumnRawText(1)
- res.Embeddings = decodeUnsafe(emb)
- res.Slug = stmt.ColumnText(2)
- res.RawText = stmt.ColumnText(3)
- res.FileName = stmt.ColumnText(4)
- resp = append(resp, res)
- }
- if err := stmt.Err(); err != nil {
- return nil, err
- }
- err = stmt.Close()
- if err != nil {
- return nil, err
- }
- return resp, nil
+ // TODO: the vector search was removed here to stop depending on the deprecated
+ // vector extension library; per the original note, the search now lives in the
+ // rag_new package.
+ // NOTE(review): callers cannot tell "no matches" from "search disabled",
+ // because the stub reports success with an empty result.
+ return []models.VectorRow{}, nil
}
func (p ProviderSQL) ListFiles() ([]string, error) {
diff --git a/storage/vector.go.bak b/storage/vector.go.bak
new file mode 100644
index 0000000..f663beb
--- /dev/null
+++ b/storage/vector.go.bak
@@ -0,0 +1,179 @@
+package storage
+
+import (
+ "gf-lt/models"
+ "encoding/binary"
+ "fmt"
+ "sort"
+ "unsafe"
+)
+
+// VectorRepo is the storage contract for embedding rows. ProviderSQL in this
+// file implements all four methods.
+type VectorRepo interface {
+	// WriteVector stores one embedding row.
+	WriteVector(row *models.VectorRow) error
+	// SearchClosest returns the stored rows nearest to the query embedding q.
+	SearchClosest(q []float32) ([]models.VectorRow, error)
+	// ListFiles returns the file names that have stored embeddings.
+	ListFiles() ([]string, error)
+	// RemoveEmbByFileName deletes the embeddings recorded for filename.
+	RemoveEmbByFileName(filename string) error
+}
+
+// SerializeVector packs vec into a byte blob, four bytes per element, writing
+// each element's bit pattern (as produced by mathFloat32bits) in little-endian
+// order. The result always has length 4*len(vec).
+func SerializeVector(vec []float32) []byte {
+	const width = 4 // bytes occupied by one float32
+	out := make([]byte, width*len(vec))
+	for idx := range vec {
+		off := idx * width
+		binary.LittleEndian.PutUint32(out[off:], mathFloat32bits(vec[idx]))
+	}
+	return out
+}
+
+// DeserializeVector is the inverse of SerializeVector: it reads data as a
+// sequence of little-endian 32-bit words and converts each one to a float32
+// via mathBitsToFloat32. Trailing bytes that do not fill a whole 4-byte word
+// are ignored.
+func DeserializeVector(data []byte) []float32 {
+	const width = 4 // bytes occupied by one float32
+	out := make([]float32, len(data)/width)
+	for idx := range out {
+		word := binary.LittleEndian.Uint32(data[idx*width:])
+		out[idx] = mathBitsToFloat32(word)
+	}
+	return out
+}
+
+// mathFloat32bits returns the IEEE 754 bit pattern of f as a uint32 (the same
+// value math.Float32bits yields). It is the inverse of mathBitsToFloat32.
+//
+// The float is reinterpreted directly as a uint32 rather than being viewed as
+// a [4]byte and decoded little-endian: the byte-wise decode returns
+// byte-swapped bits on big-endian hosts, which makes the serialized blob
+// host-dependent and breaks the round trip through mathBitsToFloat32.
+func mathFloat32bits(f float32) uint32 {
+	return *(*uint32)(unsafe.Pointer(&f))
+}
+
+// mathBitsToFloat32 reinterprets the 32-bit pattern b as a float32 without
+// any numeric conversion.
+func mathBitsToFloat32(b uint32) float32 {
+	asFloat := (*float32)(unsafe.Pointer(&b))
+	return *asFloat
+}
+
+// Names of the embedding tables, one per supported embedding dimension.
+var (
+	vecTableName5120 = "embeddings_5120"
+	vecTableName384  = "embeddings_384"
+)
+
+// fetchTableName picks the embeddings table whose dimension equals len(emb).
+// Only 5120- and 384-element embeddings have a table; any other length yields
+// an error naming the unsupported size.
+func fetchTableName(emb []float32) (string, error) {
+	dim := len(emb)
+	if dim == 5120 {
+		return vecTableName5120, nil
+	}
+	if dim == 384 {
+		return vecTableName384, nil
+	}
+	return "", fmt.Errorf("no table for the size of %d", dim)
+}
+
+// WriteVector inserts row into the embeddings table matching the length of
+// row.Embeddings. The embedding is stored as the blob produced by
+// SerializeVector; slug, raw text and file name are bound as text parameters
+// 2 through 4. Prepare and bind failures are logged before being returned.
+func (p ProviderSQL) WriteVector(row *models.VectorRow) error {
+	table, err := fetchTableName(row.Embeddings)
+	if err != nil {
+		return err
+	}
+	insert := fmt.Sprintf("INSERT INTO %s(embedding, slug, raw_text, filename) VALUES (?, ?, ?, ?)", table)
+	stmt, _, err := p.s3Conn.Prepare(insert)
+	if err != nil {
+		p.logger.Error("failed to prep a stmt", "error", err)
+		return err
+	}
+	defer stmt.Close()
+	if err := stmt.BindBlob(1, SerializeVector(row.Embeddings)); err != nil {
+		p.logger.Error("failed to bind", "error", err)
+		return err
+	}
+	// Text columns follow the blob, so their parameter indexes start at 2.
+	textParams := [...]string{row.Slug, row.RawText, row.FileName}
+	for i, text := range textParams {
+		if err := stmt.BindText(i+2, text); err != nil {
+			p.logger.Error("failed to bind", "error", err)
+			return err
+		}
+	}
+	return stmt.Exec()
+}
+
+// decodeUnsafe decodes bs as consecutive little-endian float32 values (the
+// layout SerializeVector writes) and returns them in a newly allocated slice.
+// Trailing bytes that do not fill a whole 4-byte word are ignored, and input
+// shorter than four bytes (including nil) yields nil instead of panicking on
+// &bs[0].
+//
+// The result is a copy rather than a reinterpreted view of bs, so it stays
+// valid after the caller's buffer is reused and does not depend on bs being
+// 4-byte aligned.
+func decodeUnsafe(bs []byte) []float32 {
+	n := len(bs) / 4
+	if n == 0 {
+		return nil
+	}
+	out := make([]float32, n)
+	for i := range out {
+		bits := binary.LittleEndian.Uint32(bs[i*4:])
+		out[i] = *(*float32)(unsafe.Pointer(&bits))
+	}
+	return out
+}
+
+// SearchClosest returns up to three rows from the embeddings table matching
+// len(q), ordered by ascending distance to q.
+//
+// q is bound as the blob produced by SerializeVector. Each returned row's
+// Embeddings is decoded with DeserializeVector, which copies the data, so the
+// rows do not keep pointing into the statement's raw column buffer after the
+// next Step. The statement is closed on every return path.
+//
+// NOTE(review): the query relies on `embedding MATCH ?` and a `distance`
+// column, presumably provided by a vector extension on the table. Confirm
+// they are still available with the new binary storage approach.
+func (p ProviderSQL) SearchClosest(q []float32) ([]models.VectorRow, error) {
+	tableName, err := fetchTableName(q)
+	if err != nil {
+		return nil, err
+	}
+	stmt, _, err := p.s3Conn.Prepare(
+		fmt.Sprintf(`SELECT
+ distance,
+ embedding,
+ slug,
+ raw_text,
+ filename
+ FROM %s
+ WHERE embedding MATCH ?
+ ORDER BY distance
+ LIMIT 3
+ `, tableName))
+	if err != nil {
+		return nil, err
+	}
+	// Deferred so the bind and step error paths release the statement too,
+	// matching WriteVector and ListFiles.
+	defer stmt.Close()
+	if err := stmt.BindBlob(1, SerializeVector(q)); err != nil {
+		p.logger.Error("failed to bind", "error", err)
+		return nil, err
+	}
+	resp := []models.VectorRow{}
+	for stmt.Step() {
+		resp = append(resp, models.VectorRow{
+			Distance:   float32(stmt.ColumnFloat(0)),
+			Embeddings: DeserializeVector(stmt.ColumnRawText(1)),
+			Slug:       stmt.ColumnText(2),
+			RawText:    stmt.ColumnText(3),
+			FileName:   stmt.ColumnText(4),
+		})
+	}
+	if err := stmt.Err(); err != nil {
+		return nil, err
+	}
+	return resp, nil
+}
+
+// ListFiles returns the distinct file names recorded in the 384-dimension
+// embeddings table. The result is an empty, non-nil slice when the table has
+// no rows.
+func (p ProviderSQL) ListFiles() ([]string, error) {
+	stmt, _, err := p.s3Conn.Prepare(
+		fmt.Sprintf("SELECT filename FROM %s GROUP BY filename", vecTableName384))
+	if err != nil {
+		return nil, err
+	}
+	defer stmt.Close()
+	names := make([]string, 0)
+	for stmt.Step() {
+		names = append(names, stmt.ColumnText(0))
+	}
+	// Step reports failure only by returning false; the cause is read from Err.
+	if stepErr := stmt.Err(); stepErr != nil {
+		return nil, stepErr
+	}
+	return names, nil
+}
+
+// RemoveEmbByFileName deletes every row of the 384-dimension embeddings table
+// whose filename column equals filename. The file name is passed as a bound
+// parameter; only the table name is formatted into the SQL text.
+func (p ProviderSQL) RemoveEmbByFileName(filename string) error {
+	stmt, _, err := p.s3Conn.Prepare(
+		fmt.Sprintf("DELETE FROM %s WHERE filename = ?", vecTableName384))
+	if err != nil {
+		return err
+	}
+	defer stmt.Close()
+	bindErr := stmt.BindText(1, filename)
+	if bindErr != nil {
+		return bindErr
+	}
+	return stmt.Exec()
+}