From fbc955ca37836553ef4b7c365b84e3dfa859c501 Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Thu, 5 Mar 2026 14:13:58 +0300 Subject: Enha: local onnx --- rag/embedder.go | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 5 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 1d29877..386d508 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -9,6 +9,10 @@ import ( "gf-lt/models" "log/slog" "net/http" + + "github.com/takara-ai/go-tokenizers/tokenizers" + + "github.com/yalue/onnxruntime_go" ) // Embedder defines the interface for embedding text @@ -134,11 +138,62 @@ func (a *APIEmbedder) EmbedSlice(lines []string) ([][]float32, error) { return embeddings, nil } -// TODO: ONNXEmbedder implementation would go here -// This would require: // 1. Loading ONNX models locally // 2. Using a Go ONNX runtime (like gorgonia/onnx or similar) // 3. Converting text to embeddings without external API calls -// -// For now, we'll focus on the API implementation which is already working in the current system, -// and can be extended later when we have ONNX runtime integration + +type ONNXEmbedder struct { + session *onnxruntime_go.DynamicAdvancedSession + tokenizer *tokenizers.Tokenizer + dims int // 768, 512, 256, or 128 for Matryoshka +} + +func (e *ONNXEmbedder) EmbedSlice(texts []string) ([][]float32, error) { + // Batch processing + inputs := e.prepareBatch(texts) + outputs := make([][]float32, len(texts)) + + // Run batch inference (much faster) + err := e.session.Run(inputs, outputs) + return outputs, err +} + +func NewONNXEmbedder(modelPath string) (*ONNXEmbedder, error) { + // Load ONNX model + session, err := onnxruntime_go.NewDynamicAdvancedSession( + modelPath, // onnx/embedgemma/model_q4.onnx + []string{"input_ids", "attention_mask"}, + []string{"sentence_embedding"}, + nil, + ) + if err != nil { + return nil, err + } + // Load tokenizer (from Hugging Face) + tokenizer, err := tokenizers.FromFile("./tokenizer.json") + return &ONNXEmbedder{ + session: session, + tokenizer: tokenizer, + }, nil +} + +func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { + // Tokenize + tokens := e.tokenizer.Encode(text, true) + // Prepare inputs + inputIDs := []int64{tokens.GetIds()} + attentionMask := []int64{tokens.GetAttentionMask()} + // Run inference + output := onnxruntime_go.NewEmptyTensor[float32]( + onnxruntime_go.NewShape(1, 768), + ) + err := e.session.Run( + map[string]any{ + "input_ids": inputIDs, + "attention_mask": attentionMask, + }, + []string{"sentence_embedding"}, + []any{&output}, + ) + return output.GetData(), nil +} -- cgit v1.2.3 From 7c56e27dbe904b3c08b3eee375542011458e297c Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Thu, 5 Mar 2026 14:27:19 +0300 Subject: Dep: trying sugarme tokenizer --- rag/embedder.go | 181 +++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 145 insertions(+), 36 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 386d508..396f04b 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -10,8 +10,8 @@ import ( "log/slog" "net/http" - "github.com/takara-ai/go-tokenizers/tokenizers" - + "github.com/sugarme/tokenizer" + "github.com/sugarme/tokenizer/pretrained" "github.com/yalue/onnxruntime_go" ) @@ -141,59 +141,168 @@ func (a *APIEmbedder) EmbedSlice(lines []string) ([][]float32, error) { // 1. Loading ONNX models locally // 2. Using a Go ONNX runtime (like gorgonia/onnx or similar) // 3. Converting text to embeddings without external API calls - type ONNXEmbedder struct { session *onnxruntime_go.DynamicAdvancedSession - tokenizer *tokenizers.Tokenizer - dims int // 768, 512, 256, or 128 for Matryoshka + tokenizer *tokenizer.Tokenizer + dims int // embedding dimension (e.g., 768) + logger *slog.Logger } -func (e *ONNXEmbedder) EmbedSlice(texts []string) ([][]float32, error) { - // Batch processing - inputs := e.prepareBatch(texts) - outputs := make([][]float32, len(texts)) - - // Run batch inference (much faster) - err := e.session.Run(inputs, outputs) - return outputs, err -} - -func NewONNXEmbedder(modelPath string) (*ONNXEmbedder, error) { - // Load ONNX model +func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Logger) (*ONNXEmbedder, error) { + // Load tokenizer using sugarme/tokenizer + tok, err := pretrained.FromFile(tokenizerPath) + if err != nil { + return nil, fmt.Errorf("failed to load tokenizer: %w", err) + } + // Create ONNX session session, err := onnxruntime_go.NewDynamicAdvancedSession( modelPath, // onnx/embedgemma/model_q4.onnx []string{"input_ids", "attention_mask"}, []string{"sentence_embedding"}, - nil, + nil, // optional options ) if err != nil { - return nil, err + return nil, fmt.Errorf("failed to create ONNX session: %w", err) } - // Load tokenizer (from Hugging Face) - tokenizer, err := tokenizers.FromFile("./tokenizer.json") return &ONNXEmbedder{ session: session, - tokenizer: tokenizer, + tokenizer: tok, + dims: dims, + logger: logger, }, nil } func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { - // Tokenize - tokens := e.tokenizer.Encode(text, true) - // Prepare inputs - inputIDs := []int64{tokens.GetIds()} - attentionMask := []int64{tokens.GetAttentionMask()} - // Run inference - output := onnxruntime_go.NewEmptyTensor[float32]( - onnxruntime_go.NewShape(1, 768), + // 1. Tokenize + encoding, err := e.tokenizer.Encode(text, true) // true = add special tokens + if err != nil { + return nil, fmt.Errorf("tokenization failed: %w", err) + } + // Convert []int32 to []int64 for ONNX + inputIDs := make([]int64, len(encoding.GetIDs())) + for i, id := range encoding.GetIDs() { + inputIDs[i] = int64(id) + } + attentionMask := make([]int64, len(encoding.GetAttentionMask())) + for i, m := range encoding.GetAttentionMask() { + attentionMask[i] = int64(m) + } + // 2. Create input tensors (shape: [1, seq_len]) + seqLen := int64(len(inputIDs)) + inputIDsTensor, err := onnxruntime_go.NewTensor(onnxruntime_go.NewShape(1, seqLen), inputIDs) + if err != nil { + return nil, fmt.Errorf("failed to create input_ids tensor: %w", err) + } + defer inputIDsTensor.Destroy() + maskTensor, err := onnxruntime_go.NewTensor(onnxruntime_go.NewShape(1, seqLen), attentionMask) + if err != nil { + return nil, fmt.Errorf("failed to create attention_mask tensor: %w", err) + } + defer maskTensor.Destroy() + // 3. Create output tensor (shape: [1, dims]) + outputTensor, err := onnxruntime_go.NewEmptyTensor[float32](onnxruntime_go.NewShape(1, int64(e.dims))) + if err != nil { + return nil, fmt.Errorf("failed to create output tensor: %w", err) + } + defer outputTensor.Destroy() + // 4. Run inference + err = e.session.Run( + map[string]*onnxruntime_go.Tensor{ + "input_ids": inputIDsTensor, + "attention_mask": maskTensor, + }, + []string{"sentence_embedding"}, + []*onnxruntime_go.Tensor{outputTensor}, ) - err := e.session.Run( - map[string]any{ - "input_ids": inputIDs, - "attention_mask": attentionMask, + if err != nil { + return nil, fmt.Errorf("inference failed: %w", err) + } + // 5. Extract data + outputData := outputTensor.GetData() + // outputTensor is owned by us, but GetData returns a slice that remains valid until Destroy. + // We need to copy if we want to keep it after Destroy (we defer Destroy, so copy now). + embedding := make([]float32, len(outputData)) + copy(embedding, outputData) + return embedding, nil +} + +// EmbedSlice (batch) – to be implemented properly +func (e *ONNXEmbedder) EmbedSlice(texts []string) ([][]float32, error) { + if len(texts) == 0 { + return nil, nil + } + // 1. Tokenize all texts and find max length for padding + encodings := make([]*tokenizer.Encoding, len(texts)) + maxLen := 0 + for i, txt := range texts { + enc, err := e.tokenizer.Encode(txt, true) + if err != nil { + return nil, fmt.Errorf("tokenization failed at index %d: %w", i, err) + } + encodings[i] = enc + if l := len(enc.GetIDs()); l > maxLen { + maxLen = l + } + } + // 2. Build padded input_ids and attention_mask (shape: [batch, maxLen]) + batchSize := len(texts) + inputIDs := make([]int64, batchSize*maxLen) + attentionMask := make([]int64, batchSize*maxLen) + for i, enc := range encodings { + ids := enc.GetIDs() + mask := enc.GetAttentionMask() + offset := i * maxLen + // copy actual tokens + for j := 0; j < len(ids); j++ { + inputIDs[offset+j] = int64(ids[j]) + attentionMask[offset+j] = int64(mask[j]) + } + // remaining positions (padding) are already zero-initialized + } + // 3. Create tensors + inputIDsTensor, err := onnxruntime_go.NewTensor( + onnxruntime_go.NewShape(int64(batchSize), int64(maxLen)), + inputIDs, + ) + if err != nil { + return nil, err + } + defer inputIDsTensor.Destroy() + maskTensor, err := onnxruntime_go.NewTensor( + onnxruntime_go.NewShape(int64(batchSize), int64(maxLen)), + attentionMask, + ) + if err != nil { + return nil, err + } + defer maskTensor.Destroy() + outputTensor, err := onnxruntime_go.NewEmptyTensor[float32]( + onnxruntime_go.NewShape(int64(batchSize), int64(e.dims)), + ) + if err != nil { + return nil, err + } + defer outputTensor.Destroy() + // 4. Run + err = e.session.Run( + map[string]*onnxruntime_go.Tensor{ + "input_ids": inputIDsTensor, + "attention_mask": maskTensor, }, []string{"sentence_embedding"}, - []any{&output}, + []*onnxruntime_go.Tensor{outputTensor}, ) - return output.GetData(), nil + if err != nil { + return nil, err + } + // 5. Extract batch results + outputData := outputTensor.GetData() + embeddings := make([][]float32, batchSize) + for i := 0; i < batchSize; i++ { + start := i * e.dims + emb := make([]float32, e.dims) + copy(emb, outputData[start:start+e.dims]) + embeddings[i] = emb + } + return embeddings, nil } -- cgit v1.2.3 From 4bd6883966824cff81b86e8bf79e278165d7d24a Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Thu, 5 Mar 2026 14:38:26 +0300 Subject: WIP --- rag/embedder.go | 105 +++++++++++++++++++++++--------------------------------- 1 file changed, 43 insertions(+), 62 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 396f04b..988d91e 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -174,134 +174,115 @@ func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Log func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { // 1. Tokenize - encoding, err := e.tokenizer.Encode(text, true) // true = add special tokens + encoding, err := e.tokenizer.EncodeSingle(text) if err != nil { return nil, fmt.Errorf("tokenization failed: %w", err) } - // Convert []int32 to []int64 for ONNX - inputIDs := make([]int64, len(encoding.GetIDs())) - for i, id := range encoding.GetIDs() { + // 2. Convert to int64 and create attention mask + ids := encoding.Ids + inputIDs := make([]int64, len(ids)) + attentionMask := make([]int64, len(ids)) + for i, id := range ids { inputIDs[i] = int64(id) + attentionMask[i] = 1 } - attentionMask := make([]int64, len(encoding.GetAttentionMask())) - for i, m := range encoding.GetAttentionMask() { - attentionMask[i] = int64(m) - } - // 2. Create input tensors (shape: [1, seq_len]) + // 3. Create input tensors (shape: [1, seq_len]) seqLen := int64(len(inputIDs)) - inputIDsTensor, err := onnxruntime_go.NewTensor(onnxruntime_go.NewShape(1, seqLen), inputIDs) + inputIDsTensor, err := onnxruntime_go.NewTensor[int64]( + onnxruntime_go.NewShape(1, seqLen), + inputIDs, + ) if err != nil { return nil, fmt.Errorf("failed to create input_ids tensor: %w", err) } defer inputIDsTensor.Destroy() - maskTensor, err := onnxruntime_go.NewTensor(onnxruntime_go.NewShape(1, seqLen), attentionMask) + maskTensor, err := onnxruntime_go.NewTensor[int64]( + onnxruntime_go.NewShape(1, seqLen), + attentionMask, + ) if err != nil { return nil, fmt.Errorf("failed to create attention_mask tensor: %w", err) } defer maskTensor.Destroy() - // 3. Create output tensor (shape: [1, dims]) - outputTensor, err := onnxruntime_go.NewEmptyTensor[float32](onnxruntime_go.NewShape(1, int64(e.dims))) + // 4. Create output tensor + outputTensor, err := onnxruntime_go.NewEmptyTensor[float32]( + onnxruntime_go.NewShape(1, int64(e.dims)), + ) if err != nil { return nil, fmt.Errorf("failed to create output tensor: %w", err) } defer outputTensor.Destroy() - // 4. Run inference + // 5. Run inference err = e.session.Run( - map[string]*onnxruntime_go.Tensor{ - "input_ids": inputIDsTensor, - "attention_mask": maskTensor, - }, + []onnxruntime_go.Value{inputIDsTensor, maskTensor}, []string{"sentence_embedding"}, - []*onnxruntime_go.Tensor{outputTensor}, + []onnxruntime_go.Value{outputTensor}, ) if err != nil { return nil, fmt.Errorf("inference failed: %w", err) } - // 5. Extract data + // 6. Copy output data outputData := outputTensor.GetData() - // outputTensor is owned by us, but GetData returns a slice that remains valid until Destroy. - // We need to copy if we want to keep it after Destroy (we defer Destroy, so copy now). embedding := make([]float32, len(outputData)) copy(embedding, outputData) return embedding, nil } -// EmbedSlice (batch) – to be implemented properly func (e *ONNXEmbedder) EmbedSlice(texts []string) ([][]float32, error) { - if len(texts) == 0 { - return nil, nil - } - // 1. Tokenize all texts and find max length for padding encodings := make([]*tokenizer.Encoding, len(texts)) maxLen := 0 for i, txt := range texts { - enc, err := e.tokenizer.Encode(txt, true) + enc, err := e.tokenizer.EncodeSingle(txt) if err != nil { - return nil, fmt.Errorf("tokenization failed at index %d: %w", i, err) + return nil, err } encodings[i] = enc - if l := len(enc.GetIDs()); l > maxLen { + if l := len(enc.Ids); l > maxLen { maxLen = l } } - // 2. Build padded input_ids and attention_mask (shape: [batch, maxLen]) batchSize := len(texts) inputIDs := make([]int64, batchSize*maxLen) attentionMask := make([]int64, batchSize*maxLen) for i, enc := range encodings { - ids := enc.GetIDs() - mask := enc.GetAttentionMask() + ids := enc.Ids offset := i * maxLen - // copy actual tokens - for j := 0; j < len(ids); j++ { - inputIDs[offset+j] = int64(ids[j]) - attentionMask[offset+j] = int64(mask[j]) + for j, id := range ids { + inputIDs[offset+j] = int64(id) + attentionMask[offset+j] = 1 } - // remaining positions (padding) are already zero-initialized + // Remaining positions are already zero (padding) } - // 3. Create tensors - inputIDsTensor, err := onnxruntime_go.NewTensor( + // Create tensors with shape [batchSize, maxLen] + inputTensor, _ := onnxruntime_go.NewTensor[int64]( onnxruntime_go.NewShape(int64(batchSize), int64(maxLen)), inputIDs, ) - if err != nil { - return nil, err - } - defer inputIDsTensor.Destroy() - maskTensor, err := onnxruntime_go.NewTensor( + defer inputTensor.Destroy() + maskTensor, _ := onnxruntime_go.NewTensor[int64]( onnxruntime_go.NewShape(int64(batchSize), int64(maxLen)), attentionMask, ) - if err != nil { - return nil, err - } defer maskTensor.Destroy() - outputTensor, err := onnxruntime_go.NewEmptyTensor[float32]( + outputTensor, _ := onnxruntime_go.NewEmptyTensor[float32]( onnxruntime_go.NewShape(int64(batchSize), int64(e.dims)), ) - if err != nil { - return nil, err - } defer outputTensor.Destroy() - // 4. Run - err = e.session.Run( - map[string]*onnxruntime_go.Tensor{ - "input_ids": inputIDsTensor, - "attention_mask": maskTensor, - }, + err := e.session.Run( + []onnxruntime_go.Value{inputTensor, maskTensor}, []string{"sentence_embedding"}, - []*onnxruntime_go.Tensor{outputTensor}, + []onnxruntime_go.Value{outputTensor}, ) if err != nil { return nil, err } - // 5. Extract batch results - outputData := outputTensor.GetData() + // Extract embeddings per batch item + data := outputTensor.GetData() embeddings := make([][]float32, batchSize) for i := 0; i < batchSize; i++ { start := i * e.dims emb := make([]float32, e.dims) - copy(emb, outputData[start:start+e.dims]) + copy(emb, data[start:start+e.dims]) embeddings[i] = emb } return embeddings, nil -- cgit v1.2.3 From c2757653a3429ab3f9e76081328a3877bc11ed4d Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Thu, 5 Mar 2026 14:49:59 +0300 Subject: Fix: buildable --- rag/embedder.go | 2 -- 1 file changed, 2 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 988d91e..6903a5d 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -215,7 +215,6 @@ func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { // 5. Run inference err = e.session.Run( []onnxruntime_go.Value{inputIDsTensor, maskTensor}, - []string{"sentence_embedding"}, []onnxruntime_go.Value{outputTensor}, ) if err != nil { @@ -270,7 +269,6 @@ func (e *ONNXEmbedder) EmbedSlice(texts []string) ([][]float32, error) { defer outputTensor.Destroy() err := e.session.Run( []onnxruntime_go.Value{inputTensor, maskTensor}, - []string{"sentence_embedding"}, []onnxruntime_go.Value{outputTensor}, ) if err != nil { -- cgit v1.2.3 From ac8c8bb0558a00cf0d025ab8522aaa57b8cba7de Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Thu, 5 Mar 2026 19:20:21 +0300 Subject: Enha: onnx config vars --- rag/embedder.go | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 6903a5d..b0a3226 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -9,6 +9,7 @@ import ( "gf-lt/models" "log/slog" "net/http" + "sync" "github.com/sugarme/tokenizer" "github.com/sugarme/tokenizer/pretrained" @@ -148,7 +149,17 @@ type ONNXEmbedder struct { logger *slog.Logger } +var onnxInitOnce sync.Once + func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Logger) (*ONNXEmbedder, error) { + // Initialize ONNX runtime environment once + onnxInitOnce.Do(func() { + onnxruntime_go.SetSharedLibraryPath("/usr/local/lib/libonnxruntime.so") + err := onnxruntime_go.InitializeEnvironment() + if err != nil { + logger.Error("failed to initialize ONNX runtime", "error", err) + } + }) // Load tokenizer using sugarme/tokenizer tok, err := pretrained.FromFile(tokenizerPath) if err != nil { @@ -195,7 +206,7 @@ func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { if err != nil { return nil, fmt.Errorf("failed to create input_ids tensor: %w", err) } - defer inputIDsTensor.Destroy() + defer func() { _ = inputIDsTensor.Destroy() }() maskTensor, err := onnxruntime_go.NewTensor[int64]( onnxruntime_go.NewShape(1, seqLen), attentionMask, @@ -203,7 +214,7 @@ func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { if err != nil { return nil, fmt.Errorf("failed to create attention_mask tensor: %w", err) } - defer maskTensor.Destroy() + defer func() { _ = maskTensor.Destroy() }() // 4. Create output tensor outputTensor, err := onnxruntime_go.NewEmptyTensor[float32]( onnxruntime_go.NewShape(1, int64(e.dims)), @@ -211,7 +222,7 @@ func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { if err != nil { return nil, fmt.Errorf("failed to create output tensor: %w", err) } - defer outputTensor.Destroy() + defer func() { _ = outputTensor.Destroy() }() // 5. Run inference err = e.session.Run( []onnxruntime_go.Value{inputIDsTensor, maskTensor}, @@ -257,16 +268,16 @@ func (e *ONNXEmbedder) EmbedSlice(texts []string) ([][]float32, error) { onnxruntime_go.NewShape(int64(batchSize), int64(maxLen)), inputIDs, ) - defer inputTensor.Destroy() + defer func() { _ = inputTensor.Destroy() }() maskTensor, _ := onnxruntime_go.NewTensor[int64]( onnxruntime_go.NewShape(int64(batchSize), int64(maxLen)), attentionMask, ) - defer maskTensor.Destroy() + defer func() { _ = maskTensor.Destroy() }() outputTensor, _ := onnxruntime_go.NewEmptyTensor[float32]( onnxruntime_go.NewShape(int64(batchSize), int64(e.dims)), ) - defer outputTensor.Destroy() + defer func() { _ = outputTensor.Destroy() }() err := e.session.Run( []onnxruntime_go.Value{inputTensor, maskTensor}, []onnxruntime_go.Value{outputTensor}, -- cgit v1.2.3 From efc92d884c36498220e2b8d5ad9e02f84e42d953 Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Thu, 5 Mar 2026 20:02:46 +0300 Subject: Chore: onnx library lookup --- rag/embedder.go | 113 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 24 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index b0a3226..59dbfd2 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -9,6 +9,7 @@ import ( "gf-lt/models" "log/slog" "net/http" + "os" "sync" "github.com/sugarme/tokenizer" @@ -143,47 +144,111 @@ func (a *APIEmbedder) EmbedSlice(lines []string) ([][]float32, error) { // 2. Using a Go ONNX runtime (like gorgonia/onnx or similar) // 3. Converting text to embeddings without external API calls type ONNXEmbedder struct { - session *onnxruntime_go.DynamicAdvancedSession - tokenizer *tokenizer.Tokenizer - dims int // embedding dimension (e.g., 768) - logger *slog.Logger + session *onnxruntime_go.DynamicAdvancedSession + tokenizer *tokenizer.Tokenizer + tokenizerPath string + dims int + logger *slog.Logger + mu sync.Mutex + modelPath string } var onnxInitOnce sync.Once +var onnxReady bool +var onnxLibPath string + +var onnxLibPaths = []string{ + "/usr/lib/libonnxruntime.so", + "/usr/local/lib/libonnxruntime.so", + "/usr/lib/x86_64-linux-gnu/libonnxruntime.so", + "/opt/onnxruntime/lib/libonnxruntime.so", +} + +func findONNXLibrary() string { + for _, path := range onnxLibPaths { + if _, err := os.Stat(path); err == nil { + return path + } + } + return "" +} func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Logger) (*ONNXEmbedder, error) { - // Initialize ONNX runtime environment once - onnxInitOnce.Do(func() { - onnxruntime_go.SetSharedLibraryPath("/usr/local/lib/libonnxruntime.so") - err := onnxruntime_go.InitializeEnvironment() + // Check if model and tokenizer files exist + if _, err := os.Stat(modelPath); err != nil { + return nil, fmt.Errorf("ONNX model not found: %w", err) + } + if _, err := os.Stat(tokenizerPath); err != nil { + return nil, fmt.Errorf("tokenizer not found: %w", err) + } + + // Find ONNX library + onnxLibPath = findONNXLibrary() + if onnxLibPath == "" { + return nil, errors.New("ONNX runtime library not found in standard locations") + } + + emb := &ONNXEmbedder{ + tokenizerPath: tokenizerPath, + dims: dims, + logger: logger, + modelPath: modelPath, + } + return emb, nil +} + +func (e *ONNXEmbedder) ensureInitialized() error { + if e.session != nil { + return nil + } + e.mu.Lock() + defer e.mu.Unlock() + if e.session != nil { + return nil + } + + // Load tokenizer lazily + if e.tokenizer == nil { + tok, err := pretrained.FromFile(e.tokenizerPath) if err != nil { - logger.Error("failed to initialize ONNX runtime", "error", err) + return fmt.Errorf("failed to load tokenizer: %w", err) + } + e.tokenizer = tok + } + + onnxInitOnce.Do(func() { + onnxruntime_go.SetSharedLibraryPath(onnxLibPath) + if err := onnxruntime_go.InitializeEnvironment(); err != nil { + e.logger.Error("failed to initialize ONNX runtime", "error", err) + onnxReady = false + return } + onnxReady = true }) - // Load tokenizer using sugarme/tokenizer - tok, err := pretrained.FromFile(tokenizerPath) - if err != nil { - return nil, fmt.Errorf("failed to load tokenizer: %w", err) + if !onnxReady { + return errors.New("ONNX runtime not ready") } - // Create ONNX session session, err := onnxruntime_go.NewDynamicAdvancedSession( - modelPath, // onnx/embedgemma/model_q4.onnx + e.getModelPath(), []string{"input_ids", "attention_mask"}, []string{"sentence_embedding"}, - nil, // optional options + nil, ) if err != nil { - return nil, fmt.Errorf("failed to create ONNX session: %w", err) - } - return &ONNXEmbedder{ - session: session, - tokenizer: tok, - dims: dims, - logger: logger, - }, nil + return fmt.Errorf("failed to create ONNX session: %w", err) + } + e.session = session + return nil +} + +func (e *ONNXEmbedder) getModelPath() string { + return e.modelPath } func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { + if err := e.ensureInitialized(); err != nil { + return nil, err + } // 1. Tokenize encoding, err := e.tokenizer.EncodeSingle(text) if err != nil { -- cgit v1.2.3 From d2caebdb4fd3ad148aad20866503b7d46d546404 Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Fri, 6 Mar 2026 09:11:25 +0300 Subject: Enha (onnx): use gpu --- rag/embedder.go | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 59dbfd2..13f6a6e 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -156,14 +156,22 @@ type ONNXEmbedder struct { var onnxInitOnce sync.Once var onnxReady bool var onnxLibPath string +var cudaLibPath string var onnxLibPaths = []string{ "/usr/lib/libonnxruntime.so", + "/usr/lib/libonnxruntime.so.1.24.2", "/usr/local/lib/libonnxruntime.so", "/usr/lib/x86_64-linux-gnu/libonnxruntime.so", "/opt/onnxruntime/lib/libonnxruntime.so", } +var cudaLibPaths = []string{ + "/usr/lib/libonnxruntime_providers_cuda.so", + "/usr/local/lib/libonnxruntime_providers_cuda.so", + "/opt/onnxruntime/lib/libonnxruntime_providers_cuda.so", +} + func findONNXLibrary() string { for _, path := range onnxLibPaths { if _, err := os.Stat(path); err == nil { @@ -173,6 +181,15 @@ func findONNXLibrary() string { return "" } +func findCUDALibrary() string { + for _, path := range cudaLibPaths { + if _, err := os.Stat(path); err == nil { + return path + } + } + return "" +} + func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Logger) (*ONNXEmbedder, error) { // Check if model and tokenizer files exist if _, err := os.Stat(modelPath); err != nil { @@ -188,6 +205,12 @@ func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Log return nil, errors.New("ONNX runtime library not found in standard locations") } + // Find CUDA provider library (optional) + cudaLibPath = findCUDALibrary() + if cudaLibPath == "" { + fmt.Println("WARNING: CUDA provider library not found, will use CPU") + } + emb := &ONNXEmbedder{ tokenizerPath: tokenizerPath, dims: dims, @@ -223,16 +246,56 @@ func (e *ONNXEmbedder) ensureInitialized() error { onnxReady = false return } + // Register CUDA provider if available + if cudaLibPath != "" { + if err := onnxruntime_go.RegisterExecutionProviderLibrary("CUDA", cudaLibPath); err != nil { + e.logger.Warn("failed to register CUDA provider", "error", err) + } + } onnxReady = true }) if !onnxReady { return errors.New("ONNX runtime not ready") } + + // Create session options + opts, err := onnxruntime_go.NewSessionOptions() + if err != nil { + return fmt.Errorf("failed to create session options: %w", err) + } + defer opts.Destroy() + + // Try to add CUDA provider + useCUDA := cudaLibPath != "" + if useCUDA { + cudaOpts, err := onnxruntime_go.NewCUDAProviderOptions() + if err != nil { + e.logger.Warn("failed to create CUDA provider options, falling back to CPU", "error", err) + useCUDA = false + } else { + defer cudaOpts.Destroy() + if err := cudaOpts.Update(map[string]string{"device_id": "0"}); err != nil { + e.logger.Warn("failed to update CUDA options, falling back to CPU", "error", err) + useCUDA = false + } else if err := opts.AppendExecutionProviderCUDA(cudaOpts); err != nil { + e.logger.Warn("failed to append CUDA provider, falling back to CPU", "error", err) + useCUDA = false + } + } + } + + if useCUDA { + e.logger.Info("Using CUDA for ONNX inference") + } else { + e.logger.Info("Using CPU for ONNX inference") + } + + // Create session with options session, err := onnxruntime_go.NewDynamicAdvancedSession( e.getModelPath(), []string{"input_ids", "attention_mask"}, []string{"sentence_embedding"}, - nil, + opts, ) if err != nil { return fmt.Errorf("failed to create ONNX session: %w", err) @@ -304,6 +367,9 @@ func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { } func (e *ONNXEmbedder) EmbedSlice(texts []string) ([][]float32, error) { + if err := e.ensureInitialized(); err != nil { + return nil, err + } encodings := make([]*tokenizer.Encoding, len(texts)) maxLen := 0 for i, txt := range texts { -- cgit v1.2.3 From 4ef0a215119924347c2219f4677f11a96358307f Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Fri, 6 Mar 2026 09:32:45 +0300 Subject: Enha (onnx): unload model if noop for 30s --- rag/embedder.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 13f6a6e..39f4b5c 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -308,6 +308,19 @@ func (e *ONNXEmbedder) getModelPath() string { return e.modelPath } +func (e *ONNXEmbedder) Destroy() error { + e.mu.Lock() + defer e.mu.Unlock() + if e.session != nil { + if err := e.session.Destroy(); err != nil { + return fmt.Errorf("failed to destroy ONNX session: %w", err) + } + e.session = nil + e.logger.Info("ONNX session destroyed, VRAM freed") + } + return nil +} + func (e *ONNXEmbedder) Embed(text string) ([]float32, error) { if err := e.ensureInitialized(); err != nil { return nil, err -- cgit v1.2.3 From 17b68bc21fae99c17ec48e046e67a643b9d159bb Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Fri, 6 Mar 2026 18:58:23 +0300 Subject: Enha (rag): async writes --- rag/embedder.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index 39f4b5c..fd4cfa7 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -11,6 +11,7 @@ import ( "net/http" "os" "sync" + "time" "github.com/sugarme/tokenizer" "github.com/sugarme/tokenizer/pretrained" @@ -33,8 +34,10 @@ type APIEmbedder struct { func NewAPIEmbedder(l *slog.Logger, cfg *config.Config) *APIEmbedder { return &APIEmbedder{ logger: l, - client: &http.Client{}, - cfg: cfg, + client: &http.Client{ + Timeout: 30 * time.Second, + }, + cfg: cfg, } } -- cgit v1.2.3 From 014e297ae3497d07b5c46c234a9157db8dfce198 Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Fri, 6 Mar 2026 19:57:44 +0300 Subject: Chore: linter complaints --- rag/embedder.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'rag/embedder.go') diff --git a/rag/embedder.go b/rag/embedder.go index fd4cfa7..5a4aae0 100644 --- a/rag/embedder.go +++ b/rag/embedder.go @@ -213,7 +213,6 @@ func NewONNXEmbedder(modelPath, tokenizerPath string, dims int, logger *slog.Log if cudaLibPath == "" { fmt.Println("WARNING: CUDA provider library not found, will use CPU") } - emb := &ONNXEmbedder{ tokenizerPath: tokenizerPath, dims: dims, @@ -232,7 +231,6 @@ func (e *ONNXEmbedder) ensureInitialized() error { if e.session != nil { return nil } - // Load tokenizer lazily if e.tokenizer == nil { tok, err := pretrained.FromFile(e.tokenizerPath) @@ -241,7 +239,6 @@ func (e *ONNXEmbedder) ensureInitialized() error { } e.tokenizer = tok } - onnxInitOnce.Do(func() { onnxruntime_go.SetSharedLibraryPath(onnxLibPath) if err := onnxruntime_go.InitializeEnvironment(); err != nil { @@ -260,13 +257,14 @@ func (e *ONNXEmbedder) ensureInitialized() error { if !onnxReady { return errors.New("ONNX runtime not ready") } - // Create session options opts, err := onnxruntime_go.NewSessionOptions() if err != nil { return fmt.Errorf("failed to create session options: %w", err) } - defer opts.Destroy() + defer func() { + _ = opts.Destroy() + }() // Try to add CUDA provider useCUDA := cudaLibPath != "" @@ -276,7 +274,9 @@ func (e *ONNXEmbedder) ensureInitialized() error { e.logger.Warn("failed to create CUDA provider options, falling back to CPU", "error", err) useCUDA = false } else { - defer cudaOpts.Destroy() + defer func() { + _ = cudaOpts.Destroy() + }() if err := cudaOpts.Update(map[string]string{"device_id": "0"}); err != nil { e.logger.Warn("failed to update CUDA options, falling back to CPU", "error", err) useCUDA = false @@ -286,7 +286,6 @@ func (e *ONNXEmbedder) ensureInitialized() error { } } } - if useCUDA { e.logger.Info("Using CUDA for ONNX inference") } else { -- cgit v1.2.3