author    Grail Finder <wohilas@gmail.com>  2025-12-09 15:03:21 +0300
committer Grail Finder <wohilas@gmail.com>  2025-12-09 15:03:21 +0300
commit    e1bac8d0646ab9e6eab976dac0199807cb79a09e (patch)
tree      48605fef19e4c5840620d372577f617adedeae2b
parent    378dceb3f4dbb19554e568489d9773c00af4e114 (diff)
Feat: image prompt for llama.cpp /completion
-rw-r--r--  llm.go            | 34
-rw-r--r--  main_test.go      | 27
-rw-r--r--  models/models.go  | 47
3 files changed, 80 insertions, 28 deletions
diff --git a/llm.go b/llm.go
index 469df92..beb9273 100644
--- a/llm.go
+++ b/llm.go
@@ -6,6 +6,7 @@ import (
"gf-lt/models"
"io"
"strings"
+ "fmt"
)
var imageAttachmentPath string // Global variable to track image attachment for next message
@@ -82,6 +83,26 @@ func (lcp LCPCompletion) GetToken() string {
func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, error) {
logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
+ localImageAttachmentPath := imageAttachmentPath
+ var multimodalData []string
+
+ if localImageAttachmentPath != "" {
+ imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
+ if err != nil {
+ logger.Error("failed to create image URL from path for completion", "error", err, "path", localImageAttachmentPath)
+ return nil, err
+ }
+ // Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
+ parts := strings.SplitN(imageURL, ",", 2)
+ if len(parts) == 2 {
+ multimodalData = append(multimodalData, parts[1])
+ } else {
+ logger.Error("invalid image data URL format", "url", imageURL)
+ return nil, fmt.Errorf("invalid image data URL format")
+ }
+ imageAttachmentPath = "" // Clear the attachment after use
+ }
+
if msg != "" { // otherwise let the bot continue
newMsg := models.RoleMsg{Role: role, Content: msg}
chatBody.Messages = append(chatBody.Messages, newMsg)
@@ -118,9 +139,18 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
if cfg.ThinkUse && !cfg.ToolUse {
prompt += "<think>"
}
+ // Add multimodal media markers to the prompt text when multimodal data is present
+ // This is required by llama.cpp multimodal models so they know where to insert media
+ if len(multimodalData) > 0 {
+ // Add a media marker for each item in the multimodal data
+ for range multimodalData {
+ prompt += " <__media__>" // llama.cpp default multimodal marker
+ }
+ }
+
logger.Debug("checking prompt for /completion", "tool_use", cfg.ToolUse,
- "msg", msg, "resume", resume, "prompt", prompt)
- payload := models.NewLCPReq(prompt, defaultLCPProps, chatBody.MakeStopSlice())
+ "msg", msg, "resume", resume, "prompt", prompt, "multimodal_data_count", len(multimodalData))
+ payload := models.NewLCPReq(prompt, multimodalData, defaultLCPProps, chatBody.MakeStopSlice())
data, err := json.Marshal(payload)
if err != nil {
logger.Error("failed to form a msg", "error", err)
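
For readers unfamiliar with the data-URL plumbing in the hunk above, here is a minimal standalone sketch of the same flow: convert an attachment to a base64 data URL, strip the "data:<mime>;base64," prefix, and append one <__media__> marker per attachment. The toDataURL helper is a hypothetical stand-in for models.CreateImageURLFromPath, and the file path and fixed MIME type are illustrative assumptions, not code from the repo.

	package main

	import (
		"encoding/base64"
		"fmt"
		"os"
		"strings"
	)

	// toDataURL is a hypothetical stand-in for models.CreateImageURLFromPath:
	// it reads a file and wraps it in a "data:image/jpeg;base64,..." URL.
	func toDataURL(path string) (string, error) {
		raw, err := os.ReadFile(path)
		if err != nil {
			return "", err
		}
		return "data:image/jpeg;base64," + base64.StdEncoding.EncodeToString(raw), nil
	}

	func main() {
		imageURL, err := toDataURL("cat.jpg") // hypothetical attachment
		if err != nil {
			panic(err)
		}
		// Same extraction as in FormMsg: keep only the base64 payload
		// after the "data:<mime>;base64," prefix.
		parts := strings.SplitN(imageURL, ",", 2)
		if len(parts) != 2 {
			panic("invalid image data URL format")
		}
		multimodalData := []string{parts[1]}
		// One <__media__> marker per attachment tells llama.cpp where
		// the media slots into the prompt text.
		prompt := "Describe the attached image."
		for range multimodalData {
			prompt += " <__media__>"
		}
		fmt.Println(prompt)
	}
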
diff --git a/main_test.go b/main_test.go
index fb0a774..84d23ba 100644
--- a/main_test.go
+++ b/main_test.go
@@ -3,6 +3,7 @@ package main
import (
"gf-lt/models"
"fmt"
+ "gf-lt/config"
"strings"
"testing"
)
@@ -25,17 +26,18 @@ func TestRemoveThinking(t *testing.T) {
},
}
for i, tc := range cases {
- t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
- mNum := len(tc.cb.Messages)
- removeThinking(tc.cb)
- if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
- t.Error("failed to delete tools msg", tc.cb.Messages, cfg.ToolRole)
- }
- for _, msg := range tc.cb.Messages {
- if strings.Contains(msg.Content, "<think>") {
- t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
- }
- }
- })
- }
+ t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
+ cfg = &config.Config{ToolRole: "tool"} // Initialize cfg.ToolRole for test
+ mNum := len(tc.cb.Messages)
+ removeThinking(tc.cb)
+ if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
+ t.Errorf("failed to delete tools msg %v; expected %d, got %d", tc.cb.Messages, mNum-int(tc.toolMsgs), len(tc.cb.Messages))
+ }
+ for _, msg := range tc.cb.Messages {
+ if strings.Contains(msg.Content, "<think>") {
+ t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
+ }
+ }
+ })
+ }
}
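
The notable fix above is seeding the package-level cfg inside each subtest, presumably because removeThinking consults cfg.ToolRole and would dereference a nil config otherwise. A minimal sketch of that pattern, with a stand-in Config type (the real one lives in gf-lt/config; the rest is illustrative and belongs in a _test.go file):

	package main

	import (
		"fmt"
		"testing"
	)

	type Config struct{ ToolRole string }

	var cfg *Config // package-level, nil until a test (or main) sets it

	func roleIsTool(role string) bool { return role == cfg.ToolRole }

	func TestRoleIsTool(t *testing.T) {
		cases := []string{"tool", "user"}
		for i, role := range cases {
			t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
				cfg = &Config{ToolRole: "tool"} // reset per subtest, never nil
				got := roleIsTool(role)
				if got != (role == "tool") {
					t.Errorf("roleIsTool(%q) = %v", role, got)
				}
			})
		}
	}
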
diff --git a/models/models.go b/models/models.go
index b4e7113..baadc8d 100644
--- a/models/models.go
+++ b/models/models.go
@@ -440,13 +440,14 @@ type LLMModels struct {
type LlamaCPPReq struct {
Stream bool `json:"stream"`
- // Messages []RoleMsg `json:"messages"`
- Prompt string `json:"prompt"`
- Temperature float32 `json:"temperature"`
- DryMultiplier float32 `json:"dry_multiplier"`
- Stop []string `json:"stop"`
- MinP float32 `json:"min_p"`
- NPredict int32 `json:"n_predict"`
+ // For multimodal requests, prompt should be an object with prompt_string and multimodal_data
+ // For regular requests, prompt is a string
+ Prompt interface{} `json:"prompt"` // Can be string or object with prompt_string and multimodal_data
+ Temperature float32 `json:"temperature"`
+ DryMultiplier float32 `json:"dry_multiplier"`
+ Stop []string `json:"stop"`
+ MinP float32 `json:"min_p"`
+ NPredict int32 `json:"n_predict"`
// MaxTokens int `json:"max_tokens"`
// DryBase float64 `json:"dry_base"`
// DryAllowedLength int `json:"dry_allowed_length"`
@@ -466,17 +467,37 @@ type LlamaCPPReq struct {
// Samplers string `json:"samplers"`
}
-func NewLCPReq(prompt string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+type PromptObject struct {
+ PromptString string `json:"prompt_string"`
+ MultimodalData []string `json:"multimodal_data,omitempty"`
+ // Alternative field name used by some llama.cpp implementations
+ ImageData []string `json:"image_data,omitempty"` // For compatibility
+}
+
+func NewLCPReq(prompt string, multimodalData []string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+ var finalPrompt interface{}
+
+ if len(multimodalData) > 0 {
+ // When multimodal data is present, use the object format as per Python example:
+ // { "prompt": { "prompt_string": "...", "multimodal_data": [...] } }
+ finalPrompt = PromptObject{
+ PromptString: prompt,
+ MultimodalData: multimodalData,
+ ImageData: multimodalData, // Also populate for compatibility with different llama.cpp versions
+ }
+ } else {
+ // When no multimodal data, use plain string
+ finalPrompt = prompt
+ }
+
return LlamaCPPReq{
- Stream: true,
- Prompt: prompt,
- // Temperature: 0.8,
- // DryMultiplier: 0.5,
+ Stream: true,
+ Prompt: finalPrompt,
Temperature: props["temperature"],
DryMultiplier: props["dry_multiplier"],
+ Stop: stopStrings,
MinP: props["min_p"],
NPredict: int32(props["n_predict"]),
- Stop: stopStrings,
}
}
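
Putting the models.go change together, NewLCPReq now emits one of two JSON shapes depending on whether multimodal data is present. A trimmed-down, self-contained sketch (sampling fields omitted from the structs in the diff above; the base64 string is a placeholder):

	package main

	import (
		"encoding/json"
		"fmt"
	)

	type PromptObject struct {
		PromptString   string   `json:"prompt_string"`
		MultimodalData []string `json:"multimodal_data,omitempty"`
	}

	type LlamaCPPReq struct {
		Stream bool        `json:"stream"`
		Prompt interface{} `json:"prompt"` // string, or PromptObject for multimodal
		Stop   []string    `json:"stop"`
	}

	func main() {
		// Text-only request: prompt stays a plain string.
		plain := LlamaCPPReq{Stream: true, Prompt: "Describe the weather.", Stop: []string{"</s>"}}
		// Multimodal request: prompt becomes an object carrying base64 images.
		multi := LlamaCPPReq{
			Stream: true,
			Prompt: PromptObject{
				PromptString:   "Describe this image: <__media__>",
				MultimodalData: []string{"<base64-image-bytes>"},
			},
			Stop: []string{"</s>"},
		}
		for _, req := range []LlamaCPPReq{plain, multi} {
			b, _ := json.Marshal(req)
			fmt.Println(string(b))
		}
	}

The first Println prints the prompt as a bare JSON string; the second serializes it as a nested object, matching the { "prompt": { "prompt_string": ..., "multimodal_data": [...] } } shape referenced in the NewLCPReq comment above.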