| author | Grail Finder <wohilas@gmail.com> | 2025-12-09 15:03:21 +0300 |
|---|---|---|
| committer | Grail Finder <wohilas@gmail.com> | 2025-12-09 15:03:21 +0300 |
| commit | e1bac8d0646ab9e6eab976dac0199807cb79a09e | |
| tree | 48605fef19e4c5840620d372577f617adedeae2b | |
| parent | 378dceb3f4dbb19554e568489d9773c00af4e114 | |
Feat: image prompt for llama.cpp /completion
| mode | file | changed lines |
|---|---|---|
| -rw-r--r-- | llm.go | 34 |
| -rw-r--r-- | main_test.go | 27 |
| -rw-r--r-- | models/models.go | 47 |
3 files changed, 80 insertions, 28 deletions
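
Before the diff itself, here is roughly what the new `/completion` request body looks like. This is a minimal sketch that mirrors the types added in `models/models.go` below; the JSON field names (`prompt_string`, `multimodal_data`, `image_data`) and the `<__media__>` marker are taken from the diff, while the local struct names are illustrative stand-ins, not the package's exported API.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative mirrors of the types added in models/models.go below.
// JSON field names come from the diff; these local names are stand-ins.
type promptObject struct {
	PromptString   string   `json:"prompt_string"`
	MultimodalData []string `json:"multimodal_data,omitempty"`
	ImageData      []string `json:"image_data,omitempty"`
}

type completionReq struct {
	Stream bool        `json:"stream"`
	Prompt interface{} `json:"prompt"` // string, or promptObject when media is attached
}

func main() {
	// One base64 payload, as FormMsg extracts it from a data URL.
	b64 := "<base64-image-bytes>" // placeholder, not real image data
	req := completionReq{
		Stream: true,
		Prompt: promptObject{
			// The <__media__> marker tells llama.cpp where to insert the media.
			PromptString:   "Describe the image. <__media__>",
			MultimodalData: []string{b64},
			ImageData:      []string{b64},
		},
	}
	out, _ := json.Marshal(req)
	fmt.Println(string(out))
	// {"stream":true,"prompt":{"prompt_string":"Describe the image. <__media__>",
	//  "multimodal_data":["<base64-image-bytes>"],"image_data":["<base64-image-bytes>"]}}
}
```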
```diff
diff --git a/llm.go b/llm.go
--- a/llm.go
+++ b/llm.go
@@ -6,6 +6,7 @@ import (
 	"gf-lt/models"
 	"io"
 	"strings"
+	"fmt"
 )
 
 var imageAttachmentPath string // Global variable to track image attachment for next message
@@ -82,6 +83,26 @@ func (lcp LCPCompletion) GetToken() string {
 
 func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, error) {
 	logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
+	localImageAttachmentPath := imageAttachmentPath
+	var multimodalData []string
+
+	if localImageAttachmentPath != "" {
+		imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
+		if err != nil {
+			logger.Error("failed to create image URL from path for completion", "error", err, "path", localImageAttachmentPath)
+			return nil, err
+		}
+		// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
+		parts := strings.SplitN(imageURL, ",", 2)
+		if len(parts) == 2 {
+			multimodalData = append(multimodalData, parts[1])
+		} else {
+			logger.Error("invalid image data URL format", "url", imageURL)
+			return nil, fmt.Errorf("invalid image data URL format")
+		}
+		imageAttachmentPath = "" // Clear the attachment after use
+	}
+
 	if msg != "" { // otherwise let the bot to continue
 		newMsg := models.RoleMsg{Role: role, Content: msg}
 		chatBody.Messages = append(chatBody.Messages, newMsg)
@@ -118,9 +139,18 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
 	if cfg.ThinkUse && !cfg.ToolUse {
 		prompt += "<think>"
 	}
+	// Add multimodal media markers to the prompt text when multimodal data is present
+	// This is required by llama.cpp multimodal models so they know where to insert media
+	if len(multimodalData) > 0 {
+		// Add a media marker for each item in the multimodal data
+		for range multimodalData {
+			prompt += " <__media__>" // llama.cpp default multimodal marker
+		}
+	}
+
 	logger.Debug("checking prompt for /completion", "tool_use", cfg.ToolUse,
-		"msg", msg, "resume", resume, "prompt", prompt)
-	payload := models.NewLCPReq(prompt, defaultLCPProps, chatBody.MakeStopSlice())
+		"msg", msg, "resume", resume, "prompt", prompt, "multimodal_data_count", len(multimodalData))
+	payload := models.NewLCPReq(prompt, multimodalData, defaultLCPProps, chatBody.MakeStopSlice())
 	data, err := json.Marshal(payload)
 	if err != nil {
 		logger.Error("failed to form a msg", "error", err)
diff --git a/main_test.go b/main_test.go
index fb0a774..84d23ba 100644
--- a/main_test.go
+++ b/main_test.go
@@ -3,6 +3,7 @@ package main
 import (
 	"gf-lt/models"
 	"fmt"
+	"gf-lt/config"
 	"strings"
 	"testing"
 )
@@ -25,17 +26,17 @@ func TestRemoveThinking(t *testing.T) {
 		},
 	}
 	for i, tc := range cases {
-		t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
-			mNum := len(tc.cb.Messages)
-			removeThinking(tc.cb)
-			if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
-				t.Error("failed to delete tools msg", tc.cb.Messages, cfg.ToolRole)
-			}
-			for _, msg := range tc.cb.Messages {
-				if strings.Contains(msg.Content, "<think>") {
-					t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
-				}
-			}
-		})
-	}
+		t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
+			cfg = &config.Config{ToolRole: "tool"} // Initialize cfg.ToolRole for test
+			mNum := len(tc.cb.Messages)
+			removeThinking(tc.cb)
+			if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
+				t.Errorf("failed to delete tools msg %v; expected %d, got %d", tc.cb.Messages, mNum-int(tc.toolMsgs), len(tc.cb.Messages))
+			}
+			for _, msg := range tc.cb.Messages {
+				if strings.Contains(msg.Content, "<think>") {
+					t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
+				}
+			}
+		})
 	}
 }
```
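
The `FormMsg` change above assumes `models.CreateImageURLFromPath` returns a `data:<mime>;base64,<payload>` URL and keeps only the part after the comma. Below is a self-contained sketch of that convention; `makeDataURL` is a hypothetical stand-in, since the real helper's implementation is not part of this commit.

```go
package main

import (
	"encoding/base64"
	"fmt"
	"net/http"
	"os"
	"strings"
)

// makeDataURL is a hypothetical stand-in for models.CreateImageURLFromPath.
// FormMsg expects a "data:<mime>;base64,<payload>" URL from it.
func makeDataURL(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	mime := http.DetectContentType(raw) // sniff the MIME type from the bytes
	return "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(raw), nil
}

func main() {
	url, err := makeDataURL("cat.png") // hypothetical input file
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	// Same split FormMsg uses: keep only the base64 payload after the comma.
	parts := strings.SplitN(url, ",", 2)
	if len(parts) != 2 {
		fmt.Println("invalid data URL")
		return
	}
	fmt.Println("base64 payload length:", len(parts[1]))
}
```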
```diff
diff --git a/models/models.go b/models/models.go
index b4e7113..baadc8d 100644
--- a/models/models.go
+++ b/models/models.go
@@ -440,13 +440,14 @@ type LLMModels struct {
 
 type LlamaCPPReq struct {
 	Stream bool `json:"stream"`
-	// Messages []RoleMsg `json:"messages"`
-	Prompt        string   `json:"prompt"`
-	Temperature   float32  `json:"temperature"`
-	DryMultiplier float32  `json:"dry_multiplier"`
-	Stop          []string `json:"stop"`
-	MinP          float32  `json:"min_p"`
-	NPredict      int32    `json:"n_predict"`
+	// For multimodal requests, prompt should be an object with prompt_string and multimodal_data
+	// For regular requests, prompt is a string
+	Prompt        interface{} `json:"prompt"` // Can be string or object with prompt_string and multimodal_data
+	Temperature   float32     `json:"temperature"`
+	DryMultiplier float32     `json:"dry_multiplier"`
+	Stop          []string    `json:"stop"`
+	MinP          float32     `json:"min_p"`
+	NPredict      int32       `json:"n_predict"`
 	// MaxTokens int `json:"max_tokens"`
 	// DryBase float64 `json:"dry_base"`
 	// DryAllowedLength int `json:"dry_allowed_length"`
@@ -466,17 +467,37 @@ type LlamaCPPReq struct {
 	// Samplers string `json:"samplers"`
 }
 
-func NewLCPReq(prompt string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+type PromptObject struct {
+	PromptString   string   `json:"prompt_string"`
+	MultimodalData []string `json:"multimodal_data,omitempty"`
+	// Alternative field name used by some llama.cpp implementations
+	ImageData []string `json:"image_data,omitempty"` // For compatibility
+}
+
+func NewLCPReq(prompt string, multimodalData []string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+	var finalPrompt interface{}
+
+	if len(multimodalData) > 0 {
+		// When multimodal data is present, use the object format as per Python example:
+		// { "prompt": { "prompt_string": "...", "multimodal_data": [...] } }
+		finalPrompt = PromptObject{
+			PromptString:   prompt,
+			MultimodalData: multimodalData,
+			ImageData:      multimodalData, // Also populate for compatibility with different llama.cpp versions
+		}
+	} else {
+		// When no multimodal data, use plain string
+		finalPrompt = prompt
+	}
+
 	return LlamaCPPReq{
-		Stream: true,
-		Prompt: prompt,
-		// Temperature: 0.8,
-		// DryMultiplier: 0.5,
+		Stream:        true,
+		Prompt:        finalPrompt,
 		Temperature:   props["temperature"],
 		DryMultiplier: props["dry_multiplier"],
+		Stop:          stopStrings,
 		MinP:          props["min_p"],
 		NPredict:      int32(props["n_predict"]),
-		Stop:          stopStrings,
 	}
 }
 
```
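
Finally, a minimal sketch of calling the new `NewLCPReq` signature, assuming it is compiled inside the gf-lt module so the `models` package is importable. The sampler values are placeholder assumptions; the project's actual defaults live in `defaultLCPProps`, which is not shown in this commit.

```go
package main

import (
	"encoding/json"
	"fmt"

	"gf-lt/models"
)

func main() {
	// Placeholder sampler values; the real defaults (defaultLCPProps)
	// are defined elsewhere in the repository.
	props := map[string]float32{
		"temperature":    0.8,
		"dry_multiplier": 0.0,
		"min_p":          0.05,
		"n_predict":      512,
	}
	stop := []string{"</s>"}

	// Text-only call: Prompt marshals to a plain JSON string.
	textReq := models.NewLCPReq("Hello", nil, props, stop)

	// Multimodal call: Prompt marshals to the object form with
	// prompt_string plus the multimodal_data/image_data arrays.
	imgReq := models.NewLCPReq("What is in this picture? <__media__>",
		[]string{"<base64-image-bytes>"}, props, stop)

	a, _ := json.Marshal(textReq)
	b, _ := json.Marshal(imgReq)
	fmt.Println(string(a))
	fmt.Println(string(b))
}
```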
