| author | Grail Finder <wohilas@gmail.com> | 2025-12-09 15:03:21 +0300 |
|---|---|---|
| committer | Grail Finder <wohilas@gmail.com> | 2025-12-09 15:03:21 +0300 |
| commit | e1bac8d0646ab9e6eab976dac0199807cb79a09e | |
| tree | 48605fef19e4c5840620d372577f617adedeae2b | |
| parent | 378dceb3f4dbb19554e568489d9773c00af4e114 | |
Feat: image prompt for llama.cpp /completion
| mode | file | changed lines |
|---|---|---|
| -rw-r--r-- | llm.go | 34 |
| -rw-r--r-- | main_test.go | 27 |
| -rw-r--r-- | models/models.go | 47 |
3 files changed, 80 insertions, 28 deletions
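
Before the diff itself, here is roughly what the new `/completion` request body looks like. This is a minimal sketch that mirrors the types added in `models/models.go` below; the JSON field names (`prompt_string`, `multimodal_data`, `image_data`) and the `<__media__>` marker are taken from the diff, while the local struct names are illustrative stand-ins, not the package's exported API.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Illustrative mirrors of the types added in models/models.go below.
// JSON field names come from the diff; these local names are stand-ins.
type promptObject struct {
	PromptString   string   `json:"prompt_string"`
	MultimodalData []string `json:"multimodal_data,omitempty"`
	ImageData      []string `json:"image_data,omitempty"`
}

type completionReq struct {
	Stream bool        `json:"stream"`
	Prompt interface{} `json:"prompt"` // string, or promptObject when media is attached
}

func main() {
	// One base64 payload, as FormMsg extracts it from a data URL.
	b64 := "<base64-image-bytes>" // placeholder, not real image data
	req := completionReq{
		Stream: true,
		Prompt: promptObject{
			// The <__media__> marker tells llama.cpp where to insert the media.
			PromptString:   "Describe the image. <__media__>",
			MultimodalData: []string{b64},
			ImageData:      []string{b64},
		},
	}
	out, _ := json.Marshal(req)
	fmt.Println(string(out))
	// {"stream":true,"prompt":{"prompt_string":"Describe the image. <__media__>",
	//  "multimodal_data":["<base64-image-bytes>"],"image_data":["<base64-image-bytes>"]}}
}
```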
```diff
diff --git a/llm.go b/llm.go
--- a/llm.go
+++ b/llm.go
@@ -6,6 +6,7 @@ import (
 	"gf-lt/models"
 	"io"
 	"strings"
+	"fmt"
 )
 
 var imageAttachmentPath string // Global variable to track image attachment for next message
@@ -82,6 +83,26 @@ func (lcp LCPCompletion) GetToken() string {
 
 func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, error) {
 	logger.Debug("formmsg lcpcompletion", "link", cfg.CurrentAPI)
+	localImageAttachmentPath := imageAttachmentPath
+	var multimodalData []string
+
+	if localImageAttachmentPath != "" {
+		imageURL, err := models.CreateImageURLFromPath(localImageAttachmentPath)
+		if err != nil {
+			logger.Error("failed to create image URL from path for completion", "error", err, "path", localImageAttachmentPath)
+			return nil, err
+		}
+		// Extract base64 part from data URL (e.g., "data:image/jpeg;base64,...")
+		parts := strings.SplitN(imageURL, ",", 2)
+		if len(parts) == 2 {
+			multimodalData = append(multimodalData, parts[1])
+		} else {
+			logger.Error("invalid image data URL format", "url", imageURL)
+			return nil, fmt.Errorf("invalid image data URL format")
+		}
+		imageAttachmentPath = "" // Clear the attachment after use
+	}
+
 	if msg != "" { // otherwise let the bot to continue
 		newMsg := models.RoleMsg{Role: role, Content: msg}
 		chatBody.Messages = append(chatBody.Messages, newMsg)
@@ -118,9 +139,18 @@ func (lcp LCPCompletion) FormMsg(msg, role string, resume bool) (io.Reader, erro
 	if cfg.ThinkUse && !cfg.ToolUse {
 		prompt += "<think>"
 	}
+	// Add multimodal media markers to the prompt text when multimodal data is present
+	// This is required by llama.cpp multimodal models so they know where to insert media
+	if len(multimodalData) > 0 {
+		// Add a media marker for each item in the multimodal data
+		for range multimodalData {
+			prompt += " <__media__>" // llama.cpp default multimodal marker
+		}
+	}
+
 	logger.Debug("checking prompt for /completion", "tool_use", cfg.ToolUse,
-		"msg", msg, "resume", resume, "prompt", prompt)
-	payload := models.NewLCPReq(prompt, defaultLCPProps, chatBody.MakeStopSlice())
+		"msg", msg, "resume", resume, "prompt", prompt, "multimodal_data_count", len(multimodalData))
+	payload := models.NewLCPReq(prompt, multimodalData, defaultLCPProps, chatBody.MakeStopSlice())
 	data, err := json.Marshal(payload)
 	if err != nil {
 		logger.Error("failed to form a msg", "error", err)
diff --git a/main_test.go b/main_test.go
index fb0a774..84d23ba 100644
--- a/main_test.go
+++ b/main_test.go
@@ -3,6 +3,7 @@ package main
 import (
 	"gf-lt/models"
 	"fmt"
+	"gf-lt/config"
 	"strings"
 	"testing"
 )
@@ -25,17 +26,17 @@ func TestRemoveThinking(t *testing.T) {
 		},
 	}
 	for i, tc := range cases {
-		t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
-			mNum := len(tc.cb.Messages)
-			removeThinking(tc.cb)
-			if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
-				t.Error("failed to delete tools msg", tc.cb.Messages, cfg.ToolRole)
-			}
-			for _, msg := range tc.cb.Messages {
-				if strings.Contains(msg.Content, "<think>") {
-					t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
-				}
-			}
-		})
-	}
+		t.Run(fmt.Sprintf("run_%d", i), func(t *testing.T) {
+			cfg = &config.Config{ToolRole: "tool"} // Initialize cfg.ToolRole for test
+			mNum := len(tc.cb.Messages)
+			removeThinking(tc.cb)
+			if len(tc.cb.Messages) != mNum-int(tc.toolMsgs) {
+				t.Errorf("failed to delete tools msg %v; expected %d, got %d", tc.cb.Messages, mNum-int(tc.toolMsgs), len(tc.cb.Messages))
+			}
+			for _, msg := range tc.cb.Messages {
+				if strings.Contains(msg.Content, "<think>") {
+					t.Errorf("msg contains think tag; msg: %s\n", msg.Content)
+				}
+			}
+		})
 	}
 }
```
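
The `FormMsg` change above assumes `models.CreateImageURLFromPath` returns a `data:<mime>;base64,<payload>` URL and keeps only the part after the comma. Below is a self-contained sketch of that convention; `makeDataURL` is a hypothetical stand-in, since the real helper's implementation is not part of this commit.

```go
package main

import (
	"encoding/base64"
	"fmt"
	"net/http"
	"os"
	"strings"
)

// makeDataURL is a hypothetical stand-in for models.CreateImageURLFromPath.
// FormMsg expects a "data:<mime>;base64,<payload>" URL from it.
func makeDataURL(path string) (string, error) {
	raw, err := os.ReadFile(path)
	if err != nil {
		return "", err
	}
	mime := http.DetectContentType(raw) // sniff the MIME type from the bytes
	return "data:" + mime + ";base64," + base64.StdEncoding.EncodeToString(raw), nil
}

func main() {
	url, err := makeDataURL("cat.png") // hypothetical input file
	if err != nil {
		fmt.Println("read failed:", err)
		return
	}
	// Same split FormMsg uses: keep only the base64 payload after the comma.
	parts := strings.SplitN(url, ",", 2)
	if len(parts) != 2 {
		fmt.Println("invalid data URL")
		return
	}
	fmt.Println("base64 payload length:", len(parts[1]))
}
```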
```diff
diff --git a/models/models.go b/models/models.go
index b4e7113..baadc8d 100644
--- a/models/models.go
+++ b/models/models.go
@@ -440,13 +440,14 @@ type LLMModels struct {
 
 type LlamaCPPReq struct {
 	Stream bool `json:"stream"`
-	// Messages []RoleMsg `json:"messages"`
-	Prompt        string   `json:"prompt"`
-	Temperature   float32  `json:"temperature"`
-	DryMultiplier float32  `json:"dry_multiplier"`
-	Stop          []string `json:"stop"`
-	MinP          float32  `json:"min_p"`
-	NPredict      int32    `json:"n_predict"`
+	// For multimodal requests, prompt should be an object with prompt_string and multimodal_data
+	// For regular requests, prompt is a string
+	Prompt        interface{} `json:"prompt"` // Can be string or object with prompt_string and multimodal_data
+	Temperature   float32     `json:"temperature"`
+	DryMultiplier float32     `json:"dry_multiplier"`
+	Stop          []string    `json:"stop"`
+	MinP          float32     `json:"min_p"`
+	NPredict      int32       `json:"n_predict"`
 	// MaxTokens int `json:"max_tokens"`
 	// DryBase float64 `json:"dry_base"`
 	// DryAllowedLength int `json:"dry_allowed_length"`
@@ -466,17 +467,37 @@ type LlamaCPPReq struct {
 	// Samplers string `json:"samplers"`
 }
 
-func NewLCPReq(prompt string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+type PromptObject struct {
+	PromptString   string   `json:"prompt_string"`
+	MultimodalData []string `json:"multimodal_data,omitempty"`
+	// Alternative field name used by some llama.cpp implementations
+	ImageData []string `json:"image_data,omitempty"` // For compatibility
+}
+
+func NewLCPReq(prompt string, multimodalData []string, props map[string]float32, stopStrings []string) LlamaCPPReq {
+	var finalPrompt interface{}
+
+	if len(multimodalData) > 0 {
+		// When multimodal data is present, use the object format as per Python example:
+		// { "prompt": { "prompt_string": "...", "multimodal_data": [...] } }
+		finalPrompt = PromptObject{
+			PromptString:   prompt,
+			MultimodalData: multimodalData,
+			ImageData:      multimodalData, // Also populate for compatibility with different llama.cpp versions
+		}
+	} else {
+		// When no multimodal data, use plain string
+		finalPrompt = prompt
+	}
+
 	return LlamaCPPReq{
-		Stream: true,
-		Prompt: prompt,
-		// Temperature: 0.8,
-		// DryMultiplier: 0.5,
+		Stream:        true,
+		Prompt:        finalPrompt,
 		Temperature:   props["temperature"],
 		DryMultiplier: props["dry_multiplier"],
+		Stop:          stopStrings,
 		MinP:          props["min_p"],
 		NPredict:      int32(props["n_predict"]),
-		Stop:          stopStrings,
 	}
 }
 
```
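
Finally, a minimal sketch of calling the new `NewLCPReq` signature, assuming it is compiled inside the gf-lt module so the `models` package is importable. The sampler values are placeholder assumptions; the project's actual defaults live in `defaultLCPProps`, which is not shown in this commit.

```go
package main

import (
	"encoding/json"
	"fmt"

	"gf-lt/models"
)

func main() {
	// Placeholder sampler values; the real defaults (defaultLCPProps)
	// are defined elsewhere in the repository.
	props := map[string]float32{
		"temperature":    0.8,
		"dry_multiplier": 0.0,
		"min_p":          0.05,
		"n_predict":      512,
	}
	stop := []string{"</s>"}

	// Text-only call: Prompt marshals to a plain JSON string.
	textReq := models.NewLCPReq("Hello", nil, props, stop)

	// Multimodal call: Prompt marshals to the object form with
	// prompt_string plus the multimodal_data/image_data arrays.
	imgReq := models.NewLCPReq("What is in this picture? <__media__>",
		[]string{"<base64-image-bytes>"}, props, stop)

	a, _ := json.Marshal(textReq)
	b, _ := json.Marshal(imgReq)
	fmt.Println(string(a))
	fmt.Println(string(b))
}
```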
