summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- bot.go 57
-rw-r--r-- models/consts.go 3
-rw-r--r-- models/models.go 8
-rw-r--r-- tools.go 56
4 files changed, 113 insertions, 11 deletions
diff --git a/bot.go b/bot.go
index bf3a239..b3ae41e 100644
--- a/bot.go
+++ b/bot.go
@@ -1174,17 +1174,60 @@ func findCall(msg, toolCall string) bool {
toolRunningMode = false
toolMsg := string(resp)
logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
- fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
- "\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
// Create tool response message with the proper tool_call_id
// Mark shell commands as always visible
isShellCommand := fc.Name == "execute_command"
- toolResponseMsg := models.RoleMsg{
- Role: cfg.ToolRole,
- Content: toolMsg,
- ToolCallID: lastToolCall.ID,
- IsShellCommand: isShellCommand,
+
+ // Check if response is multimodal content (image)
+ var toolResponseMsg models.RoleMsg
+ if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
+ // Parse multimodal content response
+ multimodalResp := models.MultimodalToolResp{}
+ if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
+ // Create RoleMsg with ContentParts
+ var contentParts []any
+ for _, part := range multimodalResp.Parts {
+ partType, ok := part["type"]
+ if !ok {
+ continue
+ }
+ if partType == "text" {
+ contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
+ } else if partType == "image_url" {
+ contentParts = append(contentParts, models.ImageContentPart{
+ Type: "image_url",
+ ImageURL: struct {
+ URL string `json:"url"`
+ }{URL: part["url"]},
+ })
+ }
+ }
+ toolResponseMsg = models.RoleMsg{
+ Role: cfg.ToolRole,
+ ContentParts: contentParts,
+ HasContentParts: true,
+ ToolCallID: lastToolCall.ID,
+ IsShellCommand: isShellCommand,
+ }
+ } else {
+ // Fallback to regular content
+ toolResponseMsg = models.RoleMsg{
+ Role: cfg.ToolRole,
+ Content: toolMsg,
+ ToolCallID: lastToolCall.ID,
+ IsShellCommand: isShellCommand,
+ }
+ }
+ } else {
+ toolResponseMsg = models.RoleMsg{
+ Role: cfg.ToolRole,
+ Content: toolMsg,
+ ToolCallID: lastToolCall.ID,
+ IsShellCommand: isShellCommand,
+ }
}
+ fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
+ "\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
// Clear the stored tool call ID after using it
diff --git a/models/consts.go b/models/consts.go
index 4f61435..8b4002b 100644
--- a/models/consts.go
+++ b/models/consts.go
@@ -1,7 +1,8 @@
package models
const (
- LoadedMark = "(loaded) "
+ LoadedMark = "(loaded) "
+ ToolRespMultyType = "multimodal_content" // "type" value of a multimodal tool response; must equal the literal fileReadImage emits and findCall checks
)
type APIType int
diff --git a/models/models.go b/models/models.go
index a35f16c..973eb3d 100644
--- a/models/models.go
+++ b/models/models.go
@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
if err != nil {
return "", err
}
-
// Determine the image format based on file extension
var mimeType string
switch {
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
default:
mimeType = "image/jpeg" // default
}
-
// Encode to base64
encoded := base64.StdEncoding.EncodeToString(data)
-
// Create data URL
return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
}
@@ -623,3 +620,8 @@ type ChatRoundReq struct {
Regen bool
Resume bool
}
+
+type MultimodalToolResp struct { // JSON envelope a tool returns when its output is a list of content parts
+ Type string `json:"type"` // fileReadImage sets this to "multimodal_content"; findCall requires that value
+ Parts []map[string]string `json:"parts"` // each part has a "type" key ("text" or "image_url") plus a "text" or "url" payload
+}
diff --git a/tools.go b/tools.go
index 04ba554..1e6cfb8 100644
--- a/tools.go
+++ b/tools.go
@@ -469,6 +469,43 @@ func fileRead(args map[string]string) []byte {
return jsonResult
}
+// fileReadImage implements the file_read_image tool: it turns the image file
+// named by args["path"] into a multimodal tool response.
+//
+// The path goes through resolvePath, then models.CreateImageURLFromPath builds
+// a data URL from the file. On success the result is a JSON-encoded
+// models.MultimodalToolResp with one text part and one image_url part. On any
+// failure the error text is logged and returned as-is (plain text, not JSON).
+func fileReadImage(args map[string]string) []byte {
+	// Log the message and hand the same text back as the tool output.
+	fail := func(msg string) []byte {
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	// A missing key reads as "", so one comparison covers absent and empty.
+	imgPath := args["path"]
+	if imgPath == "" {
+		return fail("path not provided to file_read_image tool")
+	}
+	imgPath = resolvePath(imgPath)
+	imgURL, err := models.CreateImageURLFromPath(imgPath)
+	if err != nil {
+		return fail("failed to read image; error: " + err.Error())
+	}
+	parts := []map[string]string{
+		{"type": "text", "text": "Image at " + imgPath},
+		{"type": "image_url", "url": imgURL},
+	}
+	payload, err := json.Marshal(models.MultimodalToolResp{
+		Type:  "multimodal_content",
+		Parts: parts,
+	})
+	if err != nil {
+		return fail("failed to marshal result; error: " + err.Error())
+	}
+	return payload
+}
+
func fileWrite(args map[string]string) []byte {
path, ok := args["path"]
if !ok || path == "" {
@@ -1101,6 +1138,7 @@ var fnMap = map[string]fnSig{
"read_url_raw": readURLRaw,
"file_create": fileCreate,
"file_read": fileRead,
+ "file_read_image": fileReadImage,
"file_write": fileWrite,
"file_write_append": fileWriteAppend,
"file_edit": fileEdit,
@@ -1327,6 +1365,24 @@ var baseTools = []models.Tool{
},
},
},
+ // file_read_image
+ models.Tool{
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "file_read_image",
+ Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"path"},
+ Properties: map[string]models.ToolArgProps{
+ "path": models.ToolArgProps{
+ Type: "string",
+ Description: "path of the image file to read",
+ },
+ },
+ },
+ },
+ },
// file_write
models.Tool{
Type: "function",