4 files changed, 113 insertions, 11 deletions
diff --git a/bot.go b/bot.go
index bf3a239..b3ae41e 100644
--- a/bot.go
+++ b/bot.go
@@ -1174,17 +1174,60 @@ func findCall(msg, toolCall string) bool {
 	toolRunningMode = false
 	toolMsg := string(resp)
 	logger.Info("llm used a tool call", "tool_name", fc.Name, "too_args", fc.Args, "id", fc.ID, "tool_resp", toolMsg)
-	fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
-		"\n\n", len(chatBody.Messages), cfg.ToolRole, toolMsg)
 	// Create tool response message with the proper tool_call_id
 	// Mark shell commands as always visible
 	isShellCommand := fc.Name == "execute_command"
-	toolResponseMsg := models.RoleMsg{
-		Role:           cfg.ToolRole,
-		Content:        toolMsg,
-		ToolCallID:     lastToolCall.ID,
-		IsShellCommand: isShellCommand,
+
+	// Check if response is multimodal content (image)
+	var toolResponseMsg models.RoleMsg
+	if strings.HasPrefix(strings.TrimSpace(toolMsg), `{"type":"multimodal_content"`) {
+		// Parse multimodal content response
+		multimodalResp := models.MultimodalToolResp{}
+		if err := json.Unmarshal([]byte(toolMsg), &multimodalResp); err == nil && multimodalResp.Type == "multimodal_content" {
+			// Create RoleMsg with ContentParts
+			var contentParts []any
+			for _, part := range multimodalResp.Parts {
+				partType, ok := part["type"]
+				if !ok {
+					continue
+				}
+				if partType == "text" {
+					contentParts = append(contentParts, models.TextContentPart{Type: "text", Text: part["text"]})
+				} else if partType == "image_url" {
+					contentParts = append(contentParts, models.ImageContentPart{
+						Type: "image_url",
+						ImageURL: struct {
+							URL string `json:"url"`
+						}{URL: part["url"]},
+					})
+				}
+			}
+			toolResponseMsg = models.RoleMsg{
+				Role:            cfg.ToolRole,
+				ContentParts:    contentParts,
+				HasContentParts: true,
+				ToolCallID:      lastToolCall.ID,
+				IsShellCommand:  isShellCommand,
+			}
+		} else {
+			// Fallback to regular content
+			toolResponseMsg = models.RoleMsg{
+				Role:           cfg.ToolRole,
+				Content:        toolMsg,
+				ToolCallID:     lastToolCall.ID,
+				IsShellCommand: isShellCommand,
+			}
+		}
+	} else {
+		toolResponseMsg = models.RoleMsg{
+			Role:           cfg.ToolRole,
+			Content:        toolMsg,
+			ToolCallID:     lastToolCall.ID,
+			IsShellCommand: isShellCommand,
+		}
 	}
+	fmt.Fprintf(textView, "%s[-:-:b](%d) <%s>: [-:-:-]\n%s\n",
+		"\n\n", len(chatBody.Messages), cfg.ToolRole, toolResponseMsg.GetText())
 	chatBody.Messages = append(chatBody.Messages, toolResponseMsg)
 	logger.Debug("findCall: added actual tool response", "role", toolResponseMsg.Role, "content_len", len(toolResponseMsg.Content), "tool_call_id", toolResponseMsg.ToolCallID, "message_count_after_add", len(chatBody.Messages))
 	// Clear the stored tool call ID after using it
diff --git a/models/consts.go b/models/consts.go
index 4f61435..8b4002b 100644
--- a/models/consts.go
+++ b/models/consts.go
@@ -1,7 +1,8 @@
 package models
 
 const (
-	LoadedMark = "(loaded) "
+	LoadedMark        = "(loaded) "
+	ToolRespMultyType = "multimodal_content" // "type" tag used by multimodal tool responses (see file_read_image)
 )
 
 type APIType int
diff --git a/models/models.go b/models/models.go
index a35f16c..973eb3d 100644
--- a/models/models.go
+++ b/models/models.go
@@ -391,7 +391,6 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
 	if err != nil {
 		return "", err
 	}
-
 	// Determine the image format based on file extension
 	var mimeType string
 	switch {
@@ -408,10 +407,8 @@ func CreateImageURLFromPath(imagePath string) (string, error) {
 	default:
 		mimeType = "image/jpeg" // default
 	}
-
 	// Encode to base64
 	encoded := base64.StdEncoding.EncodeToString(data)
-
 	// Create data URL
 	return fmt.Sprintf("data:%s;base64,%s", mimeType, encoded), nil
 }
@@ -623,3 +620,8 @@ type ChatRoundReq struct {
 	Regen   bool
 	Resume  bool
 }
+
+type MultimodalToolResp struct {
+	Type  string              `json:"type"`  // envelope tag; fileReadImage sets it to "multimodal_content"
+	Parts []map[string]string `json:"parts"` // each part has a "type" key plus "text" (text) or "url" (image_url)
+}
diff --git a/tools.go b/tools.go
index 04ba554..1e6cfb8 100644
--- a/tools.go
+++ b/tools.go
@@ -469,6 +469,43 @@ func fileRead(args map[string]string) []byte {
 	return jsonResult
 }
 
+func fileReadImage(args map[string]string) []byte {
+	path, ok := args["path"]
+	if !ok || path == "" {
+		msg := "path not provided to file_read_image tool"
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	path = resolvePath(path)
+	dataURL, err := models.CreateImageURLFromPath(path)
+	if err != nil {
+		msg := "failed to read image; error: " + err.Error()
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	// Wrap the image in the multimodal tool-response envelope. The consumer in
+	// bot.go detects it by the `{"type":"multimodal_content"` prefix and the
+	// Type value, so Type must stay the first field with this exact string.
+	// Each part carries a "type" key; the consumer reads "text" for text parts
+	// and "url" for image_url parts and skips parts of any other type.
+	// dataURL is a "data:<mime>;base64,<payload>" string built by
+	// models.CreateImageURLFromPath.
+	result := models.MultimodalToolResp{
+		Type: "multimodal_content",
+		Parts: []map[string]string{
+			{"type": "text", "text": "Image at " + path},
+			{"type": "image_url", "url": dataURL},
+		},
+	}
+	jsonResult, err := json.Marshal(result)
+	if err != nil {
+		msg := "failed to marshal result; error: " + err.Error()
+		logger.Error(msg)
+		return []byte(msg)
+	}
+	return jsonResult
+}
+
 func fileWrite(args map[string]string) []byte {
 	path, ok := args["path"]
 	if !ok || path == "" {
@@ -1101,6 +1138,7 @@ var fnMap = map[string]fnSig{
 	"read_url_raw":      readURLRaw,
 	"file_create":       fileCreate,
 	"file_read":         fileRead,
+	"file_read_image":   fileReadImage,
 	"file_write":        fileWrite,
 	"file_write_append": fileWriteAppend,
 	"file_edit":         fileEdit,
@@ -1327,6 +1365,24 @@ var baseTools = []models.Tool{
 			},
 		},
 	},
+	// file_read_image
+	models.Tool{
+		Type: "function",
+		Function: models.ToolFunc{
+			Name:        "file_read_image",
+			Description: "Read an image file and return it for multimodal LLM viewing. Supports png, jpg, jpeg, gif, webp formats. Use when you need the LLM to see and analyze an image.",
+			Parameters: models.ToolFuncParams{
+				Type:     "object",
+				Required: []string{"path"},
+				Properties: map[string]models.ToolArgProps{
+					"path": models.ToolArgProps{
+						Type:        "string",
+						Description: "path of the image file to read",
+					},
+				},
+			},
+		},
+	},
 	// file_write
 	models.Tool{
 		Type: "function",