diff options
| -rw-r--r-- | agent/agent.go | 12 | ||||
| -rw-r--r-- | agent/pw_agent.go | 126 | ||||
| -rw-r--r-- | agent/pw_tools.go | 349 | ||||
| -rw-r--r-- | agent/request.go | 145 | ||||
| -rw-r--r-- | agent/webagent.go | 3 | ||||
| -rw-r--r-- | config.example.toml | 2 | ||||
| -rw-r--r-- | tools.go | 97 |
7 files changed, 634 insertions, 100 deletions
diff --git a/agent/agent.go b/agent/agent.go index 8824ecb..8a6614f 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -4,11 +4,12 @@ package agent // ones who do their own tools calls // ones that works only with the output -// A: main chat -> agent (handles everything: tool + processing) +// A: main chat -> agent (handles everything: tool + processing), supports tool chaining // B: main chat -> tool -> agent (process tool output) -// AgenterA gets a task "find out weather in london" -// proceeds to make tool calls on its own +// AgenterA gets a task like "go to the webpage, login and take a screenshot (tell me what you see)" +// proceeds to make a plan and executes it. +// returns with final result or an error type AgenterA interface { ProcessTask(task string) []byte } @@ -38,8 +39,3 @@ func RegisterA(toolNames []string, a AgenterA) { func Get(toolName string) AgenterB { return RegistryB[toolName] } - -// Register is a convenience wrapper for RegisterB. -func Register(toolName string, a AgenterB) { - RegisterB(toolName, a) -} diff --git a/agent/pw_agent.go b/agent/pw_agent.go new file mode 100644 index 0000000..2807331 --- /dev/null +++ b/agent/pw_agent.go @@ -0,0 +1,126 @@ +package agent + +import ( + "encoding/json" + "gf-lt/models" + "strings" +) + +// PWAgent: is AgenterA type agent (enclosed with tool chaining) +// sysprompt explain tools and how to plan for execution +type PWAgent struct { + *AgentClient + sysprompt string +} + +// NewPWAgent creates a PWAgent with the given client and system prompt +func NewPWAgent(client *AgentClient, sysprompt string) *PWAgent { + return &PWAgent{AgentClient: client, sysprompt: sysprompt} +} + +// SetTools sets the tools available to the agent +func (a *PWAgent) SetTools(tools []models.Tool) { + a.tools = tools +} + +func (a *PWAgent) ProcessTask(task string) []byte { + req, err := a.FormFirstMsg(a.sysprompt, task) + if err != nil { + a.Log().Error("PWAgent failed to process the request", "error", err) + return []byte("PWAgent failed to process the request; err: " + err.Error()) + } + toolCallLimit := 10 + for i := 0; i < toolCallLimit; i++ { + resp, err := a.LLMRequest(req) + if err != nil { + a.Log().Error("failed to process the request", "error", err) + return []byte("failed to process the request; err: " + err.Error()) + } + execTool, toolCallID, hasToolCall := findToolCall(resp) + if !hasToolCall { + return resp + } + + a.setToolCallOnLastMessage(resp, toolCallID) + + toolResp := string(execTool()) + req, err = a.FormMsgWithToolCallID(toolResp, toolCallID) + if err != nil { + a.Log().Error("failed to form next message", "error", err) + return []byte("failed to form next message; err: " + err.Error()) + } + } + return nil +} + +func (a *PWAgent) setToolCallOnLastMessage(resp []byte, toolCallID string) { + if toolCallID == "" { + return + } + + var genericResp map[string]interface{} + if err := json.Unmarshal(resp, &genericResp); err != nil { + return + } + + var name string + var args map[string]string + + if choices, ok := genericResp["choices"].([]interface{}); ok && len(choices) > 0 { + if firstChoice, ok := choices[0].(map[string]interface{}); ok { + if message, ok := firstChoice["message"].(map[string]interface{}); ok { + if toolCalls, ok := message["tool_calls"].([]interface{}); ok && len(toolCalls) > 0 { + if tc, ok := toolCalls[0].(map[string]interface{}); ok { + if fn, ok := tc["function"].(map[string]interface{}); ok { + name, _ = fn["name"].(string) + argsStr, _ := fn["arguments"].(string) + json.Unmarshal([]byte(argsStr), &args) + } + } + } + } + } + } + + if name == "" { + content, _ := genericResp["content"].(string) + name = extractToolNameFromText(content) + } + + lastIdx := len(a.chatBody.Messages) - 1 + if lastIdx >= 0 { + a.chatBody.Messages[lastIdx].ToolCallID = toolCallID + if name != "" { + argsJSON, _ := json.Marshal(args) + a.chatBody.Messages[lastIdx].ToolCall = &models.ToolCall{ + ID: toolCallID, + Name: name, + Args: string(argsJSON), + } + } + } +} + +func extractToolNameFromText(text string) string { + jsStr := toolCallRE.FindString(text) + if jsStr == "" { + return "" + } + jsStr = strings.TrimSpace(jsStr) + jsStr = strings.TrimPrefix(jsStr, "__tool_call__") + jsStr = strings.TrimSuffix(jsStr, "__tool_call__") + jsStr = strings.TrimSpace(jsStr) + + start := strings.Index(jsStr, "{") + end := strings.LastIndex(jsStr, "}") + if start == -1 || end == -1 || end <= start { + return "" + } + jsStr = jsStr[start : end+1] + + var fc models.FuncCall + if err := json.Unmarshal([]byte(jsStr), &fc); err != nil { + return "" + } + return fc.Name +} diff --git a/agent/pw_tools.go b/agent/pw_tools.go new file mode 100644 index 0000000..19fd130 --- /dev/null +++ b/agent/pw_tools.go @@ -0,0 +1,349 @@ +package agent + +import ( + "encoding/json" + "fmt" + "regexp" + "strings" + + "gf-lt/models" +) + +type ToolFunc func(map[string]string) []byte + +var pwToolMap = make(map[string]ToolFunc) + +func RegisterPWTool(name string, fn ToolFunc) { + pwToolMap[name] = fn +} + +func GetPWTools() []models.Tool { + return pwTools +} + +var pwTools = []models.Tool{ + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_start", + Description: "Start a Playwright browser instance. Must be called first before any other browser automation. Uses headless mode by default.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{}, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_stop", + Description: "Stop the Playwright browser instance. Call when done with browser automation.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{}, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_is_running", + Description: "Check if Playwright browser is currently running.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{}, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_navigate", + Description: "Navigate to a URL in the browser.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"url"}, + Properties: map[string]models.ToolArgProps{ + "url": {Type: "string", Description: "URL to navigate to"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_click", + Description: "Click on an element on the current webpage. Use 'index' for multiple matches (default 0).", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"selector"}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector for the element"}, + "index": {Type: "integer", Description: "Index for multiple matches (default 0)"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_fill", + Description: "Type text into an input field. Use 'index' for multiple matches (default 0).", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"selector", "text"}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector for the input element"}, + "text": {Type: "string", Description: "Text to type into the field"}, + "index": {Type: "integer", Description: "Index for multiple matches (default 0)"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_extract_text", + Description: "Extract text content from the page or specific elements. Use selector 'body' for all page text.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector (default 'body' for all page text)"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_screenshot", + Description: "Take a screenshot of the page or a specific element. Returns a file path to the image.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector for element to screenshot"}, + "full_page": {Type: "boolean", Description: "Capture full page (default false)"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_screenshot_and_view", + Description: "Take a screenshot and return the image for viewing. Use to visually verify page state.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector for element to screenshot"}, + "full_page": {Type: "boolean", Description: "Capture full page (default false)"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_wait_for_selector", + Description: "Wait for an element to appear on the page before proceeding.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"selector"}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector to wait for"}, + "timeout": {Type: "integer", Description: "Timeout in milliseconds (default 30000)"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_drag", + Description: "Drag the mouse from point (x1,y1) to (x2,y2).", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"x1", "y1", "x2", "y2"}, + Properties: map[string]models.ToolArgProps{ + "x1": {Type: "number", Description: "Starting X coordinate"}, + "y1": {Type: "number", Description: "Starting Y coordinate"}, + "x2": {Type: "number", Description: "Ending X coordinate"}, + "y2": {Type: "number", Description: "Ending Y coordinate"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_click_at", + Description: "Click at specific X,Y coordinates on the page.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"x", "y"}, + Properties: map[string]models.ToolArgProps{ + "x": {Type: "number", Description: "X coordinate"}, + "y": {Type: "number", Description: "Y coordinate"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_get_html", + Description: "Get the HTML content of the page or a specific element.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector (default 'body')"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_get_dom", + Description: "Get a structured DOM representation with tag, attributes, text, and children.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{ + "selector": {Type: "string", Description: "CSS selector (default 'body')"}, + }, + }, + }, + }, + { + Type: "function", + Function: models.ToolFunc{ + Name: "pw_search_elements", + Description: "Search for elements by text content or CSS selector.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{}, + Properties: map[string]models.ToolArgProps{ + "text": {Type: "string", Description: "Text content to search for"}, + "selector": {Type: "string", Description: "CSS selector to search for"}, + }, + }, + }, + }, +} + +var toolCallRE = regexp.MustCompile(`__tool_call__(.+?)__tool_call__`) + +type ParsedToolCall struct { + ID string + Name string + Args map[string]string +} + +func findToolCall(resp []byte) (func() []byte, string, bool) { + var genericResp map[string]interface{} + if err := json.Unmarshal(resp, &genericResp); err != nil { + return findToolCallFromText(string(resp)) + } + + if choices, ok := genericResp["choices"].([]interface{}); ok && len(choices) > 0 { + if firstChoice, ok := choices[0].(map[string]interface{}); ok { + if message, ok := firstChoice["message"].(map[string]interface{}); ok { + if toolCalls, ok := message["tool_calls"].([]interface{}); ok && len(toolCalls) > 0 { + return parseOpenAIToolCall(toolCalls) + } + if content, ok := message["content"].(string); ok { + return findToolCallFromText(content) + } + } + if text, ok := firstChoice["text"].(string); ok { + return findToolCallFromText(text) + } + } + } + + if content, ok := genericResp["content"].(string); ok { + return findToolCallFromText(content) + } + + return findToolCallFromText(string(resp)) +} + +func parseOpenAIToolCall(toolCalls []interface{}) (func() []byte, string, bool) { + if len(toolCalls) == 0 { + return nil, "", false + } + + tc := toolCalls[0].(map[string]interface{}) + id, _ := tc["id"].(string) + function, _ := tc["function"].(map[string]interface{}) + name, _ := function["name"].(string) + argsStr, _ := function["arguments"].(string) + + var args map[string]string + if err := json.Unmarshal([]byte(argsStr), &args); err != nil { + return func() []byte { + return []byte(fmt.Sprintf(`{"error": "failed to parse arguments: %v"}`, err)) + }, id, true + } + + return func() []byte { + fn, ok := pwToolMap[name] + if !ok { + return []byte(fmt.Sprintf(`{"error": "tool %s not found"}`, name)) + } + return fn(args) + }, id, true +} + +func findToolCallFromText(text string) (func() []byte, string, bool) { + jsStr := toolCallRE.FindString(text) + if jsStr == "" { + return nil, "", false + } + + jsStr = strings.TrimSpace(jsStr) + jsStr = strings.TrimPrefix(jsStr, "__tool_call__") + jsStr = strings.TrimSuffix(jsStr, "__tool_call__") + jsStr = strings.TrimSpace(jsStr) + + start := strings.Index(jsStr, "{") + end := strings.LastIndex(jsStr, "}") + if start == -1 || end == -1 || end <= start { + return func() []byte { + return []byte(`{"error": "no valid JSON found in tool call"}`) + }, "", true + } + + jsStr = jsStr[start : end+1] + + var fc models.FuncCall + if err := json.Unmarshal([]byte(jsStr), &fc); err != nil { + return func() []byte { + return []byte(fmt.Sprintf(`{"error": "failed to parse tool call: %v}`, err)) + }, "", true + } + + if fc.ID == "" { + fc.ID = "call_" + generateToolCallID() + } + + return func() []byte { + fn, ok := pwToolMap[fc.Name] + if !ok { + return []byte(fmt.Sprintf(`{"error": "tool %s not found"}`, fc.Name)) + } + return fn(fc.Args) + }, fc.ID, true +} + +func generateToolCallID() string { + return fmt.Sprintf("%d", len(pwToolMap)%10000) +} diff --git a/agent/request.go b/agent/request.go index f42b06e..754f16e 100644 --- a/agent/request.go +++ b/agent/request.go @@ -30,12 +30,16 @@ func detectAPI(api string) (isCompletion, isChat, isDeepSeek, isOpenRouter bool) } type AgentClient struct { - cfg *config.Config - getToken func() string - log slog.Logger + cfg *config.Config + getToken func() string + log *slog.Logger + chatBody *models.ChatBody + sysprompt string + lastToolCallID string + tools []models.Tool } -func NewAgentClient(cfg *config.Config, log slog.Logger, gt func() string) *AgentClient { +func NewAgentClient(cfg *config.Config, log *slog.Logger, gt func() string) *AgentClient { return &AgentClient{ cfg: cfg, getToken: gt, @@ -44,93 +48,99 @@ func NewAgentClient(cfg *config.Config, log slog.Logger, gt func() string) *Agen } func (ag *AgentClient) Log() *slog.Logger { - return &ag.log + return ag.log } -func (ag *AgentClient) FormMsg(sysprompt, msg string) (io.Reader, error) { - b, err := ag.buildRequest(sysprompt, msg) +func (ag *AgentClient) FormFirstMsg(sysprompt, msg string) (io.Reader, error) { + ag.sysprompt = sysprompt + ag.chatBody = &models.ChatBody{ + Messages: []models.RoleMsg{ + {Role: "system", Content: ag.sysprompt}, + {Role: "user", Content: msg}, + }, + Stream: false, + Model: ag.cfg.CurrentModel, + } + b, err := ag.buildRequest() if err != nil { return nil, err } return bytes.NewReader(b), nil } -// buildRequest creates the appropriate LLM request based on the current API endpoint. -func (ag *AgentClient) buildRequest(sysprompt, msg string) ([]byte, error) { - api := ag.cfg.CurrentAPI - model := ag.cfg.CurrentModel - messages := []models.RoleMsg{ - {Role: "system", Content: sysprompt}, - {Role: "user", Content: msg}, +func (ag *AgentClient) FormMsg(msg string) (io.Reader, error) { + m := models.RoleMsg{ + Role: "tool", Content: msg, } + ag.chatBody.Messages = append(ag.chatBody.Messages, m) + b, err := ag.buildRequest() + if err != nil { + return nil, err + } + return bytes.NewReader(b), nil +} - // Determine API type - isCompletion, isChat, isDeepSeek, isOpenRouter := detectAPI(api) - ag.log.Debug("agent building request", "api", api, "isCompletion", isCompletion, "isChat", isChat, "isDeepSeek", isDeepSeek, "isOpenRouter", isOpenRouter) +func (ag *AgentClient) FormMsgWithToolCallID(msg, toolCallID string) (io.Reader, error) { + m := models.RoleMsg{ + Role: "tool", + Content: msg, + ToolCallID: toolCallID, + } + ag.chatBody.Messages = append(ag.chatBody.Messages, m) + b, err := ag.buildRequest() + if err != nil { + return nil, err + } + return bytes.NewReader(b), nil +} +// buildRequest creates the appropriate LLM request based on the current API endpoint. +func (ag *AgentClient) buildRequest() ([]byte, error) { + isCompletion, isChat, isDeepSeek, isOpenRouter := detectAPI(ag.cfg.CurrentAPI) + ag.log.Debug("agent building request", "api", ag.cfg.CurrentAPI, "isCompletion", isCompletion, "isChat", isChat, "isDeepSeek", isDeepSeek, "isOpenRouter", isOpenRouter) // Build prompt for completion endpoints if isCompletion { var sb strings.Builder - for i := range messages { - sb.WriteString(messages[i].ToPrompt()) + for i := range ag.chatBody.Messages { + sb.WriteString(ag.chatBody.Messages[i].ToPrompt()) sb.WriteString("\n") } prompt := strings.TrimSpace(sb.String()) - switch { case isDeepSeek: // DeepSeek completion - req := models.NewDSCompletionReq(prompt, model, defaultProps["temperature"], []string{}) + req := models.NewDSCompletionReq(prompt, ag.chatBody.Model, defaultProps["temperature"], []string{}) req.Stream = false // Agents don't need streaming return json.Marshal(req) case isOpenRouter: // OpenRouter completion - req := models.NewOpenRouterCompletionReq(model, prompt, defaultProps, []string{}) + req := models.NewOpenRouterCompletionReq(ag.chatBody.Model, prompt, defaultProps, []string{}) req.Stream = false // Agents don't need streaming return json.Marshal(req) default: // Assume llama.cpp completion - req := models.NewLCPReq(prompt, model, nil, defaultProps, []string{}) + req := models.NewLCPReq(prompt, ag.chatBody.Model, nil, defaultProps, []string{}) req.Stream = false // Agents don't need streaming return json.Marshal(req) } } - - // Chat completions endpoints - if isChat || !isCompletion { - chatBody := &models.ChatBody{ - Model: model, - Stream: false, // Agents don't need streaming - Messages: messages, - } - - switch { - case isDeepSeek: - // DeepSeek chat - req := models.NewDSChatReq(*chatBody) - return json.Marshal(req) - case isOpenRouter: - // OpenRouter chat - agents don't use reasoning by default - req := models.NewOpenRouterChatReq(*chatBody, defaultProps, "") - return json.Marshal(req) - default: - // Assume llama.cpp chat (OpenAI format) - req := models.OpenAIReq{ - ChatBody: chatBody, - Tools: nil, - } - return json.Marshal(req) + switch { + case isDeepSeek: + // DeepSeek chat + req := models.NewDSChatReq(*ag.chatBody) + return json.Marshal(req) + case isOpenRouter: + // OpenRouter chat - agents don't use reasoning by default + req := models.NewOpenRouterChatReq(*ag.chatBody, defaultProps, ag.cfg.ReasoningEffort) + return json.Marshal(req) + default: + // Assume llama.cpp chat (OpenAI format) + req := models.OpenAIReq{ + ChatBody: ag.chatBody, + Tools: ag.tools, } + return json.Marshal(req) } - - // Fallback (should not reach here) - ag.log.Warn("unknown API, using default chat completions format", "api", api) - chatBody := &models.ChatBody{ - Model: model, - Stream: false, // Agents don't need streaming - Messages: messages, - } - return json.Marshal(chatBody) } func (ag *AgentClient) LLMRequest(body io.Reader) ([]byte, error) { @@ -165,7 +175,6 @@ func (ag *AgentClient) LLMRequest(body io.Reader) ([]byte, error) { ag.log.Error("agent LLM request failed", "status", resp.StatusCode, "response", string(responseBytes[:min(len(responseBytes), 1000)])) return responseBytes, fmt.Errorf("HTTP %d: %s", resp.StatusCode, string(responseBytes[:min(len(responseBytes), 200)])) } - // Parse response and extract text content text, err := extractTextFromResponse(responseBytes) if err != nil { @@ -179,17 +188,16 @@ func (ag *AgentClient) LLMRequest(body io.Reader) ([]byte, error) { // extractTextFromResponse parses common LLM response formats and extracts the text content. func extractTextFromResponse(data []byte) (string, error) { // Try to parse as generic JSON first - var genericResp map[string]interface{} + var genericResp map[string]any if err := json.Unmarshal(data, &genericResp); err != nil { // Not JSON, return as string return string(data), nil } - // Check for OpenAI chat completion format - if choices, ok := genericResp["choices"].([]interface{}); ok && len(choices) > 0 { - if firstChoice, ok := choices[0].(map[string]interface{}); ok { + if choices, ok := genericResp["choices"].([]any); ok && len(choices) > 0 { + if firstChoice, ok := choices[0].(map[string]any); ok { // Chat completion: choices[0].message.content - if message, ok := firstChoice["message"].(map[string]interface{}); ok { + if message, ok := firstChoice["message"].(map[string]any); ok { if content, ok := message["content"].(string); ok { return content, nil } @@ -199,19 +207,17 @@ func extractTextFromResponse(data []byte) (string, error) { return text, nil } // Delta format for streaming (should not happen with stream: false) - if delta, ok := firstChoice["delta"].(map[string]interface{}); ok { + if delta, ok := firstChoice["delta"].(map[string]any); ok { if content, ok := delta["content"].(string); ok { return content, nil } } } } - // Check for llama.cpp completion format if content, ok := genericResp["content"].(string); ok { return content, nil } - // Unknown format, return pretty-printed JSON prettyJSON, err := json.MarshalIndent(genericResp, "", " ") if err != nil { @@ -219,10 +225,3 @@ func extractTextFromResponse(data []byte) (string, error) { } return string(prettyJSON), nil } - -func min(a, b int) int { - if a < b { - return a - } - return b -} diff --git a/agent/webagent.go b/agent/webagent.go index ff6cd86..e8ca3a2 100644 --- a/agent/webagent.go +++ b/agent/webagent.go @@ -17,7 +17,8 @@ func NewWebAgentB(client *AgentClient, sysprompt string) *WebAgentB { // Process applies the formatting function to raw output func (a *WebAgentB) Process(args map[string]string, rawOutput []byte) []byte { - msg, err := a.FormMsg(a.sysprompt, + msg, err := a.FormFirstMsg( + a.sysprompt, fmt.Sprintf("request:\n%+v\ntool response:\n%v", args, string(rawOutput))) if err != nil { a.Log().Error("failed to process the request", "error", err) diff --git a/config.example.toml b/config.example.toml index 665fed6..8e45734 100644 --- a/config.example.toml +++ b/config.example.toml @@ -61,4 +61,4 @@ StripThinkingFromAPI = true # Strip <think> blocks from messages before sending ReasoningEffort = "medium" # playwright tools PlaywrightEnabled = false -PlaywrightDebug = false +PlaywrightDebug = false # when true opens in gui mode (headless=false) @@ -278,25 +278,13 @@ func updateToolCapabilities() { // getWebAgentClient returns a singleton AgentClient for web agents. func getWebAgentClient() *agent.AgentClient { webAgentClientOnce.Do(func() { - if cfg == nil { - if logger != nil { - logger.Warn("web agent client unavailable: config not initialized") - } - return - } - if logger == nil { - if logger != nil { - logger.Warn("web agent client unavailable: logger not initialized") - } - return - } getToken := func() string { if chunkParser == nil { return "" } return chunkParser.GetToken() } - webAgentClient = agent.NewAgentClient(cfg, *logger, getToken) + webAgentClient = agent.NewAgentClient(cfg, logger, getToken) }) return webAgentClient } @@ -306,13 +294,13 @@ func registerWebAgents() { webAgentsOnce.Do(func() { client := getWebAgentClient() // Register rag_search agent - agent.Register("rag_search", agent.NewWebAgentB(client, ragSearchSysPrompt)) + agent.RegisterB("rag_search", agent.NewWebAgentB(client, ragSearchSysPrompt)) // Register websearch agent - agent.Register("websearch", agent.NewWebAgentB(client, webSearchSysPrompt)) + agent.RegisterB("websearch", agent.NewWebAgentB(client, webSearchSysPrompt)) // Register read_url agent - agent.Register("read_url", agent.NewWebAgentB(client, readURLSysPrompt)) + agent.RegisterB("read_url", agent.NewWebAgentB(client, readURLSysPrompt)) // Register summarize_chat agent - agent.Register("summarize_chat", agent.NewWebAgentB(client, summarySysPrompt)) + agent.RegisterB("summarize_chat", agent.NewWebAgentB(client, summarySysPrompt)) }) } @@ -1503,6 +1491,48 @@ func registerWindowTools() { } } +var browserAgentSysPrompt = `You are an autonomous browser automation agent. Your goal is to complete the user's task by intelligently using browser automation tools. + +Important: The browser may already be running from a previous task! Always check pw_is_running first before starting a new browser. + +Available tools: +- pw_start: Start browser (only if not already running) +- pw_stop: Stop browser (only when you're truly done and browser is no longer needed) +- pw_is_running: Check if browser is running +- pw_navigate: Go to a URL +- pw_click: Click an element by CSS selector +- pw_fill: Type text into an input +- pw_extract_text: Get text from page/element +- pw_screenshot: Take a screenshot (returns file path) +- pw_screenshot_and_view: Take screenshot with image for viewing +- pw_wait_for_selector: Wait for element to appear +- pw_drag: Drag mouse from one point to another +- pw_click_at: Click at X,Y coordinates +- pw_get_html: Get HTML content +- pw_get_dom: Get structured DOM tree +- pw_search_elements: Search for elements by text or selector + +Workflow: +1. First, check if browser is already running (pw_is_running) +2. Only start browser if not already running (pw_start) +3. Navigate to required pages (pw_navigate) +4. Interact with elements as needed (click, fill, etc.) +5. Extract information or take screenshots as requested +6. IMPORTANT: Do NOT stop the browser when done! Leave it running so the user can continue interacting with the page in subsequent requests. + +Always provide clear feedback about what you're doing and what you found.` + +func runBrowserAgent(args map[string]string) []byte { + task, ok := args["task"] + if !ok || task == "" { + return []byte(`{"error": "task argument is required"}`) + } + client := getWebAgentClient() + pwAgent := agent.NewPWAgent(client, browserAgentSysPrompt) + pwAgent.SetTools(agent.GetPWTools()) + return pwAgent.ProcessTask(task) +} + func registerPlaywrightTools() { removePlaywrightToolsFromBaseTools() if cfg != nil && cfg.PlaywrightEnabled { @@ -1788,6 +1818,39 @@ func registerPlaywrightTools() { } baseTools = append(baseTools, playwrightTools...) toolSysMsg += browserToolSysMsg + agent.RegisterPWTool("pw_start", pwStart) + agent.RegisterPWTool("pw_stop", pwStop) + agent.RegisterPWTool("pw_is_running", pwIsRunning) + agent.RegisterPWTool("pw_navigate", pwNavigate) + agent.RegisterPWTool("pw_click", pwClick) + agent.RegisterPWTool("pw_click_at", pwClickAt) + agent.RegisterPWTool("pw_fill", pwFill) + agent.RegisterPWTool("pw_extract_text", pwExtractText) + agent.RegisterPWTool("pw_screenshot", pwScreenshot) + agent.RegisterPWTool("pw_screenshot_and_view", pwScreenshotAndView) + agent.RegisterPWTool("pw_wait_for_selector", pwWaitForSelector) + agent.RegisterPWTool("pw_drag", pwDrag) + agent.RegisterPWTool("pw_get_html", pwGetHTML) + agent.RegisterPWTool("pw_get_dom", pwGetDOM) + agent.RegisterPWTool("pw_search_elements", pwSearchElements) + browserAgentTool := []models.Tool{ + { + Type: "function", + Function: models.ToolFunc{ + Name: "browser_agent", + Description: "Autonomous browser automation agent. Use for complex multi-step browser tasks like 'go to website, login, and take screenshot'. The agent will plan and execute steps automatically using browser tools.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"task"}, + Properties: map[string]models.ToolArgProps{ + "task": {Type: "string", Description: "The task to accomplish, e.g., 'go to github.com and take a screenshot of the homepage'"}, + }, + }, + }, + }, + } + baseTools = append(baseTools, browserAgentTool...) + fnMap["browser_agent"] = runBrowserAgent } } |
