diff options
Diffstat (limited to 'tools.go')
| -rw-r--r-- | tools.go | 97 |
1 files changed, 80 insertions, 17 deletions
@@ -278,25 +278,13 @@ func updateToolCapabilities() { // getWebAgentClient returns a singleton AgentClient for web agents. func getWebAgentClient() *agent.AgentClient { webAgentClientOnce.Do(func() { - if cfg == nil { - if logger != nil { - logger.Warn("web agent client unavailable: config not initialized") - } - return - } - if logger == nil { - if logger != nil { - logger.Warn("web agent client unavailable: logger not initialized") - } - return - } getToken := func() string { if chunkParser == nil { return "" } return chunkParser.GetToken() } - webAgentClient = agent.NewAgentClient(cfg, *logger, getToken) + webAgentClient = agent.NewAgentClient(cfg, logger, getToken) }) return webAgentClient } @@ -306,13 +294,13 @@ func registerWebAgents() { webAgentsOnce.Do(func() { client := getWebAgentClient() // Register rag_search agent - agent.Register("rag_search", agent.NewWebAgentB(client, ragSearchSysPrompt)) + agent.RegisterB("rag_search", agent.NewWebAgentB(client, ragSearchSysPrompt)) // Register websearch agent - agent.Register("websearch", agent.NewWebAgentB(client, webSearchSysPrompt)) + agent.RegisterB("websearch", agent.NewWebAgentB(client, webSearchSysPrompt)) // Register read_url agent - agent.Register("read_url", agent.NewWebAgentB(client, readURLSysPrompt)) + agent.RegisterB("read_url", agent.NewWebAgentB(client, readURLSysPrompt)) // Register summarize_chat agent - agent.Register("summarize_chat", agent.NewWebAgentB(client, summarySysPrompt)) + agent.RegisterB("summarize_chat", agent.NewWebAgentB(client, summarySysPrompt)) }) } @@ -1503,6 +1491,48 @@ func registerWindowTools() { } } +var browserAgentSysPrompt = `You are an autonomous browser automation agent. Your goal is to complete the user's task by intelligently using browser automation tools. + +Important: The browser may already be running from a previous task! Always check pw_is_running first before starting a new browser. + +Available tools: +- pw_start: Start browser (only if not already running) +- pw_stop: Stop browser (only when you're truly done and browser is no longer needed) +- pw_is_running: Check if browser is running +- pw_navigate: Go to a URL +- pw_click: Click an element by CSS selector +- pw_fill: Type text into an input +- pw_extract_text: Get text from page/element +- pw_screenshot: Take a screenshot (returns file path) +- pw_screenshot_and_view: Take screenshot with image for viewing +- pw_wait_for_selector: Wait for element to appear +- pw_drag: Drag mouse from one point to another +- pw_click_at: Click at X,Y coordinates +- pw_get_html: Get HTML content +- pw_get_dom: Get structured DOM tree +- pw_search_elements: Search for elements by text or selector + +Workflow: +1. First, check if browser is already running (pw_is_running) +2. Only start browser if not already running (pw_start) +3. Navigate to required pages (pw_navigate) +4. Interact with elements as needed (click, fill, etc.) +5. Extract information or take screenshots as requested +6. IMPORTANT: Do NOT stop the browser when done! Leave it running so the user can continue interacting with the page in subsequent requests. + +Always provide clear feedback about what you're doing and what you found.` + +func runBrowserAgent(args map[string]string) []byte { + task, ok := args["task"] + if !ok || task == "" { + return []byte(`{"error": "task argument is required"}`) + } + client := getWebAgentClient() + pwAgent := agent.NewPWAgent(client, browserAgentSysPrompt) + pwAgent.SetTools(agent.GetPWTools()) + return pwAgent.ProcessTask(task) +} + func registerPlaywrightTools() { removePlaywrightToolsFromBaseTools() if cfg != nil && cfg.PlaywrightEnabled { @@ -1788,6 +1818,39 @@ func registerPlaywrightTools() { } baseTools = append(baseTools, playwrightTools...) toolSysMsg += browserToolSysMsg + agent.RegisterPWTool("pw_start", pwStart) + agent.RegisterPWTool("pw_stop", pwStop) + agent.RegisterPWTool("pw_is_running", pwIsRunning) + agent.RegisterPWTool("pw_navigate", pwNavigate) + agent.RegisterPWTool("pw_click", pwClick) + agent.RegisterPWTool("pw_click_at", pwClickAt) + agent.RegisterPWTool("pw_fill", pwFill) + agent.RegisterPWTool("pw_extract_text", pwExtractText) + agent.RegisterPWTool("pw_screenshot", pwScreenshot) + agent.RegisterPWTool("pw_screenshot_and_view", pwScreenshotAndView) + agent.RegisterPWTool("pw_wait_for_selector", pwWaitForSelector) + agent.RegisterPWTool("pw_drag", pwDrag) + agent.RegisterPWTool("pw_get_html", pwGetHTML) + agent.RegisterPWTool("pw_get_dom", pwGetDOM) + agent.RegisterPWTool("pw_search_elements", pwSearchElements) + browserAgentTool := []models.Tool{ + { + Type: "function", + Function: models.ToolFunc{ + Name: "browser_agent", + Description: "Autonomous browser automation agent. Use for complex multi-step browser tasks like 'go to website, login, and take screenshot'. The agent will plan and execute steps automatically using browser tools.", + Parameters: models.ToolFuncParams{ + Type: "object", + Required: []string{"task"}, + Properties: map[string]models.ToolArgProps{ + "task": {Type: "string", Description: "The task to accomplish, e.g., 'go to github.com and take a screenshot of the homepage'"}, + }, + }, + }, + }, + } + baseTools = append(baseTools, browserAgentTool...) + fnMap["browser_agent"] = runBrowserAgent } } |
