summary refs log tree commit diff
path: root/tools.go
diff options
context:
space:
mode:
Diffstat (limited to 'tools.go')
-rw-r--r-- tools.go 97
1 files changed, 80 insertions, 17 deletions
diff --git a/tools.go b/tools.go
index 41b0b9b..3ea3517 100644
--- a/tools.go
+++ b/tools.go
@@ -278,25 +278,13 @@ func updateToolCapabilities() {
// getWebAgentClient returns a singleton AgentClient for web agents.
+// The client is constructed inside webAgentClientOnce.Do (presumably a
+// sync.Once — confirm at its declaration) and stored in webAgentClient, which
+// is what every call returns.
func getWebAgentClient() *agent.AgentClient {
webAgentClientOnce.Do(func() {
- if cfg == nil {
- if logger != nil {
- logger.Warn("web agent client unavailable: config not initialized")
- }
- return
- }
- if logger == nil {
- if logger != nil {
- logger.Warn("web agent client unavailable: logger not initialized")
- }
- return
- }
+// getToken yields "" while chunkParser is nil, otherwise the parser's token.
getToken := func() string {
if chunkParser == nil {
return ""
}
return chunkParser.GetToken()
}
- webAgentClient = agent.NewAgentClient(cfg, *logger, getToken)
+// NOTE(review): cfg and logger are passed without nil checks and logger is no
+// longer dereferenced — confirm agent.NewAgentClient tolerates nil values.
+ webAgentClient = agent.NewAgentClient(cfg, logger, getToken)
})
return webAgentClient
}
@@ -306,13 +294,13 @@ func registerWebAgents() {
webAgentsOnce.Do(func() {
client := getWebAgentClient()
// Register rag_search agent
- agent.Register("rag_search", agent.NewWebAgentB(client, ragSearchSysPrompt))
+ agent.RegisterB("rag_search", agent.NewWebAgentB(client, ragSearchSysPrompt))
// Register websearch agent
- agent.Register("websearch", agent.NewWebAgentB(client, webSearchSysPrompt))
+ agent.RegisterB("websearch", agent.NewWebAgentB(client, webSearchSysPrompt))
// Register read_url agent
- agent.Register("read_url", agent.NewWebAgentB(client, readURLSysPrompt))
+ agent.RegisterB("read_url", agent.NewWebAgentB(client, readURLSysPrompt))
// Register summarize_chat agent
- agent.Register("summarize_chat", agent.NewWebAgentB(client, summarySysPrompt))
+ agent.RegisterB("summarize_chat", agent.NewWebAgentB(client, summarySysPrompt))
})
}
@@ -1503,6 +1491,48 @@ func registerWindowTools() {
}
}
+// browserAgentSysPrompt is the system prompt that runBrowserAgent passes to
+// agent.NewPWAgent. It lists the pw_* tools, lays out a suggested workflow,
+// and instructs the agent to leave the browser running when it finishes.
+var browserAgentSysPrompt = `You are an autonomous browser automation agent. Your goal is to complete the user's task by intelligently using browser automation tools.
+
+Important: The browser may already be running from a previous task! Always check pw_is_running first before starting a new browser.
+
+Available tools:
+- pw_start: Start browser (only if not already running)
+- pw_stop: Stop browser (only when you're truly done and browser is no longer needed)
+- pw_is_running: Check if browser is running
+- pw_navigate: Go to a URL
+- pw_click: Click an element by CSS selector
+- pw_fill: Type text into an input
+- pw_extract_text: Get text from page/element
+- pw_screenshot: Take a screenshot (returns file path)
+- pw_screenshot_and_view: Take screenshot with image for viewing
+- pw_wait_for_selector: Wait for element to appear
+- pw_drag: Drag mouse from one point to another
+- pw_click_at: Click at X,Y coordinates
+- pw_get_html: Get HTML content
+- pw_get_dom: Get structured DOM tree
+- pw_search_elements: Search for elements by text or selector
+
+Workflow:
+1. First, check if browser is already running (pw_is_running)
+2. Only start browser if not already running (pw_start)
+3. Navigate to required pages (pw_navigate)
+4. Interact with elements as needed (click, fill, etc.)
+5. Extract information or take screenshots as requested
+6. IMPORTANT: Do NOT stop the browser when done! Leave it running so the user can continue interacting with the page in subsequent requests.
+
+Always provide clear feedback about what you're doing and what you found.`
+
+// runBrowserAgent handles a browser-agent request. It reads the "task" entry
+// from args, builds a PW agent on the shared web agent client with
+// browserAgentSysPrompt, attaches the tools from agent.GetPWTools, and returns
+// the result of ProcessTask. When "task" is absent or empty it returns a JSON
+// error payload instead.
+func runBrowserAgent(args map[string]string) []byte {
+	// Indexing a missing key yields "", so a single emptiness check covers
+	// both the absent and the blank case.
+	task := args["task"]
+	if task == "" {
+		return []byte(`{"error": "task argument is required"}`)
+	}
+	browserAgent := agent.NewPWAgent(getWebAgentClient(), browserAgentSysPrompt)
+	browserAgent.SetTools(agent.GetPWTools())
+	return browserAgent.ProcessTask(task)
+}
+
func registerPlaywrightTools() {
removePlaywrightToolsFromBaseTools()
if cfg != nil && cfg.PlaywrightEnabled {
@@ -1788,6 +1818,39 @@ func registerPlaywrightTools() {
}
baseTools = append(baseTools, playwrightTools...)
toolSysMsg += browserToolSysMsg
+ agent.RegisterPWTool("pw_start", pwStart)
+ agent.RegisterPWTool("pw_stop", pwStop)
+ agent.RegisterPWTool("pw_is_running", pwIsRunning)
+ agent.RegisterPWTool("pw_navigate", pwNavigate)
+ agent.RegisterPWTool("pw_click", pwClick)
+ agent.RegisterPWTool("pw_click_at", pwClickAt)
+ agent.RegisterPWTool("pw_fill", pwFill)
+ agent.RegisterPWTool("pw_extract_text", pwExtractText)
+ agent.RegisterPWTool("pw_screenshot", pwScreenshot)
+ agent.RegisterPWTool("pw_screenshot_and_view", pwScreenshotAndView)
+ agent.RegisterPWTool("pw_wait_for_selector", pwWaitForSelector)
+ agent.RegisterPWTool("pw_drag", pwDrag)
+ agent.RegisterPWTool("pw_get_html", pwGetHTML)
+ agent.RegisterPWTool("pw_get_dom", pwGetDOM)
+ agent.RegisterPWTool("pw_search_elements", pwSearchElements)
+ browserAgentTool := []models.Tool{
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "browser_agent",
+ Description: "Autonomous browser automation agent. Use for complex multi-step browser tasks like 'go to website, login, and take screenshot'. The agent will plan and execute steps automatically using browser tools.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"task"},
+ Properties: map[string]models.ToolArgProps{
+ "task": {Type: "string", Description: "The task to accomplish, e.g., 'go to github.com and take a screenshot of the homepage'"},
+ },
+ },
+ },
+ },
+ }
+ baseTools = append(baseTools, browserAgentTool...)
+ fnMap["browser_agent"] = runBrowserAgent
}
}