From f4fcb8557005a5358b87109d945acd3d5a6a0517 Mon Sep 17 00:00:00 2001 From: Grail Finder Date: Sat, 14 Mar 2026 12:24:11 +0300 Subject: Feat: run browser --- tools.go | 212 +++++++++++++++++++++++++++++++++++++++++++++++++++- tools_playwright.go | 77 +++++++++++++++++++ 2 files changed, 285 insertions(+), 4 deletions(-) diff --git a/tools.go b/tools.go index fa9c8b5..494dc24 100644 --- a/tools.go +++ b/tools.go @@ -205,8 +205,7 @@ func updateToolCapabilities() { if cfg == nil || cfg.CurrentAPI == "" { logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil") registerWindowTools() - fnMap["browser_agent"] = runBrowserAgent - // registerPlaywrightTools() + // fnMap["browser_agent"] = runBrowserAgent return } prevHasVision := modelHasVision @@ -220,8 +219,7 @@ func updateToolCapabilities() { } } registerWindowTools() - fnMap["browser_agent"] = runBrowserAgent - // registerPlaywrightTools() + // fnMap["browser_agent"] = runBrowserAgent } // getWebAgentClient returns a singleton AgentClient for web agents. @@ -511,6 +509,18 @@ func runCmd(args map[string]string) []byte { case "todo": // todo create|read|update|delete - route to existing todo handlers return []byte(handleTodoSubcommand(rest, args)) + case "window", "windows": + // window list - list all windows + return listWindows(args) + case "capture", "screenshot": + // capture - capture a window + return captureWindow(args) + case "capture_and_view", "screenshot_and_view": + // capture and view screenshot + return captureWindowAndView(args) + case "browser": + // browser [args...] - Playwright browser automation + return runBrowserCommand(rest, args) default: // Everything else: shell with pipe/chaining support result := tools.ExecChain(commandStr) @@ -518,6 +528,136 @@ func runCmd(args map[string]string) []byte { } } +// runBrowserCommand routes browser subcommands to Playwright handlers +func runBrowserCommand(args []string, originalArgs map[string]string) []byte { + if len(args) == 0 { + return []byte(`usage: browser [args...] +Actions: + start - start browser + stop - stop browser + running - check if browser is running + go - navigate to URL + click - click element + fill - fill input + text [selector] - extract text + html [selector] - get HTML + dom - get DOM + screenshot [path] - take screenshot + screenshot_and_view - take and view screenshot + wait - wait for element + drag - drag element`) + } + + action := args[0] + rest := args[1:] + + switch action { + case "start": + return pwStart(originalArgs) + case "stop": + return pwStop(originalArgs) + case "running": + return pwIsRunning(originalArgs) + case "go", "navigate", "open": + // browser go + url := "" + if len(rest) > 0 { + url = rest[0] + } + if url == "" { + return []byte("usage: browser go ") + } + return pwNavigate(map[string]string{"url": url}) + case "click": + // browser click [index] + selector := "" + index := "0" + if len(rest) > 0 { + selector = rest[0] + } + if len(rest) > 1 { + index = rest[1] + } + if selector == "" { + return []byte("usage: browser click [index]") + } + return pwClick(map[string]string{"selector": selector, "index": index}) + case "fill": + // browser fill + if len(rest) < 2 { + return []byte("usage: browser fill ") + } + return pwFill(map[string]string{"selector": rest[0], "text": strings.Join(rest[1:], " ")}) + case "text": + // browser text [selector] + selector := "" + if len(rest) > 0 { + selector = rest[0] + } + return pwExtractText(map[string]string{"selector": selector}) + case "html": + // browser html [selector] + selector := "" + if len(rest) > 0 { + selector = rest[0] + } + return pwGetHTML(map[string]string{"selector": selector}) + case "dom": + return pwGetDOM(originalArgs) + case "screenshot": + // browser screenshot [path] + path := "" + if len(rest) > 0 { + path = rest[0] + } + return pwScreenshot(map[string]string{"path": path}) + case "screenshot_and_view": + // browser screenshot_and_view [path] + path := "" + if len(rest) > 0 { + path = rest[0] + } + return pwScreenshotAndView(map[string]string{"path": path}) + case "wait": + // browser wait + selector := "" + if len(rest) > 0 { + selector = rest[0] + } + if selector == "" { + return []byte("usage: browser wait ") + } + return pwWaitForSelector(map[string]string{"selector": selector}) + case "drag": + // browser drag OR browser drag + if len(rest) < 4 && len(rest) < 2 { + return []byte("usage: browser drag OR browser drag ") + } + // Check if first arg is a number (coordinates) or selector + _, err := strconv.Atoi(rest[0]) + _, err2 := strconv.ParseFloat(rest[0], 64) + if err == nil || err2 == nil { + // Coordinates: browser drag 100 200 300 400 + if len(rest) < 4 { + return []byte("usage: browser drag ") + } + return pwDrag(map[string]string{ + "x1": rest[0], "y1": rest[1], + "x2": rest[2], "y2": rest[3], + }) + } + // Selectors: browser drag #item #container + // pwDrag needs coordinates, so we need to get element positions first + // This requires a different approach - use JavaScript to get centers + return pwDragBySelector(map[string]string{ + "fromSelector": rest[0], + "toSelector": rest[1], + }) + default: + return []byte(fmt.Sprintf("unknown browser action: %s", action)) + } +} + // getHelp returns help text for commands func getHelp(args []string) string { if len(args) == 0 { @@ -567,6 +707,25 @@ func getHelp(args []string) string { todo update - update todo (pending/in_progress/completed) todo delete - delete a todo + # Window (requires xdotool + maim) + window - list available windows + capture - capture a window screenshot + capture_and_view - capture and view screenshot + + # Browser (requires Playwright) + browser start - start browser + browser stop - stop browser + browser running - check if running + browser go - navigate to URL + browser click - click element + browser fill - fill input + browser text [sel] - extract text + browser html [sel] - get HTML + browser screenshot - take screenshot + browser wait - wait for element + browser drag - drag by coordinates + browser drag - drag by selectors (center points) + # System - run shell command directly @@ -675,6 +834,51 @@ Use: run "command" to execute.` run "go test ./..." run "go mod tidy" run "go get github.com/package"` + case "window", "windows": + return `window + List available windows. + Requires: xdotool and maim + Example: + run "window"` + case "capture", "screenshot": + return `capture + Capture a screenshot of a window. + Requires: xdotool and maim + Examples: + run "capture Firefox" + run "capture 0x12345678" + run "capture_and_view Firefox"` + case "capture_and_view": + return `capture_and_view + Capture a window and return for viewing. + Requires: xdotool and maim + Examples: + run "capture_and_view Firefox"` + case "browser": + return `browser [args] + Playwright browser automation. + Requires: Playwright browser server running + Actions: + start - start browser + stop - stop browser + running - check if browser is running + go - navigate to URL + click - click element (use index for multiple: click #btn 1) + fill - fill input field + text [selector] - extract text (from element or whole page) + html [selector] - get HTML (from element or whole page) + screenshot [path] - take screenshot + wait - wait for element to appear + drag - drag element to another element + Examples: + run "browser start" + run "browser go https://example.com" + run "browser click #submit-button" + run "browser fill #search-input hello" + run "browser text" + run "browser screenshot" + run "browser drag 100 200 300 400" + run "browser drag #item1 #container2"` default: return fmt.Sprintf("No help available for: %s. Use: run \"help\" for all commands.", cmd) } diff --git a/tools_playwright.go b/tools_playwright.go index 3555469..786b170 100644 --- a/tools_playwright.go +++ b/tools_playwright.go @@ -455,6 +455,83 @@ func pwDrag(args map[string]string) []byte { return []byte(fmt.Sprintf(`{"success": true, "message": "Dragged from (%s,%s) to (%s,%s)"}`, x1, y1, x2, y2)) } +func pwDragBySelector(args map[string]string) []byte { + fromSelector, ok := args["fromSelector"] + if !ok || fromSelector == "" { + return []byte(`{"error": "fromSelector not provided"}`) + } + toSelector, ok := args["toSelector"] + if !ok || toSelector == "" { + return []byte(`{"error": "toSelector not provided"}`) + } + if !browserStarted || page == nil { + return []byte(`{"error": "Browser not started. Call pw_start first."}`) + } + + // Get center coordinates of both elements using JavaScript + fromJS := fmt.Sprintf(` + function getCenter(selector) { + const el = document.querySelector(selector); + if (!el) return null; + const rect = el.getBoundingClientRect(); + return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 }; + } + getCenter(%q) + `, fromSelector) + toJS := fmt.Sprintf(` + function getCenter(selector) { + const el = document.querySelector(selector); + if (!el) return null; + const rect = el.getBoundingClientRect(); + return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 }; + } + getCenter(%q) + `, toSelector) + + fromResult, err := page.Evaluate(fromJS) + if err != nil { + return []byte(fmt.Sprintf(`{"error": "failed to get from element: %s"}`, err.Error())) + } + fromMap, ok := fromResult.(map[string]interface{}) + if !ok || fromMap == nil { + return []byte(fmt.Sprintf(`{"error": "from selector '%s' not found"}`, fromSelector)) + } + fromX := fromMap["x"].(float64) + fromY := fromMap["y"].(float64) + + toResult, err := page.Evaluate(toJS) + if err != nil { + return []byte(fmt.Sprintf(`{"error": "failed to get to element: %s"}`, err.Error())) + } + toMap, ok := toResult.(map[string]interface{}) + if !ok || toMap == nil { + return []byte(fmt.Sprintf(`{"error": "to selector '%s' not found"}`, toSelector)) + } + toX := toMap["x"].(float64) + toY := toMap["y"].(float64) + + // Perform the drag using coordinates + mouse := page.Mouse() + err = mouse.Move(fromX, fromY) + if err != nil { + return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error())) + } + err = mouse.Down() + if err != nil { + return []byte(fmt.Sprintf(`{"error": "failed to mouse down: %s"}`, err.Error())) + } + err = mouse.Move(toX, toY) + if err != nil { + return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error())) + } + err = mouse.Up() + if err != nil { + return []byte(fmt.Sprintf(`{"error": "failed to mouse up: %s"}`, err.Error())) + } + msg := fmt.Sprintf("Dragged from %s (%.0f,%.0f) to %s (%.0f,%.0f)", fromSelector, fromX, fromY, toSelector, toX, toY) + return []byte(fmt.Sprintf(`{"success": true, "message": "%s"}`, msg)) +} + func pwClickAt(args map[string]string) []byte { x, ok := args["x"] if !ok { -- cgit v1.2.3