summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGrail Finder <wohilas@gmail.com>2026-03-14 12:24:11 +0300
committerGrail Finder <wohilas@gmail.com>2026-03-14 12:24:11 +0300
commitf4fcb8557005a5358b87109d945acd3d5a6a0517 (patch)
treea76553a4cdcce647f55d8979812a32f9d7922671
parent92acfb7ed4cda8e93909eadb54863012eac45128 (diff)
Feat: run browser
-rw-r--r--tools.go212
-rw-r--r--tools_playwright.go77
2 files changed, 285 insertions, 4 deletions
diff --git a/tools.go b/tools.go
index fa9c8b5..494dc24 100644
--- a/tools.go
+++ b/tools.go
@@ -205,8 +205,7 @@ func updateToolCapabilities() {
if cfg == nil || cfg.CurrentAPI == "" {
logger.Warn("cannot determine model capabilities: cfg or CurrentAPI is nil")
registerWindowTools()
- fnMap["browser_agent"] = runBrowserAgent
- // registerPlaywrightTools()
+ // fnMap["browser_agent"] = runBrowserAgent
return
}
prevHasVision := modelHasVision
@@ -220,8 +219,7 @@ func updateToolCapabilities() {
}
}
registerWindowTools()
- fnMap["browser_agent"] = runBrowserAgent
- // registerPlaywrightTools()
+ // fnMap["browser_agent"] = runBrowserAgent
}
// getWebAgentClient returns a singleton AgentClient for web agents.
@@ -511,6 +509,18 @@ func runCmd(args map[string]string) []byte {
case "todo":
// todo create|read|update|delete - route to existing todo handlers
return []byte(handleTodoSubcommand(rest, args))
+ case "window", "windows":
+ // window list - list all windows
+ return listWindows(args)
+ case "capture", "screenshot":
+ // capture <window-name> - capture a window
+ return captureWindow(args)
+ case "capture_and_view", "screenshot_and_view":
+ // capture and view screenshot
+ return captureWindowAndView(args)
+ case "browser":
+ // browser <action> [args...] - Playwright browser automation
+ return runBrowserCommand(rest, args)
default:
// Everything else: shell with pipe/chaining support
result := tools.ExecChain(commandStr)
@@ -518,6 +528,136 @@ func runCmd(args map[string]string) []byte {
}
}
+// runBrowserCommand routes browser subcommands to Playwright handlers
+func runBrowserCommand(args []string, originalArgs map[string]string) []byte {
+ if len(args) == 0 {
+ return []byte(`usage: browser <action> [args...]
+Actions:
+ start - start browser
+ stop - stop browser
+ running - check if browser is running
+ go <url> - navigate to URL
+ click <selector> - click element
+ fill <selector> <text> - fill input
+ text [selector] - extract text
+ html [selector] - get HTML
+ dom - get DOM
+ screenshot [path] - take screenshot
+ screenshot_and_view - take and view screenshot
+ wait <selector> - wait for element
+ drag <from> <to> - drag element`)
+ }
+
+ action := args[0]
+ rest := args[1:]
+
+ switch action {
+ case "start":
+ return pwStart(originalArgs)
+ case "stop":
+ return pwStop(originalArgs)
+ case "running":
+ return pwIsRunning(originalArgs)
+ case "go", "navigate", "open":
+ // browser go <url>
+ url := ""
+ if len(rest) > 0 {
+ url = rest[0]
+ }
+ if url == "" {
+ return []byte("usage: browser go <url>")
+ }
+ return pwNavigate(map[string]string{"url": url})
+ case "click":
+ // browser click <selector> [index]
+ selector := ""
+ index := "0"
+ if len(rest) > 0 {
+ selector = rest[0]
+ }
+ if len(rest) > 1 {
+ index = rest[1]
+ }
+ if selector == "" {
+ return []byte("usage: browser click <selector> [index]")
+ }
+ return pwClick(map[string]string{"selector": selector, "index": index})
+ case "fill":
+ // browser fill <selector> <text>
+ if len(rest) < 2 {
+ return []byte("usage: browser fill <selector> <text>")
+ }
+ return pwFill(map[string]string{"selector": rest[0], "text": strings.Join(rest[1:], " ")})
+ case "text":
+ // browser text [selector]
+ selector := ""
+ if len(rest) > 0 {
+ selector = rest[0]
+ }
+ return pwExtractText(map[string]string{"selector": selector})
+ case "html":
+ // browser html [selector]
+ selector := ""
+ if len(rest) > 0 {
+ selector = rest[0]
+ }
+ return pwGetHTML(map[string]string{"selector": selector})
+ case "dom":
+ return pwGetDOM(originalArgs)
+ case "screenshot":
+ // browser screenshot [path]
+ path := ""
+ if len(rest) > 0 {
+ path = rest[0]
+ }
+ return pwScreenshot(map[string]string{"path": path})
+ case "screenshot_and_view":
+ // browser screenshot_and_view [path]
+ path := ""
+ if len(rest) > 0 {
+ path = rest[0]
+ }
+ return pwScreenshotAndView(map[string]string{"path": path})
+ case "wait":
+ // browser wait <selector>
+ selector := ""
+ if len(rest) > 0 {
+ selector = rest[0]
+ }
+ if selector == "" {
+ return []byte("usage: browser wait <selector>")
+ }
+ return pwWaitForSelector(map[string]string{"selector": selector})
+ case "drag":
+ // browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>
+ if len(rest) < 4 && len(rest) < 2 {
+ return []byte("usage: browser drag <x1> <y1> <x2> <y2> OR browser drag <from_selector> <to_selector>")
+ }
+ // Check if first arg is a number (coordinates) or selector
+ _, err := strconv.Atoi(rest[0])
+ _, err2 := strconv.ParseFloat(rest[0], 64)
+ if err == nil || err2 == nil {
+ // Coordinates: browser drag 100 200 300 400
+ if len(rest) < 4 {
+ return []byte("usage: browser drag <x1> <y1> <x2> <y2>")
+ }
+ return pwDrag(map[string]string{
+ "x1": rest[0], "y1": rest[1],
+ "x2": rest[2], "y2": rest[3],
+ })
+ }
+ // Selectors: browser drag #item #container
+ // pwDrag needs coordinates, so we need to get element positions first
+ // This requires a different approach - use JavaScript to get centers
+ return pwDragBySelector(map[string]string{
+ "fromSelector": rest[0],
+ "toSelector": rest[1],
+ })
+ default:
+ return []byte(fmt.Sprintf("unknown browser action: %s", action))
+ }
+}
+
// getHelp returns help text for commands
func getHelp(args []string) string {
if len(args) == 0 {
@@ -567,6 +707,25 @@ func getHelp(args []string) string {
todo update <id> <status> - update todo (pending/in_progress/completed)
todo delete <id> - delete a todo
+ # Window (requires xdotool + maim)
+ window - list available windows
+ capture <name> - capture a window screenshot
+ capture_and_view <name> - capture and view screenshot
+
+ # Browser (requires Playwright)
+ browser start - start browser
+ browser stop - stop browser
+ browser running - check if running
+ browser go <url> - navigate to URL
+ browser click <sel> - click element
+ browser fill <sel> <txt> - fill input
+ browser text [sel] - extract text
+ browser html [sel] - get HTML
+ browser screenshot - take screenshot
+ browser wait <sel> - wait for element
+ browser drag <x1> <y1> <x2> <y2> - drag by coordinates
+ browser drag <sel1> <sel2> - drag by selectors (center points)
+
# System
<any shell command> - run shell command directly
@@ -675,6 +834,51 @@ Use: run "command" to execute.`
run "go test ./..."
run "go mod tidy"
run "go get github.com/package"`
+ case "window", "windows":
+ return `window
+ List available windows.
+ Requires: xdotool and maim
+ Example:
+ run "window"`
+ case "capture", "screenshot":
+ return `capture <window-name-or-id>
+ Capture a screenshot of a window.
+ Requires: xdotool and maim
+ Examples:
+ run "capture Firefox"
+ run "capture 0x12345678"
+ run "capture_and_view Firefox"`
+ case "capture_and_view":
+ return `capture_and_view <window-name-or-id>
+ Capture a window and return for viewing.
+ Requires: xdotool and maim
+ Examples:
+ run "capture_and_view Firefox"`
+ case "browser":
+ return `browser <action> [args]
+ Playwright browser automation.
+ Requires: Playwright browser server running
+ Actions:
+ start - start browser
+ stop - stop browser
+ running - check if browser is running
+ go <url> - navigate to URL
+ click <selector> - click element (use index for multiple: click #btn 1)
+ fill <selector> <text> - fill input field
+ text [selector] - extract text (from element or whole page)
+ html [selector] - get HTML (from element or whole page)
+ screenshot [path] - take screenshot
+ wait <selector> - wait for element to appear
+ drag <from> <to> - drag element to another element
+ Examples:
+ run "browser start"
+ run "browser go https://example.com"
+ run "browser click #submit-button"
+ run "browser fill #search-input hello"
+ run "browser text"
+ run "browser screenshot"
+ run "browser drag 100 200 300 400"
+ run "browser drag #item1 #container2"`
default:
return fmt.Sprintf("No help available for: %s. Use: run \"help\" for all commands.", cmd)
}
diff --git a/tools_playwright.go b/tools_playwright.go
index 3555469..786b170 100644
--- a/tools_playwright.go
+++ b/tools_playwright.go
@@ -455,6 +455,83 @@ func pwDrag(args map[string]string) []byte {
return []byte(fmt.Sprintf(`{"success": true, "message": "Dragged from (%s,%s) to (%s,%s)"}`, x1, y1, x2, y2))
}
+func pwDragBySelector(args map[string]string) []byte {
+ fromSelector, ok := args["fromSelector"]
+ if !ok || fromSelector == "" {
+ return []byte(`{"error": "fromSelector not provided"}`)
+ }
+ toSelector, ok := args["toSelector"]
+ if !ok || toSelector == "" {
+ return []byte(`{"error": "toSelector not provided"}`)
+ }
+ if !browserStarted || page == nil {
+ return []byte(`{"error": "Browser not started. Call pw_start first."}`)
+ }
+
+ // Get center coordinates of both elements using JavaScript
+ fromJS := fmt.Sprintf(`
+ function getCenter(selector) {
+ const el = document.querySelector(selector);
+ if (!el) return null;
+ const rect = el.getBoundingClientRect();
+ return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+ }
+ getCenter(%q)
+ `, fromSelector)
+ toJS := fmt.Sprintf(`
+ function getCenter(selector) {
+ const el = document.querySelector(selector);
+ if (!el) return null;
+ const rect = el.getBoundingClientRect();
+ return { x: rect.left + rect.width / 2, y: rect.top + rect.height / 2 };
+ }
+ getCenter(%q)
+ `, toSelector)
+
+ fromResult, err := page.Evaluate(fromJS)
+ if err != nil {
+ return []byte(fmt.Sprintf(`{"error": "failed to get from element: %s"}`, err.Error()))
+ }
+ fromMap, ok := fromResult.(map[string]interface{})
+ if !ok || fromMap == nil {
+ return []byte(fmt.Sprintf(`{"error": "from selector '%s' not found"}`, fromSelector))
+ }
+ fromX := fromMap["x"].(float64)
+ fromY := fromMap["y"].(float64)
+
+ toResult, err := page.Evaluate(toJS)
+ if err != nil {
+ return []byte(fmt.Sprintf(`{"error": "failed to get to element: %s"}`, err.Error()))
+ }
+ toMap, ok := toResult.(map[string]interface{})
+ if !ok || toMap == nil {
+ return []byte(fmt.Sprintf(`{"error": "to selector '%s' not found"}`, toSelector))
+ }
+ toX := toMap["x"].(float64)
+ toY := toMap["y"].(float64)
+
+ // Perform the drag using coordinates
+ mouse := page.Mouse()
+ err = mouse.Move(fromX, fromY)
+ if err != nil {
+ return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
+ }
+ err = mouse.Down()
+ if err != nil {
+ return []byte(fmt.Sprintf(`{"error": "failed to mouse down: %s"}`, err.Error()))
+ }
+ err = mouse.Move(toX, toY)
+ if err != nil {
+ return []byte(fmt.Sprintf(`{"error": "failed to move mouse: %s"}`, err.Error()))
+ }
+ err = mouse.Up()
+ if err != nil {
+ return []byte(fmt.Sprintf(`{"error": "failed to mouse up: %s"}`, err.Error()))
+ }
+ msg := fmt.Sprintf("Dragged from %s (%.0f,%.0f) to %s (%.0f,%.0f)", fromSelector, fromX, fromY, toSelector, toX, toY)
+ return []byte(fmt.Sprintf(`{"success": true, "message": "%s"}`, msg))
+}
+
func pwClickAt(args map[string]string) []byte {
x, ok := args["x"]
if !ok {