summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGrail Finder <wohilas@gmail.com>2026-03-09 08:50:33 +0300
committerGrail Finder <wohilas@gmail.com>2026-03-09 08:50:33 +0300
commitc2c90f6d2b766bbba30c8ea8087f799a6c21f525 (patch)
tree75a60a98055b31e8341a83a3b6559e5810666e1d
parent94769225cfbcd4b0a30acab913915f45d6cb9f4b (diff)
Enha: pw agent
-rw-r--r--agent/pw_agent.go102
-rw-r--r--agent/pw_tools.go349
-rw-r--r--agent/request.go14
-rw-r--r--agent/webagent.go4
-rw-r--r--tools.go77
5 files changed, 536 insertions, 10 deletions
diff --git a/agent/pw_agent.go b/agent/pw_agent.go
index 8c1c2bf..2807331 100644
--- a/agent/pw_agent.go
+++ b/agent/pw_agent.go
@@ -1,5 +1,11 @@
package agent
+import (
+ "encoding/json"
+ "gf-lt/models"
+ "strings"
+)
+
// PWAgent: is AgenterA type agent (enclosed with tool chaining)
// sysprompt explain tools and how to plan for execution
type PWAgent struct {
@@ -7,11 +13,16 @@ type PWAgent struct {
sysprompt string
}
-// NewWebAgentB creates a WebAgentB that uses the given formatting function
+// NewPWAgent creates a PWAgent with the given client and system prompt
func NewPWAgent(client *AgentClient, sysprompt string) *PWAgent {
return &PWAgent{AgentClient: client, sysprompt: sysprompt}
}
+// SetTools sets the tools available to the agent
+func (a *PWAgent) SetTools(tools []models.Tool) {
+ a.tools = tools
+}
+
func (a *PWAgent) ProcessTask(task string) []byte {
req, err := a.FormFirstMsg(a.sysprompt, task)
if err != nil {
@@ -25,16 +36,91 @@ func (a *PWAgent) ProcessTask(task string) []byte {
a.Log().Error("failed to process the request", "error", err)
return []byte("failed to process the request; err: " + err.Error())
}
- toolCall, hasToolCall := findToolCall(resp)
+ execTool, toolCallID, hasToolCall := findToolCall(resp)
if !hasToolCall {
return resp
}
- // check resp for tool calls
- // make tool call
- // add tool call resp to body
- // send new request too lmm
- tooResp := toolCall(resp)
- req, err = a.FormMsg(toolResp)
+
+ a.setToolCallOnLastMessage(resp, toolCallID)
+
+ toolResp := string(execTool())
+ req, err = a.FormMsgWithToolCallID(toolResp, toolCallID)
+ if err != nil {
+ a.Log().Error("failed to form next message", "error", err)
+ return []byte("failed to form next message; err: " + err.Error())
+ }
}
return nil
}
+
+func (a *PWAgent) setToolCallOnLastMessage(resp []byte, toolCallID string) {
+ if toolCallID == "" {
+ return
+ }
+
+ var genericResp map[string]interface{}
+ if err := json.Unmarshal(resp, &genericResp); err != nil {
+ return
+ }
+
+ var name string
+ var args map[string]string
+
+ if choices, ok := genericResp["choices"].([]interface{}); ok && len(choices) > 0 {
+ if firstChoice, ok := choices[0].(map[string]interface{}); ok {
+ if message, ok := firstChoice["message"].(map[string]interface{}); ok {
+ if toolCalls, ok := message["tool_calls"].([]interface{}); ok && len(toolCalls) > 0 {
+ if tc, ok := toolCalls[0].(map[string]interface{}); ok {
+ if fn, ok := tc["function"].(map[string]interface{}); ok {
+ name, _ = fn["name"].(string)
+ argsStr, _ := fn["arguments"].(string)
+ json.Unmarshal([]byte(argsStr), &args)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if name == "" {
+ content, _ := genericResp["content"].(string)
+ name = extractToolNameFromText(content)
+ }
+
+ lastIdx := len(a.chatBody.Messages) - 1
+ if lastIdx >= 0 {
+ a.chatBody.Messages[lastIdx].ToolCallID = toolCallID
+ if name != "" {
+ argsJSON, _ := json.Marshal(args)
+ a.chatBody.Messages[lastIdx].ToolCall = &models.ToolCall{
+ ID: toolCallID,
+ Name: name,
+ Args: string(argsJSON),
+ }
+ }
+ }
+}
+
+func extractToolNameFromText(text string) string {
+ jsStr := toolCallRE.FindString(text)
+ if jsStr == "" {
+ return ""
+ }
+ jsStr = strings.TrimSpace(jsStr)
+ jsStr = strings.TrimPrefix(jsStr, "__tool_call__")
+ jsStr = strings.TrimSuffix(jsStr, "__tool_call__")
+ jsStr = strings.TrimSpace(jsStr)
+
+ start := strings.Index(jsStr, "{")
+ end := strings.LastIndex(jsStr, "}")
+ if start == -1 || end == -1 || end <= start {
+ return ""
+ }
+ jsStr = jsStr[start : end+1]
+
+ var fc models.FuncCall
+ if err := json.Unmarshal([]byte(jsStr), &fc); err != nil {
+ return ""
+ }
+ return fc.Name
+}
diff --git a/agent/pw_tools.go b/agent/pw_tools.go
new file mode 100644
index 0000000..19fd130
--- /dev/null
+++ b/agent/pw_tools.go
@@ -0,0 +1,349 @@
+package agent
+
+import (
+ "encoding/json"
+ "fmt"
+ "regexp"
+ "strings"
+
+ "gf-lt/models"
+)
+
+type ToolFunc func(map[string]string) []byte
+
+var pwToolMap = make(map[string]ToolFunc)
+
+func RegisterPWTool(name string, fn ToolFunc) {
+ pwToolMap[name] = fn
+}
+
+func GetPWTools() []models.Tool {
+ return pwTools
+}
+
+var pwTools = []models.Tool{
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_start",
+ Description: "Start a Playwright browser instance. Must be called first before any other browser automation. Uses headless mode by default.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{},
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_stop",
+ Description: "Stop the Playwright browser instance. Call when done with browser automation.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{},
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_is_running",
+ Description: "Check if Playwright browser is currently running.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{},
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_navigate",
+ Description: "Navigate to a URL in the browser.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"url"},
+ Properties: map[string]models.ToolArgProps{
+ "url": {Type: "string", Description: "URL to navigate to"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_click",
+ Description: "Click on an element on the current webpage. Use 'index' for multiple matches (default 0).",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"selector"},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector for the element"},
+ "index": {Type: "integer", Description: "Index for multiple matches (default 0)"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_fill",
+ Description: "Type text into an input field. Use 'index' for multiple matches (default 0).",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"selector", "text"},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector for the input element"},
+ "text": {Type: "string", Description: "Text to type into the field"},
+ "index": {Type: "integer", Description: "Index for multiple matches (default 0)"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_extract_text",
+ Description: "Extract text content from the page or specific elements. Use selector 'body' for all page text.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector (default 'body' for all page text)"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_screenshot",
+ Description: "Take a screenshot of the page or a specific element. Returns a file path to the image.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector for element to screenshot"},
+ "full_page": {Type: "boolean", Description: "Capture full page (default false)"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_screenshot_and_view",
+ Description: "Take a screenshot and return the image for viewing. Use to visually verify page state.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector for element to screenshot"},
+ "full_page": {Type: "boolean", Description: "Capture full page (default false)"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_wait_for_selector",
+ Description: "Wait for an element to appear on the page before proceeding.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"selector"},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector to wait for"},
+ "timeout": {Type: "integer", Description: "Timeout in milliseconds (default 30000)"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_drag",
+ Description: "Drag the mouse from point (x1,y1) to (x2,y2).",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"x1", "y1", "x2", "y2"},
+ Properties: map[string]models.ToolArgProps{
+ "x1": {Type: "number", Description: "Starting X coordinate"},
+ "y1": {Type: "number", Description: "Starting Y coordinate"},
+ "x2": {Type: "number", Description: "Ending X coordinate"},
+ "y2": {Type: "number", Description: "Ending Y coordinate"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_click_at",
+ Description: "Click at specific X,Y coordinates on the page.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"x", "y"},
+ Properties: map[string]models.ToolArgProps{
+ "x": {Type: "number", Description: "X coordinate"},
+ "y": {Type: "number", Description: "Y coordinate"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_get_html",
+ Description: "Get the HTML content of the page or a specific element.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector (default 'body')"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_get_dom",
+ Description: "Get a structured DOM representation with tag, attributes, text, and children.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{
+ "selector": {Type: "string", Description: "CSS selector (default 'body')"},
+ },
+ },
+ },
+ },
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "pw_search_elements",
+ Description: "Search for elements by text content or CSS selector.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{},
+ Properties: map[string]models.ToolArgProps{
+ "text": {Type: "string", Description: "Text content to search for"},
+ "selector": {Type: "string", Description: "CSS selector to search for"},
+ },
+ },
+ },
+ },
+}
+
+var toolCallRE = regexp.MustCompile(`__tool_call__(.+?)__tool_call__`)
+
+type ParsedToolCall struct {
+ ID string
+ Name string
+ Args map[string]string
+}
+
+func findToolCall(resp []byte) (func() []byte, string, bool) {
+ var genericResp map[string]interface{}
+ if err := json.Unmarshal(resp, &genericResp); err != nil {
+ return findToolCallFromText(string(resp))
+ }
+
+ if choices, ok := genericResp["choices"].([]interface{}); ok && len(choices) > 0 {
+ if firstChoice, ok := choices[0].(map[string]interface{}); ok {
+ if message, ok := firstChoice["message"].(map[string]interface{}); ok {
+ if toolCalls, ok := message["tool_calls"].([]interface{}); ok && len(toolCalls) > 0 {
+ return parseOpenAIToolCall(toolCalls)
+ }
+ if content, ok := message["content"].(string); ok {
+ return findToolCallFromText(content)
+ }
+ }
+ if text, ok := firstChoice["text"].(string); ok {
+ return findToolCallFromText(text)
+ }
+ }
+ }
+
+ if content, ok := genericResp["content"].(string); ok {
+ return findToolCallFromText(content)
+ }
+
+ return findToolCallFromText(string(resp))
+}
+
+func parseOpenAIToolCall(toolCalls []interface{}) (func() []byte, string, bool) {
+ if len(toolCalls) == 0 {
+ return nil, "", false
+ }
+
+ tc := toolCalls[0].(map[string]interface{})
+ id, _ := tc["id"].(string)
+ function, _ := tc["function"].(map[string]interface{})
+ name, _ := function["name"].(string)
+ argsStr, _ := function["arguments"].(string)
+
+ var args map[string]string
+ if err := json.Unmarshal([]byte(argsStr), &args); err != nil {
+ return func() []byte {
+ return []byte(fmt.Sprintf(`{"error": "failed to parse arguments: %v"}`, err))
+ }, id, true
+ }
+
+ return func() []byte {
+ fn, ok := pwToolMap[name]
+ if !ok {
+ return []byte(fmt.Sprintf(`{"error": "tool %s not found"}`, name))
+ }
+ return fn(args)
+ }, id, true
+}
+
+func findToolCallFromText(text string) (func() []byte, string, bool) {
+ jsStr := toolCallRE.FindString(text)
+ if jsStr == "" {
+ return nil, "", false
+ }
+
+ jsStr = strings.TrimSpace(jsStr)
+ jsStr = strings.TrimPrefix(jsStr, "__tool_call__")
+ jsStr = strings.TrimSuffix(jsStr, "__tool_call__")
+ jsStr = strings.TrimSpace(jsStr)
+
+ start := strings.Index(jsStr, "{")
+ end := strings.LastIndex(jsStr, "}")
+ if start == -1 || end == -1 || end <= start {
+ return func() []byte {
+ return []byte(`{"error": "no valid JSON found in tool call"}`)
+ }, "", true
+ }
+
+ jsStr = jsStr[start : end+1]
+
+ var fc models.FuncCall
+ if err := json.Unmarshal([]byte(jsStr), &fc); err != nil {
+ return func() []byte {
+ return []byte(fmt.Sprintf(`{"error": "failed to parse tool call: %v}`, err))
+ }, "", true
+ }
+
+ if fc.ID == "" {
+ fc.ID = "call_" + generateToolCallID()
+ }
+
+ return func() []byte {
+ fn, ok := pwToolMap[fc.Name]
+ if !ok {
+ return []byte(fmt.Sprintf(`{"error": "tool %s not found"}`, fc.Name))
+ }
+ return fn(fc.Args)
+ }, fc.ID, true
+}
+
+func generateToolCallID() string {
+ return fmt.Sprintf("%d", len(pwToolMap)%10000)
+}
diff --git a/agent/request.go b/agent/request.go
index 4ca619d..754f16e 100644
--- a/agent/request.go
+++ b/agent/request.go
@@ -80,6 +80,20 @@ func (ag *AgentClient) FormMsg(msg string) (io.Reader, error) {
return bytes.NewReader(b), nil
}
+func (ag *AgentClient) FormMsgWithToolCallID(msg, toolCallID string) (io.Reader, error) {
+ m := models.RoleMsg{
+ Role: "tool",
+ Content: msg,
+ ToolCallID: toolCallID,
+ }
+ ag.chatBody.Messages = append(ag.chatBody.Messages, m)
+ b, err := ag.buildRequest()
+ if err != nil {
+ return nil, err
+ }
+ return bytes.NewReader(b), nil
+}
+
// buildRequest creates the appropriate LLM request based on the current API endpoint.
func (ag *AgentClient) buildRequest() ([]byte, error) {
isCompletion, isChat, isDeepSeek, isOpenRouter := detectAPI(ag.cfg.CurrentAPI)
diff --git a/agent/webagent.go b/agent/webagent.go
index ff6cd86..11e9014 100644
--- a/agent/webagent.go
+++ b/agent/webagent.go
@@ -17,8 +17,8 @@ func NewWebAgentB(client *AgentClient, sysprompt string) *WebAgentB {
// Process applies the formatting function to raw output
func (a *WebAgentB) Process(args map[string]string, rawOutput []byte) []byte {
- msg, err := a.FormMsg(a.sysprompt,
- fmt.Sprintf("request:\n%+v\ntool response:\n%v", args, string(rawOutput)))
+ msg, err := a.FormMsg(
+ fmt.Sprintf("%s\n\nrequest:\n%+v\ntool response:\n%v", a.sysprompt, args, string(rawOutput)))
if err != nil {
a.Log().Error("failed to process the request", "error", err)
return []byte("failed to process the request; err: " + err.Error())
diff --git a/tools.go b/tools.go
index d6fe146..275c166 100644
--- a/tools.go
+++ b/tools.go
@@ -1491,6 +1491,47 @@ func registerWindowTools() {
}
}
+var browserAgentSysPrompt = `You are an autonomous browser automation agent. Your goal is to complete the user's task by intelligently using browser automation tools.
+
+Available tools:
+- pw_start: Start browser (must call first)
+- pw_stop: Stop browser (call when done)
+- pw_is_running: Check if browser is running
+- pw_navigate: Go to a URL
+- pw_click: Click an element by CSS selector
+- pw_fill: Type text into an input
+- pw_extract_text: Get text from page/element
+- pw_screenshot: Take a screenshot (returns file path)
+- pw_screenshot_and_view: Take screenshot with image for viewing
+- pw_wait_for_selector: Wait for element to appear
+- pw_drag: Drag mouse from one point to another
+- pw_click_at: Click at X,Y coordinates
+- pw_get_html: Get HTML content
+- pw_get_dom: Get structured DOM tree
+- pw_search_elements: Search for elements by text or selector
+
+Workflow:
+1. Start browser if not running (pw_start)
+2. Navigate to required pages (pw_navigate)
+3. Interact with elements as needed (click, fill, etc.)
+4. Extract information or take screenshots as requested
+5. Stop browser when done (pw_stop)
+
+Always provide clear feedback about what you're doing and what you found.`
+
+func runBrowserAgent(args map[string]string) []byte {
+ task, ok := args["task"]
+ if !ok || task == "" {
+ return []byte(`{"error": "task argument is required"}`)
+ }
+
+ client := getWebAgentClient()
+ pwAgent := agent.NewPWAgent(client, browserAgentSysPrompt)
+ pwAgent.SetTools(agent.GetPWTools())
+
+ return pwAgent.ProcessTask(task)
+}
+
func registerPlaywrightTools() {
removePlaywrightToolsFromBaseTools()
if cfg != nil && cfg.PlaywrightEnabled {
@@ -1776,6 +1817,42 @@ func registerPlaywrightTools() {
}
baseTools = append(baseTools, playwrightTools...)
toolSysMsg += browserToolSysMsg
+
+ agent.RegisterPWTool("pw_start", pwStart)
+ agent.RegisterPWTool("pw_stop", pwStop)
+ agent.RegisterPWTool("pw_is_running", pwIsRunning)
+ agent.RegisterPWTool("pw_navigate", pwNavigate)
+ agent.RegisterPWTool("pw_click", pwClick)
+ agent.RegisterPWTool("pw_click_at", pwClickAt)
+ agent.RegisterPWTool("pw_fill", pwFill)
+ agent.RegisterPWTool("pw_extract_text", pwExtractText)
+ agent.RegisterPWTool("pw_screenshot", pwScreenshot)
+ agent.RegisterPWTool("pw_screenshot_and_view", pwScreenshotAndView)
+ agent.RegisterPWTool("pw_wait_for_selector", pwWaitForSelector)
+ agent.RegisterPWTool("pw_drag", pwDrag)
+ agent.RegisterPWTool("pw_get_html", pwGetHTML)
+ agent.RegisterPWTool("pw_get_dom", pwGetDOM)
+ agent.RegisterPWTool("pw_search_elements", pwSearchElements)
+
+ browserAgentTool := []models.Tool{
+ {
+ Type: "function",
+ Function: models.ToolFunc{
+ Name: "browser_agent",
+ Description: "Autonomous browser automation agent. Use for complex multi-step browser tasks like 'go to website, login, and take screenshot'. The agent will plan and execute steps automatically using browser tools.",
+ Parameters: models.ToolFuncParams{
+ Type: "object",
+ Required: []string{"task"},
+ Properties: map[string]models.ToolArgProps{
+ "task": {Type: "string", Description: "The task to accomplish, e.g., 'go to github.com and take a screenshot of the homepage'"},
+ },
+ },
+ },
+ },
+ }
+ baseTools = append(baseTools, browserAgentTool...)
+
+ fnMap["browser_agent"] = runBrowserAgent
}
}