Haystack/backend/agents/orchestrator.go
2025-05-04 10:19:39 +01:00

172 lines
6.9 KiB
Go

package agents
import (
"screenmark/screenmark/agents/client"
"github.com/charmbracelet/log"
)
const orchestratorPrompt = `
**Role:** You are an Orchestrator AI responsible for analyzing images provided by the user.
**Primary Task:** Examine the input image and determine which specialized AI agent(s), available as tool calls, should be invoked to process the relevant information within the image, or if no specialized processing is needed. Your goal is to either extract and structure useful information for the user by selecting the most appropriate tool(s) or explicitly indicate that no specific action is required.
**Analysis Process & Decision Logic:**
1. **Analyze Image Content:** Scrutinize the image for distinct types of information:
* General text/writing (including code, formulas)
* Information about a person or contact details
* Information about a place, location, or address
* Information about an event
* Content that doesn't fit any specific category or lacks actionable information.
2. **Thinking**
* You should use the think tool to allow you to think your way through the image.
* You should call this as many times as you need to in order to describe and analyse the image correctly.
3. **Agent Selection - Determine ALL that apply:**
* **contactAgent:** Is there information specifically related to a person or their contact details (e.g., business card, name/email/phone)?
* **locationAgent:** Is there information specifically identifying a place, location, city, or address (e.g., map, street sign, address text)?
* **eventAgent:** Is there information specifically related to an event (e.g., invitation, poster with date/time, schedule)?
* **noteAgent** Does the image contain *any* text/writing (including code, formulas)?
* **noAgent**: Call this when you are done working on this image.
* Call all applicable specialized agents (noteAgent, contactAgent, locationAgent, eventAgent) simultaneously (in parallel).
* Do not call agents if you dont think they are relevant. It's better to be sound than complete.
`
const orchestratorTools = `
[
{
"type": "function",
"function": {
"name": "think",
"description": "Use to layout all your thoughts about the image, roughly describing it, and specially describing if the image contains anything relevant to your available agents",
"parameters": {
"type": "object",
"properties": {
"thought": {
"type": "string",
"description": "A singular thought about the image"
}
},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "noteAgent",
"description": "Extracts general textual content like handwritten notes, paragraphs in documents, presentation slides, code snippets, or mathematical formulas. Use this for significant text that isn't primarily contact details, an address, or specific event information.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "contactAgent",
"description": "Extracts personal contact information. Use when the image clearly shows details like names, phone numbers, email addresses, job titles, or company names, especially from sources like business cards, email signatures, or contact lists.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "locationAgent",
"description": "Use when the input has anything to do with a place. This could be a city, an address, a postcode, a virtual meeting location, or a geographical location.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "eventAgent",
"description": "Extracts details related to scheduled events, appointments, or specific occasions. Use when the image contains information like event titles, dates, times, venues, agendas, or descriptions, typically found on invitations, posters, calendar entries, or schedules.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
},
{
"type": "function",
"function": {
"name": "noAgent",
"description": "Extracts details related to scheduled events, appointments, or specific occasions. Use when the image contains information like event titles, dates, times, venues, agendas, or descriptions, typically found on invitations, posters, calendar entries, or schedules.",
"parameters": {
"type": "object",
"properties": {},
"required": []
}
}
}
]
`
type OrchestratorAgent struct {
Client client.AgentClient
log log.Logger
}
type Status struct {
Ok bool `json:"ok"`
}
func NewOrchestratorAgent(log *log.Logger, noteAgent NoteAgent, contactAgent client.AgentClient, locationAgent client.AgentClient, eventAgent client.AgentClient, imageName string, imageData []byte) client.AgentClient {
agent := client.CreateAgentClient(client.CreateAgentClientOptions{
SystemPrompt: orchestratorPrompt,
JsonTools: orchestratorTools,
Log: log,
EndToolCall: "noAgent",
})
agent.ToolHandler.AddTool("think", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
return "Thought", nil
})
agent.ToolHandler.AddTool("noteAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
noteAgent.GetNotes(info.UserId, info.ImageId, imageName, imageData)
return "noteAgent called successfully", nil
})
agent.ToolHandler.AddTool("contactAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
contactAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)
return "contactAgent called successfully", nil
})
agent.ToolHandler.AddTool("locationAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
locationAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)
return "locationAgent called successfully", nil
})
agent.ToolHandler.AddTool("eventAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
eventAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)
return "eventAgent called successfully", nil
})
agent.ToolHandler.AddTool("noAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
return "ok", nil
})
return agent
}