// Haystack/backend/agents/orchestrator.go

package agents

import (
	"errors"
	"screenmark/screenmark/agents/client"

	"github.com/charmbracelet/log"
)

// orchestratorPrompt is the system prompt given to the orchestrator agent
// client (it is passed as SystemPrompt in NewOrchestratorAgent). It tells the
// model to inspect a user-provided image and choose which of the tools
// declared in orchestratorTools to call: any combination of noteAgent,
// contactAgent, locationAgent and eventAgent, or noAction when none apply.
//
// The text is sent to the model verbatim, so any edit here changes runtime
// behaviour.
const orchestratorPrompt = `
**Role:** You are an Orchestrator AI responsible for analyzing images provided by the user.

**Primary Task:** Examine the input image and determine which specialized AI agent(s), available as tool calls, should be invoked to process the relevant information within the image. Your goal is to extract and structure useful information for the user by selecting the most appropriate tool(s).

**Input:** User-provided image.

**Analysis Process & Decision Logic:**

1.  **Analyze Image Content:** Scrutinize the image for distinct types of information:
    * General text/writing (including code, formulas)
    * Information about a person or contact details
    * Information about a place, location, or address
    * Information about an event

2.  **Agent Selection - Determine ALL that apply:**
    * **contactAgent:** Is there information specifically related to a person or their contact details (e.g., business card, name/email/phone)? If YES, select contactAgent.
    * **locationAgent:** Is there information specifically identifying a place, location, or address (e.g., map, street sign, address text)? If YES, select locationAgent.
    * **eventAgent:** Is there information specifically related to an event (e.g., invitation, poster with date/time, schedule)? If YES, select eventAgent.
    * **noteAgent** Does the image contain *any* text/writing (including code, formulas)?
        * If YES, *and* if contactAgent, locationAgent, or eventAgent were *also* selected, consider if noteAgent captures *additional* textual information not covered by the others. Call noteAgent alongside the others *only if* there is significant extra text.
        * If YES, and *none* of the other agents (contact, location, event) were selected, then select noteAgent.

3.  **Final Tool Choice:**
    * If *at least one* of noteAgent, contactAgent, locationAgent, or eventAgent was selected in Step 2, prepare to call *all* selected agents in parallel.
    * If *none* of those four agents were selected after your analysis, you MUST call the noAction agent.

**Available Agents (Tools):**

* **noteAgent**: Use when there is any text on the image, this can be code/text/formulas any writing.
* **contactAgent**: Use when the image contains some person or contact.
* **locationAgent**: Use when the image contains some place, location or address.
* **eventAgent**: Use when the image contains some event.
* **noAction**: Use *only* when you are sure none of the other agents (noteAgent, contactAgent, locationAgent, eventAgent) are applicable to the image.

**Execution Rules:**

* Call all applicable agents (noteAgent, contactAgent, locationAgent, eventAgent) simultaneously (in parallel).
* If and only if none of the other agents apply, call noAction. Do not call noAction if any other agent is being called.

**Output:** Specify the tool call(s) required based on your final choice.
`

// orchestratorTools is the JSON tool schema handed to the orchestrator agent
// client (it is passed as JsonTools in NewOrchestratorAgent). It declares five
// functions, none of which take parameters: noteAgent, contactAgent,
// locationAgent, eventAgent and noAction. These names correspond to the
// handlers that NewOrchestratorAgent registers on the client's ToolHandler.
//
// The descriptions are part of what the model sees, so editing them changes
// runtime behaviour.
const orchestratorTools = `
[
    {
        "type": "function",
        "function": {
            "name": "noteAgent",
            "description": "Extracts general textual content like handwritten notes, paragraphs in documents, presentation slides, code snippets, or mathematical formulas. Use this for significant text that isn't primarily contact details, an address, or specific event information.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": []
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "contactAgent",
            "description": "Extracts personal contact information. Use when the image clearly shows details like names, phone numbers, email addresses, job titles, or company names, especially from sources like business cards, email signatures, or contact lists.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": []
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "locationAgent",
            "description": "Identifies and extracts specific geographic locations or addresses. Use for content like street addresses on mail or signs, place names (e.g., restaurant, shop), map snippets, or recognizable landmarks.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": []
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "eventAgent",
            "description": "Extracts details related to scheduled events, appointments, or specific occasions. Use when the image contains information like event titles, dates, times, venues, agendas, or descriptions, typically found on invitations, posters, calendar entries, or schedules.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": []
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "noAction",
            "description": "Select this option *only* when a thorough analysis of the image indicates that none of the other specialized agents (noteAgent, contactAgent, locationAgent, eventAgent) are relevant or needed for processing the image content.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": []
            }
        }
    }
]`

// OrchestratorAgent pairs an agent client with a logger.
//
// NOTE(review): nothing in the visible part of this file constructs or reads
// this type; NewOrchestratorAgent returns the client.AgentClient directly.
// Confirm whether it is still used elsewhere before relying on it.
type OrchestratorAgent struct {
	// Client is the underlying agent client.
	Client client.AgentClient

	// log is held by value (log.Logger), whereas NewOrchestratorAgent accepts
	// a *log.Logger.
	log log.Logger
}

// Status is the result value returned by every tool handler registered in
// NewOrchestratorAgent.
type Status struct {
	// Ok is encoded under the JSON key "ok". All handlers in this file set it
	// to true.
	Ok bool `json:"ok"`
}

// NewOrchestratorAgent builds the agent client that decides which specialised
// agents should process an image.
//
// The client is created with orchestratorPrompt as its system prompt,
// orchestratorTools as its tool schema, log as its logger and "noAction" as
// its end tool call. One handler is registered for each tool declared in
// orchestratorTools:
//
//   - contactAgent starts contactAgent.RunAgent for the image in a new
//     goroutine and returns immediately; the goroutine is not awaited here.
//   - noteAgent, locationAgent and eventAgent only acknowledge the call.
//     Their dispatch is currently disabled, so the noteAgent, locationAgent
//     and eventAgent parameters are unused for now.
//   - noAction does no work and returns a non-nil error together with its
//     Status.
//
// Every handler reports Status{Ok: true}. imageName and imageData are captured
// by the handlers and forwarded to the dispatched agent.
func NewOrchestratorAgent(log *log.Logger, noteAgent NoteAgent, contactAgent client.AgentClient, locationAgent client.AgentClient, eventAgent client.AgentClient, imageName string, imageData []byte) client.AgentClient {
	options := client.CreateAgentClientOptions{
		SystemPrompt: orchestratorPrompt,
		JsonTools:    orchestratorTools,
		Log:          log,
		EndToolCall:  "noAction",
	}
	agent := client.CreateAgentClient(options)
	tools := agent.ToolHandler

	// acknowledge is shared by the tools whose agents are not dispatched yet:
	// it accepts the call and does nothing else.
	acknowledge := func(_ client.ToolHandlerInfo, _ string, _ client.ToolCall) (any, error) {
		return Status{Ok: true}, nil
	}

	// Disabled dispatch, kept for reference:
	//   go noteAgent.GetNotes(info.UserId, info.ImageId, imageName, imageData)
	tools.AddTool("noteAgent", acknowledge)

	tools.AddTool("contactAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
		// Fire and forget: the contact agent runs in the background while the
		// orchestrator gets its acknowledgement straight away.
		go contactAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)

		result := Status{Ok: true}
		return result, nil
	})

	// Disabled dispatch, kept for reference:
	//   go locationAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)
	tools.AddTool("locationAgent", acknowledge)

	// Disabled dispatch, kept for reference:
	//   go eventAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)
	tools.AddTool("eventAgent", acknowledge)

	tools.AddTool("noAction", func(_ client.ToolHandlerInfo, _ string, _ client.ToolCall) (any, error) {
		// Do nothing. NOTE(review): the non-nil error is presumably how the
		// client is told the run has finished — confirm against the client
		// package before changing it.
		finished := errors.New("Finished! Kinda bad return type but...")
		return Status{Ok: true}, finished
	})

	return agent
}