feat(agents): improve rationality by adding a tool that lets the models think through choices.

This works pretty nicely, actually. I'm starting to understand how to demystify the system prompt and give the agent the tools it needs to do a good job.
2025-04-18 15:06:20 +01:00
parent 1b1f957e01
commit 57f1e70c98
2 changed files with 75 additions and 48 deletions
--- a/backend/agents/location_agent.go
+++ b/backend/agents/location_agent.go
@ -9,38 +9,33 @@ import (
 	"screenmark/screenmark/models"

 	"github.com/charmbracelet/log"
-	"github.com/google/uuid"
 )

 const locationPrompt = `
 Role: Location AI Assistant

-Objective: Identify locations from images/text, manage a saved list (create, update), and answer user queries about saved locations using the provided tools.
+Objective: Identify locations from images/text, manage a saved list, and answer user queries about saved locations using the provided tools.
+The user does not want to have duplicate entries on their saved location list. So you should only create a new location if listLocation doesnt return
+what would be a duplicate.

 Core Logic:

-**Extract Location Details:** Attempt to extract location details (like InputName, InputAddress) from the user's input (image or text).
+**Extract Location Details:** Attempt to extract location details (like InputName, InputAddress) from the user's input.
 	* If no details can be extracted, inform the user and use stopAgent.

 **Check for Existing Location:** If details *were* extracted:
    * Use listLocations with the extracted InputName and/or InputAddress to search for potentially matching locations already saved in the list.
+	* If you find an existing location, you shouldn't create a duplicate. Call stopAgent when this happens.

 **Decide Action based on Search Results:**
-    * **If listLocations returns one or more likely matches:**
-        * Identify the *best* match (based on name, address similarity).
-        * **Crucially:** Call upsertLocation, providing the locationId of that best match. Include the newly extracted InputName (required) and any other extracted details (InputAddress, etc.) to potentially *update* the existing record or simply link the current input to it.
-    * **If listLocations returns no matches OR no returned location is a confident match:**
-        * Call upsertLocation providing *only* the newly extracted InputName (required) and any other extracted details (InputAddress, etc.). **Do NOT provide a locationId in this case.** This will create a *new* location entry.
+	* If no existing location looks like the location on the input. You should use doesLocationExist to think about whether or not this location is a duplicate.
+		* If you determine it is not a duplicate, then use createLocation to create a new location for the user. 
+		* Else, you should call stopAgent.
+	* If the image does not contain any location, you should use stopAgent.
+	* You should repeat this loop of doesLocationExist and createLocation until you've completed all locations on the image.

-4.  **Finalize:** After successfully calling upsertLocation (or determining no action could be taken), use stopAgent.
-
-Tool Usage:
-
-* **listLocations**: Searches the saved locations list based on provided criteria (like name or address). Used specifically to check if a location potentially already exists before using upsertLocation. Returns a list of matching locations, *each including its locationId*.
-* **upsertLocation**: Creates or updates a location in the saved list. Requires name. Can include address, etc.
-    * **To UPDATE:** If you identified an existing location using listLocations, provide its locationId along with any new/updated details (name, address, etc.).
-    * **To CREATE:** If no existing location was found (or you are creating intentionally), provide the location details (name, address, etc.) but **omit the locationId**.
-* **stopAgent**: Signals the end of the agent's processing for the current turn. Call this *after* completing the location task (create/update/failed extraction).
+**Reply to user querys**
+	* If the user asks you a specific question, you should use the reply tool to reply to them.
 `

 const replyTool = `
@ -64,6 +59,23 @@ const replyTool = `

 const locationTools = `
 [
+    {
+        "type": "function",
+        "function": {
+            "name": "doesLocationExist",
+            "description": "",
+            "parameters": {
+                "type": "object",
+                "properties": {
+					"rationale": {
+						"type": "string",
+						"description": "Your reasoning as to whether or not this image contains a location that already exists in listLocations"
+					}
+				},
+                "required": ["rationale"]
+            }
+        }
+    },
    {
        "type": "function",
        "function": {
@ -79,22 +91,18 @@ const locationTools = `
    {
        "type": "function",
        "function": {
-            "name": "upsertLocation",
-            "description": "Upserts a location. This is used for both creating new locations, and updating existing ones. Providing locationId from an existing ID from listLocations, will make this an update function. Not providing one will create a new location. You must provide a locationId if you think the input is a location that already exists.",
+            "name": "createLocation",
+            "description": "Creates a new location with as much information as you can extract. Be precise. You should only add the parameters you can actually see on the image.",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
-                        "description": "The primary name of the location (e.g., 'Eiffel Tower', 'Mom's House', 'Acme Corp HQ'). This field is mandatory."
+                        "description": "The primary name of the location"
                    },
-					"locationId": {
-						"type": "string",
-						"description": "The UUID of the location. You should only provide this IF you believe the location already exists, from listLocation."
-					},
                    "address": {
                        "type": "string",
-                        "description": "The full street address of the location, if available (e.g., 'Champ de Mars, 5 Av. Anatole France, 75007 Paris, France'). Include if extracted."
+                        "description": "The address of the location"
                    }
                },
                "required": ["name"]
@ -125,10 +133,9 @@ func getLocationAgentTools(allowReply bool) string {
 }

 type listLocationArguments struct{}
-type upsertLocationArguments struct {
-	Name       string  `json:"name"`
-	LocationID *string `json:"locationId"`
-	Address    *string `json:"address"`
+type createLocationArguments struct {
+	Name    string  `json:"name"`
+	Address *string `json:"address"`
 }

 func NewLocationAgentWithComm(log *log.Logger, locationModel models.LocationModel) client.AgentClient {
@ -151,8 +158,8 @@ func NewLocationAgent(log *log.Logger, locationModel models.LocationModel) clien
 		return locationModel.List(context.Background(), info.UserId)
 	})

-	agentClient.ToolHandler.AddTool("upsertLocation", func(info client.ToolHandlerInfo, _args string, call client.ToolCall) (any, error) {
-		args := upsertLocationArguments{}
+	agentClient.ToolHandler.AddTool("createLocation", func(info client.ToolHandlerInfo, _args string, call client.ToolCall) (any, error) {
+		args := createLocationArguments{}
 		err := json.Unmarshal([]byte(_args), &args)
 		if err != nil {
 			return model.Locations{}, err
@ -160,18 +167,9 @@ func NewLocationAgent(log *log.Logger, locationModel models.LocationModel) clien

 		ctx := context.Background()

-		locationId := uuid.Nil
-		if args.LocationID != nil {
-			locationUuid, err := uuid.Parse(*args.LocationID)
-			if err != nil {
-				return model.Locations{}, err
-			}
-
-			locationId = locationUuid
-		}
+		// TODO: this tool could be simpler, as the model could have a SaveToImage joined with the save.

 		location, err := locationModel.Save(ctx, info.UserId, model.Locations{
-			ID:      locationId,
 			Name:    args.Name,
 			Address: args.Address,
 		})
@ -192,5 +190,9 @@ func NewLocationAgent(log *log.Logger, locationModel models.LocationModel) clien
 		return "ok", nil
 	})

+	agentClient.ToolHandler.AddTool("doesLocationExist", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
+		return "ok", nil
+	})
+
 	return agentClient
 }
--- a/backend/agents/orchestrator.go
+++ b/backend/agents/orchestrator.go
@ -20,10 +20,14 @@ const orchestratorPrompt = `
    * Information about an event
    * Content that doesn't fit any specific category or lacks actionable information.

-2.  **Agent Selection - Determine ALL that apply:**
-    * **contactAgent:** Is there information specifically related to a person or their contact details (e.g., business card, name/email/phone)? If YES, select contactAgent.
-    * **locationAgent:** Is there information specifically identifying a place, location, or address (e.g., map, street sign, address text)? If YES, select locationAgent.
-    * **eventAgent:** Is there information specifically related to an event (e.g., invitation, poster with date/time, schedule)? If YES, select eventAgent.
+2.  **Thinking**
+	* You should use the think tool to allow you to think your way through the image.
+	* You should call this as many times as you need to in order to describe and analyse the image correctly.
+
+3.  **Agent Selection - Determine ALL that apply:**
+    * **contactAgent:** Is there information specifically related to a person or their contact details (e.g., business card, name/email/phone)?
+    * **locationAgent:** Is there information specifically identifying a place, location, city, or address (e.g., map, street sign, address text)?
+    * **eventAgent:** Is there information specifically related to an event (e.g., invitation, poster with date/time, schedule)?
    * **noteAgent** Does the image contain *any* text/writing (including code, formulas)?
 	* **noAgent**: Call this when you are done working on this image.

@ -32,6 +36,23 @@ const orchestratorPrompt = `

 const orchestratorTools = `
 [
+	{
+        "type": "function",
+        "function": {
+            "name": "think",
+            "description": "Use to layout all your thoughts about the image, roughly describing it, and specially describing if the image contains anything relevant to your available agents",
+            "parameters": {
+                "type": "object",
+                "properties": {
+					"thought": {
+						"type": "string",
+						"description": "A singular thought about the image"
+					}
+				},
+                "required": []
+            }
+        }
+	},
    {
        "type": "function",
        "function": {
@ -60,7 +81,7 @@ const orchestratorTools = `
        "type": "function",
        "function": {
            "name": "locationAgent",
-            "description": "Identifies and extracts specific geographic locations or addresses. Use for content like street addresses on mail or signs, place names (e.g., restaurant, shop), map snippets, or recognizable landmarks.",
+            "description": "Use when the input has anything to do with a place. This could be a city, an address, a postcode, a virtual meeting location, or a geographical location.",
            "parameters": {
                "type": "object",
                "properties": {},
@ -113,6 +134,10 @@ func NewOrchestratorAgent(log *log.Logger, noteAgent NoteAgent, contactAgent cli
 		EndToolCall:  "noAgent",
 	})

+	agent.ToolHandler.AddTool("think", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
+		return "Thought", nil
+	})
+
 	agent.ToolHandler.AddTool("noteAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
 		// go noteAgent.GetNotes(info.UserId, info.ImageId, imageName, imageData)

@ -120,13 +145,13 @@ func NewOrchestratorAgent(log *log.Logger, noteAgent NoteAgent, contactAgent cli
 	})

 	agent.ToolHandler.AddTool("contactAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
-		go contactAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)
+		// go contactAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)

 		return "contactAgent called successfully", nil
 	})

 	agent.ToolHandler.AddTool("locationAgent", func(info client.ToolHandlerInfo, args string, call client.ToolCall) (any, error) {
-		// go locationAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)
+		go locationAgent.RunAgent(info.UserId, info.ImageId, imageName, imageData)

 		return "locationAgent called successfully", nil
 	})