feat: finishing description agent infrastructure on backend

2025-07-24 14:12:41 +01:00
parent 59bf884f5d
commit 37f966e508
3 changed files with 23 additions and 19 deletions
--- a/backend/agents/client/client.go
+++ b/backend/agents/client/client.go
@ -4,6 +4,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
 	"net/http"
 	"os"
@ -101,7 +102,7 @@ func CreateAgentClient(options CreateAgentClientOptions) AgentClient {
 	return AgentClient{
 		apiKey: apiKey,
-		url:    "https://api.openai.com/v1/chat/completions",
+		url:    "https://router.requesty.ai/v1/chat/completions",
 		Do: func(req *http.Request) (*http.Response, error) {
 			client := &http.Client{}
 			return client.Do(req)
@ -132,29 +133,29 @@ func (client AgentClient) getRequest(body []byte) (*http.Request, error) {
 func (client AgentClient) Request(req *AgentRequestBody) (AgentResponse, error) {
 	jsonAiRequest, err := json.Marshal(req)
 	if err != nil {
-		return AgentResponse{}, err
+		return AgentResponse{}, fmt.Errorf("Could not format JSON: %w", err) // %w wraps err so errors.Is/As still reach the cause
 	}
 	httpRequest, err := client.getRequest(jsonAiRequest)
 	if err != nil {
-		return AgentResponse{}, err
+		return AgentResponse{}, fmt.Errorf("Could not get request: %w", err)
 	}
 	resp, err := client.Do(httpRequest)
 	if err != nil {
-		return AgentResponse{}, err
+		return AgentResponse{}, fmt.Errorf("Could not send request: %w", err)
 	}
 	response, err := io.ReadAll(resp.Body)
 	if err != nil {
-		return AgentResponse{}, err
+		return AgentResponse{}, fmt.Errorf("Could not read body: %w", err)
 	}
 	agentResponse := AgentResponse{}
 	err = json.Unmarshal(response, &agentResponse)
 	if err != nil {
-		return AgentResponse{}, err
+		return AgentResponse{}, fmt.Errorf("Could not unmarshal response, response: %s: %w", string(response), err) // raw body first, then the wrapped decode error
 	}
 	if len(agentResponse.Choices) != 1 {
@ -245,7 +246,7 @@ func (client *AgentClient) RunAgent(userId uuid.UUID, imageId uuid.UUID, imageNa
 	request := AgentRequestBody{
 		Tools:       &tools,
 		ToolChoice:  &toolChoice,
-		Model:       "gpt-4.1-mini",
+		Model:       "google/gemini-2.5-flash",
 		RandomSeed:  &seed,
 		Temperature: 0.3,
 		EndToolCall: client.Options.EndToolCall,
--- a/backend/agents/description_agent.go
+++ b/backend/agents/description_agent.go
@ -2,6 +2,7 @@ package agents
 import (
 	"context"
 	"fmt"
 	"screenmark/screenmark/agents/client"
 	"screenmark/screenmark/models"
@ -10,15 +11,10 @@ import (
 )
 const noteAgentPrompt = `
-You are a helpful agent, who's job is to extract notes from images.
+You are an AI agent who's job is to describe the image you see.
 Not all images contain notes, in such cases there's not need to create them.
-An image can have more than one note.
+You should also add any text you see in the image, if no text exists, just add a description.
-
+Be consise and don't add too much extra information or formatting characters, simple text.
 You must return markdown, and adapt the text to best fit markdown.
 Do not return anything except markdown.
 If the image contains code, add this inside code blocks. You must try and correctly guess the language too.
 `
 type DescriptionAgent struct {
@ -27,9 +23,9 @@ type DescriptionAgent struct {
 	imageModel models.ImageModel
 }
-func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imageData []byte) error {
+func (agent DescriptionAgent) Describe(log *log.Logger, imageId uuid.UUID, imageName string, imageData []byte) error {
 	request := client.AgentRequestBody{
-		Model:       "gpt-4.1-nano",
+		Model:       "google/gemini-2.5-flash-lite-preview-06-17",
 		Temperature: 0.3,
 		ResponseFormat: client.ResponseFormat{
 			Type: "text",
@ -42,15 +38,18 @@ func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imag
 	request.Chat.AddSystem(noteAgentPrompt)
 	request.Chat.AddImage(imageName, imageData, nil)
 	log.Debug("Sending description request")
 	resp, err := agent.client.Request(&request)
 	if err != nil {
-		return err
+		return fmt.Errorf("Could not request: %w", err) // %w wraps the client error instead of emitting %!(EXTRA ...)
 	}
 	ctx := context.Background()
 	markdown := resp.Choices[0].Message.Content
 	log.Debugf("Response %s", markdown)
 	err = agent.imageModel.AddDescription(ctx, imageId, markdown)
 	if err != nil {
 		return err
--- a/backend/events.go
+++ b/backend/events.go
@ -72,7 +72,11 @@ func ListenNewImageEvents(db *sql.DB, notifier *Notifier[Notification]) {
 			}
 			descriptionAgent := agents.NewDescriptionAgent(createLogger("Description 📝", splitWriter), imageModel)
-			descriptionAgent.Describe(image.Image.ID, image.Image.ImageName, image.Image.Image)
+			err = descriptionAgent.Describe(createLogger("Description 📓", splitWriter), image.Image.ID, image.Image.ImageName, image.Image.Image)
 			if err != nil {
 				log.Error(err)
 			}
 			listAgent := agents.NewListAgent(createLogger("Lists 🖋️", splitWriter), listModel)
 			listAgent.RunAgent(image.UserID, image.ImageID, image.Image.ImageName, image.Image.Image)