From 37f966e5086748220c8d42dbc1570eb02e5dbddf Mon Sep 17 00:00:00 2001 From: John Costa Date: Thu, 24 Jul 2025 14:12:41 +0100 Subject: [PATCH] feat: finishing description agent infrastructure on backend --- backend/agents/client/client.go | 15 ++++++++------- backend/agents/description_agent.go | 21 ++++++++++----------- backend/events.go | 6 +++++- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/backend/agents/client/client.go b/backend/agents/client/client.go index 6d3a495..584c8be 100644 --- a/backend/agents/client/client.go +++ b/backend/agents/client/client.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/json" "errors" + "fmt" "io" "net/http" "os" @@ -101,7 +102,7 @@ func CreateAgentClient(options CreateAgentClientOptions) AgentClient { return AgentClient{ apiKey: apiKey, - url: "https://api.openai.com/v1/chat/completions", + url: "https://router.requesty.ai/v1/chat/completions", Do: func(req *http.Request) (*http.Response, error) { client := &http.Client{} return client.Do(req) @@ -132,29 +133,29 @@ func (client AgentClient) getRequest(body []byte) (*http.Request, error) { func (client AgentClient) Request(req *AgentRequestBody) (AgentResponse, error) { jsonAiRequest, err := json.Marshal(req) if err != nil { - return AgentResponse{}, err + return AgentResponse{}, fmt.Errorf("Could not format JSON", err) } httpRequest, err := client.getRequest(jsonAiRequest) if err != nil { - return AgentResponse{}, err + return AgentResponse{}, fmt.Errorf("Could not get request", err) } resp, err := client.Do(httpRequest) if err != nil { - return AgentResponse{}, err + return AgentResponse{}, fmt.Errorf("Could not send request", err) } response, err := io.ReadAll(resp.Body) if err != nil { - return AgentResponse{}, err + return AgentResponse{}, fmt.Errorf("Could not read body", err) } agentResponse := AgentResponse{} err = json.Unmarshal(response, &agentResponse) if err != nil { - return AgentResponse{}, err + return AgentResponse{}, fmt.Errorf("Could not unmarshal response, response: %s", string(response), err) } if len(agentResponse.Choices) != 1 { @@ -245,7 +246,7 @@ func (client *AgentClient) RunAgent(userId uuid.UUID, imageId uuid.UUID, imageNa request := AgentRequestBody{ Tools: &tools, ToolChoice: &toolChoice, - Model: "gpt-4.1-mini", + Model: "google/gemini-2.5-flash", RandomSeed: &seed, Temperature: 0.3, EndToolCall: client.Options.EndToolCall, diff --git a/backend/agents/description_agent.go b/backend/agents/description_agent.go index ce7a0af..a745e10 100644 --- a/backend/agents/description_agent.go +++ b/backend/agents/description_agent.go @@ -2,6 +2,7 @@ package agents import ( "context" + "fmt" "screenmark/screenmark/agents/client" "screenmark/screenmark/models" @@ -10,15 +11,10 @@ import ( ) const noteAgentPrompt = ` -You are a helpful agent, who's job is to extract notes from images. -Not all images contain notes, in such cases there's not need to create them. +You are an AI agent who's job is to describe the image you see. -An image can have more than one note. - -You must return markdown, and adapt the text to best fit markdown. -Do not return anything except markdown. - -If the image contains code, add this inside code blocks. You must try and correctly guess the language too. +You should also add any text you see in the image, if no text exists, just add a description. +Be consise and don't add too much extra information or formatting characters, simple text. ` type DescriptionAgent struct { @@ -27,9 +23,9 @@ type DescriptionAgent struct { imageModel models.ImageModel } -func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imageData []byte) error { +func (agent DescriptionAgent) Describe(log *log.Logger, imageId uuid.UUID, imageName string, imageData []byte) error { request := client.AgentRequestBody{ - Model: "gpt-4.1-nano", + Model: "google/gemini-2.5-flash-lite-preview-06-17", Temperature: 0.3, ResponseFormat: client.ResponseFormat{ Type: "text", @@ -42,15 +38,18 @@ func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imag request.Chat.AddSystem(noteAgentPrompt) request.Chat.AddImage(imageName, imageData, nil) + log.Debug("Sending description request") resp, err := agent.client.Request(&request) if err != nil { - return err + return fmt.Errorf("Could not request", err) } ctx := context.Background() markdown := resp.Choices[0].Message.Content + log.Debugf("Response %s", markdown) + err = agent.imageModel.AddDescription(ctx, imageId, markdown) if err != nil { return err diff --git a/backend/events.go b/backend/events.go index 45c2f44..1e39f3c 100644 --- a/backend/events.go +++ b/backend/events.go @@ -72,7 +72,11 @@ func ListenNewImageEvents(db *sql.DB, notifier *Notifier[Notification]) { } descriptionAgent := agents.NewDescriptionAgent(createLogger("Description 📝", splitWriter), imageModel) - descriptionAgent.Describe(image.Image.ID, image.Image.ImageName, image.Image.Image) + err = descriptionAgent.Describe(createLogger("Description 📓", splitWriter), image.Image.ID, image.Image.ImageName, image.Image.Image) + + if err != nil { + log.Error(err) + } listAgent := agents.NewListAgent(createLogger("Lists 🖋️", splitWriter), listModel) listAgent.RunAgent(image.UserID, image.ImageID, image.Image.ImageName, image.Image.Image)