feat: finishing description agent infrastructure on backend

This commit is contained in:
2025-07-24 14:12:41 +01:00
parent 59bf884f5d
commit 37f966e508
3 changed files with 23 additions and 19 deletions

View File

@ -4,6 +4,7 @@ import (
"bytes" "bytes"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"io" "io"
"net/http" "net/http"
"os" "os"
@ -101,7 +102,7 @@ func CreateAgentClient(options CreateAgentClientOptions) AgentClient {
return AgentClient{ return AgentClient{
apiKey: apiKey, apiKey: apiKey,
url: "https://api.openai.com/v1/chat/completions", url: "https://router.requesty.ai/v1/chat/completions",
Do: func(req *http.Request) (*http.Response, error) { Do: func(req *http.Request) (*http.Response, error) {
client := &http.Client{} client := &http.Client{}
return client.Do(req) return client.Do(req)
@ -132,29 +133,29 @@ func (client AgentClient) getRequest(body []byte) (*http.Request, error) {
func (client AgentClient) Request(req *AgentRequestBody) (AgentResponse, error) { func (client AgentClient) Request(req *AgentRequestBody) (AgentResponse, error) {
jsonAiRequest, err := json.Marshal(req) jsonAiRequest, err := json.Marshal(req)
if err != nil { if err != nil {
return AgentResponse{}, err return AgentResponse{}, fmt.Errorf("Could not format JSON", err)
} }
httpRequest, err := client.getRequest(jsonAiRequest) httpRequest, err := client.getRequest(jsonAiRequest)
if err != nil { if err != nil {
return AgentResponse{}, err return AgentResponse{}, fmt.Errorf("Could not get request", err)
} }
resp, err := client.Do(httpRequest) resp, err := client.Do(httpRequest)
if err != nil { if err != nil {
return AgentResponse{}, err return AgentResponse{}, fmt.Errorf("Could not send request", err)
} }
response, err := io.ReadAll(resp.Body) response, err := io.ReadAll(resp.Body)
if err != nil { if err != nil {
return AgentResponse{}, err return AgentResponse{}, fmt.Errorf("Could not read body", err)
} }
agentResponse := AgentResponse{} agentResponse := AgentResponse{}
err = json.Unmarshal(response, &agentResponse) err = json.Unmarshal(response, &agentResponse)
if err != nil { if err != nil {
return AgentResponse{}, err return AgentResponse{}, fmt.Errorf("Could not unmarshal response, response: %s", string(response), err)
} }
if len(agentResponse.Choices) != 1 { if len(agentResponse.Choices) != 1 {
@ -245,7 +246,7 @@ func (client *AgentClient) RunAgent(userId uuid.UUID, imageId uuid.UUID, imageNa
request := AgentRequestBody{ request := AgentRequestBody{
Tools: &tools, Tools: &tools,
ToolChoice: &toolChoice, ToolChoice: &toolChoice,
Model: "gpt-4.1-mini", Model: "google/gemini-2.5-flash",
RandomSeed: &seed, RandomSeed: &seed,
Temperature: 0.3, Temperature: 0.3,
EndToolCall: client.Options.EndToolCall, EndToolCall: client.Options.EndToolCall,

View File

@ -2,6 +2,7 @@ package agents
import ( import (
"context" "context"
"fmt"
"screenmark/screenmark/agents/client" "screenmark/screenmark/agents/client"
"screenmark/screenmark/models" "screenmark/screenmark/models"
@ -10,15 +11,10 @@ import (
) )
const noteAgentPrompt = ` const noteAgentPrompt = `
You are a helpful agent, who's job is to extract notes from images. You are an AI agent who's job is to describe the image you see.
Not all images contain notes, in such cases there's not need to create them.
An image can have more than one note. You should also add any text you see in the image, if no text exists, just add a description.
Be consise and don't add too much extra information or formatting characters, simple text.
You must return markdown, and adapt the text to best fit markdown.
Do not return anything except markdown.
If the image contains code, add this inside code blocks. You must try and correctly guess the language too.
` `
type DescriptionAgent struct { type DescriptionAgent struct {
@ -27,9 +23,9 @@ type DescriptionAgent struct {
imageModel models.ImageModel imageModel models.ImageModel
} }
func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imageData []byte) error { func (agent DescriptionAgent) Describe(log *log.Logger, imageId uuid.UUID, imageName string, imageData []byte) error {
request := client.AgentRequestBody{ request := client.AgentRequestBody{
Model: "gpt-4.1-nano", Model: "google/gemini-2.5-flash-lite-preview-06-17",
Temperature: 0.3, Temperature: 0.3,
ResponseFormat: client.ResponseFormat{ ResponseFormat: client.ResponseFormat{
Type: "text", Type: "text",
@ -42,15 +38,18 @@ func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imag
request.Chat.AddSystem(noteAgentPrompt) request.Chat.AddSystem(noteAgentPrompt)
request.Chat.AddImage(imageName, imageData, nil) request.Chat.AddImage(imageName, imageData, nil)
log.Debug("Sending description request")
resp, err := agent.client.Request(&request) resp, err := agent.client.Request(&request)
if err != nil { if err != nil {
return err return fmt.Errorf("Could not request", err)
} }
ctx := context.Background() ctx := context.Background()
markdown := resp.Choices[0].Message.Content markdown := resp.Choices[0].Message.Content
log.Debugf("Response %s", markdown)
err = agent.imageModel.AddDescription(ctx, imageId, markdown) err = agent.imageModel.AddDescription(ctx, imageId, markdown)
if err != nil { if err != nil {
return err return err

View File

@ -72,7 +72,11 @@ func ListenNewImageEvents(db *sql.DB, notifier *Notifier[Notification]) {
} }
descriptionAgent := agents.NewDescriptionAgent(createLogger("Description 📝", splitWriter), imageModel) descriptionAgent := agents.NewDescriptionAgent(createLogger("Description 📝", splitWriter), imageModel)
descriptionAgent.Describe(image.Image.ID, image.Image.ImageName, image.Image.Image) err = descriptionAgent.Describe(createLogger("Description 📓", splitWriter), image.Image.ID, image.Image.ImageName, image.Image.Image)
if err != nil {
log.Error(err)
}
listAgent := agents.NewListAgent(createLogger("Lists 🖋️", splitWriter), listModel) listAgent := agents.NewListAgent(createLogger("Lists 🖋️", splitWriter), listModel)
listAgent.RunAgent(image.UserID, image.ImageID, image.Image.ImageName, image.Image.Image) listAgent.RunAgent(image.UserID, image.ImageID, image.Image.ImageName, image.Image.Image)