feat: finishing description agent infrastructure on backend

This commit is contained in:
2025-07-24 14:12:41 +01:00
parent 59bf884f5d
commit 37f966e508
3 changed files with 23 additions and 19 deletions

View File

@ -4,6 +4,7 @@ import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"os"
@ -101,7 +102,7 @@ func CreateAgentClient(options CreateAgentClientOptions) AgentClient {
return AgentClient{
apiKey: apiKey,
url: "https://api.openai.com/v1/chat/completions",
url: "https://router.requesty.ai/v1/chat/completions",
Do: func(req *http.Request) (*http.Response, error) {
client := &http.Client{}
return client.Do(req)
@ -132,29 +133,29 @@ func (client AgentClient) getRequest(body []byte) (*http.Request, error) {
func (client AgentClient) Request(req *AgentRequestBody) (AgentResponse, error) {
jsonAiRequest, err := json.Marshal(req)
if err != nil {
return AgentResponse{}, err
return AgentResponse{}, fmt.Errorf("Could not format JSON", err)
}
httpRequest, err := client.getRequest(jsonAiRequest)
if err != nil {
return AgentResponse{}, err
return AgentResponse{}, fmt.Errorf("Could not get request", err)
}
resp, err := client.Do(httpRequest)
if err != nil {
return AgentResponse{}, err
return AgentResponse{}, fmt.Errorf("Could not send request", err)
}
response, err := io.ReadAll(resp.Body)
if err != nil {
return AgentResponse{}, err
return AgentResponse{}, fmt.Errorf("Could not read body", err)
}
agentResponse := AgentResponse{}
err = json.Unmarshal(response, &agentResponse)
if err != nil {
return AgentResponse{}, err
return AgentResponse{}, fmt.Errorf("Could not unmarshal response, response: %s", string(response), err)
}
if len(agentResponse.Choices) != 1 {
@ -245,7 +246,7 @@ func (client *AgentClient) RunAgent(userId uuid.UUID, imageId uuid.UUID, imageNa
request := AgentRequestBody{
Tools: &tools,
ToolChoice: &toolChoice,
Model: "gpt-4.1-mini",
Model: "google/gemini-2.5-flash",
RandomSeed: &seed,
Temperature: 0.3,
EndToolCall: client.Options.EndToolCall,

View File

@ -2,6 +2,7 @@ package agents
import (
"context"
"fmt"
"screenmark/screenmark/agents/client"
"screenmark/screenmark/models"
@ -10,15 +11,10 @@ import (
)
// noteAgentPrompt is the system prompt for the description agent: Describe
// hands it to request.Chat.AddSystem before attaching the image.
// NOTE(review): this text comes from a rendered diff, so the literal below
// appears to interleave lines from the previous and the new wording of the
// prompt (note-extraction vs. image-description instructions) — confirm
// against the checked-out file which lines are actually live.
const noteAgentPrompt = `
You are a helpful agent, who's job is to extract notes from images.
Not all images contain notes, in such cases there's not need to create them.
You are an AI agent who's job is to describe the image you see.
An image can have more than one note.
You must return markdown, and adapt the text to best fit markdown.
Do not return anything except markdown.
If the image contains code, add this inside code blocks. You must try and correctly guess the language too.
You should also add any text you see in the image, if no text exists, just add a description.
Be consise and don't add too much extra information or formatting characters, simple text.
`
type DescriptionAgent struct {
@ -27,9 +23,9 @@ type DescriptionAgent struct {
imageModel models.ImageModel
}
func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imageData []byte) error {
func (agent DescriptionAgent) Describe(log *log.Logger, imageId uuid.UUID, imageName string, imageData []byte) error {
request := client.AgentRequestBody{
Model: "gpt-4.1-nano",
Model: "google/gemini-2.5-flash-lite-preview-06-17",
Temperature: 0.3,
ResponseFormat: client.ResponseFormat{
Type: "text",
@ -42,15 +38,18 @@ func (agent DescriptionAgent) Describe(imageId uuid.UUID, imageName string, imag
request.Chat.AddSystem(noteAgentPrompt)
request.Chat.AddImage(imageName, imageData, nil)
log.Debug("Sending description request")
resp, err := agent.client.Request(&request)
if err != nil {
return err
return fmt.Errorf("Could not request", err)
}
ctx := context.Background()
markdown := resp.Choices[0].Message.Content
log.Debugf("Response %s", markdown)
err = agent.imageModel.AddDescription(ctx, imageId, markdown)
if err != nil {
return err

View File

@ -72,7 +72,11 @@ func ListenNewImageEvents(db *sql.DB, notifier *Notifier[Notification]) {
}
descriptionAgent := agents.NewDescriptionAgent(createLogger("Description 📝", splitWriter), imageModel)
descriptionAgent.Describe(image.Image.ID, image.Image.ImageName, image.Image.Image)
err = descriptionAgent.Describe(createLogger("Description 📓", splitWriter), image.Image.ID, image.Image.ImageName, image.Image.Image)
if err != nil {
log.Error(err)
}
listAgent := agents.NewListAgent(createLogger("Lists 🖋️", splitWriter), listModel)
listAgent.RunAgent(image.UserID, image.ImageID, image.Image.ImageName, image.Image.Image)