Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions packages/core/lib/v3/agent/AnthropicCUAClient.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,29 @@ import { v7 as uuidv7 } from "uuid";

export type ResponseInputItem = AnthropicMessage | AnthropicToolResult;

/**
 * Matches a base64 image data URL and captures:
 *   1. the media type (e.g. `image/jpeg`)
 *   2. the base64 payload (everything after the first `;base64,`)
 *
 * Optional `;name=value` parameters between the media type and `;base64`
 * (allowed by RFC 2397, e.g. `data:image/png;charset=utf-8;base64,...`) are
 * accepted and ignored. Parameters cannot contain `;` or `,`, so the first
 * `;base64,` is always the payload delimiter.
 */
const IMAGE_DATA_URL_PATTERN =
  /^data:(image\/[a-zA-Z0-9.+-]+)(?:;[^;,]+)*;base64,([\s\S]*)$/i;

/** Plain PNG data-URL prefix, stripped in the fallback path. */
const PNG_DATA_URL_PREFIX = /^data:image\/png;base64,/i;

/**
 * Splits a screenshot string into the media type and raw base64 payload used
 * to populate the `media_type` / `data` fields of an image tool result.
 *
 * @param screenshot - Either an image data URL
 *   (`data:image/<subtype>[;params];base64,<payload>`) or a bare base64 string.
 *   Leading/trailing whitespace is ignored.
 * @returns The lower-cased media type and the payload with the data-URL
 *   prefix removed. Inputs that are not image data URLs are returned
 *   (trimmed) as the payload and labelled `image/png`.
 */
function getImageToolResultSource(screenshot: string): {
  mediaType: string;
  base64Data: string;
} {
  // Surrounding whitespace would otherwise defeat the anchored pattern and
  // send the whole data URL through the PNG fallback as if it were base64.
  const trimmed = screenshot.trim();

  const match = IMAGE_DATA_URL_PATTERN.exec(trimmed);
  if (match?.[1] && typeof match[2] === "string") {
    return {
      mediaType: match[1].toLowerCase(),
      base64Data: match[2],
    };
  }

  // Fallback preserves existing PNG behavior for malformed/unexpected values.
  return {
    mediaType: "image/png",
    base64Data: trimmed.replace(PNG_DATA_URL_PREFIX, ""),
  };
}

/**
* Client for Anthropic's Computer Use API
* This implementation uses the official Anthropic Messages API for Computer Use
Expand Down Expand Up @@ -661,15 +684,16 @@ export class AnthropicCUAClient extends AgentClient {
message: `Screenshot captured, length: ${screenshot.length}`,
level: 2,
});
const screenshotSource = getImageToolResultSource(screenshot);

// Create proper image content block for Anthropic
const imageContent = [
{
type: "image",
source: {
type: "base64",
media_type: "image/png",
data: screenshot.replace(/^data:image\/png;base64,/, ""),
media_type: screenshotSource.mediaType,
data: screenshotSource.base64Data,
},
},
];
Expand Down Expand Up @@ -770,6 +794,7 @@ export class AnthropicCUAClient extends AgentClient {
// For computer tool, try to capture a screenshot even on error
if (item.name === "computer") {
const screenshot = await this.captureScreenshot();
const screenshotSource = getImageToolResultSource(screenshot);

toolResults.push({
type: "tool_result",
Expand All @@ -779,8 +804,8 @@ export class AnthropicCUAClient extends AgentClient {
type: "image",
source: {
type: "base64",
media_type: "image/png",
data: screenshot.replace(/^data:image\/png;base64,/, ""),
media_type: screenshotSource.mediaType,
data: screenshotSource.base64Data,
},
},
{
Expand Down
104 changes: 104 additions & 0 deletions packages/core/tests/unit/anthropic-cua-client.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import { describe, expect, it, vi } from "vitest";
import { AnthropicCUAClient } from "../../lib/v3/agent/AnthropicCUAClient.js";
import type { ToolUseItem } from "../../lib/v3/types/public/agent.js";

/**
 * Builds a fresh AnthropicCUAClient for a single test, configured with a
 * placeholder API key.
 */
function createClient() {
  const clientOptions = { apiKey: "test-key" };
  const client = new AnthropicCUAClient(
    "anthropic",
    "anthropic/claude-sonnet-4-6",
    undefined,
    clientOptions,
  );
  return client;
}

// Mock function handed to takeAction as its second argument in the tests
// below; none of the tests assert on its calls.
const noopLogger = vi.fn();

/**
 * Creates a `tool_use` item for the "computer" tool requesting a
 * `left_click` action.
 *
 * @param id - Identifier to assign to the tool_use item.
 */
function computerToolUseItem(id: string): ToolUseItem {
  const item: ToolUseItem = {
    type: "tool_use",
    id,
    name: "computer",
    input: { action: "left_click" },
  };
  return item;
}

/**
 * Returns the `source` of the first image block found in a tool result.
 *
 * @param result - Object whose `content` is either a string or an array of
 *   content blocks.
 * @returns The `source` object of the first block with `type === "image"`
 *   and a truthy `source`.
 * @throws Error if `content` is not an array, or if it holds no such block.
 */
function extractImageSource(result: {
  content: string | Array<{ type: string; source?: Record<string, unknown> }>;
}): { media_type: string; data: string } {
  const { content: blocks } = result;
  if (!Array.isArray(blocks)) {
    throw new Error("Expected tool_result content array");
  }

  // Walk the blocks in order and hand back the first usable image source.
  for (const candidate of blocks) {
    if (candidate.type === "image" && candidate.source) {
      return candidate.source as { media_type: string; data: string };
    }
  }

  throw new Error("Expected image block in tool_result content");
}

// Unit tests for how AnthropicCUAClient.takeAction packages screenshots into
// image blocks of its tool results.
describe("AnthropicCUAClient", () => {
  // Sends one "computer" tool_use item with the given id through takeAction.
  const runComputerTool = (cuaClient: AnthropicCUAClient, toolUseId: string) =>
    cuaClient.takeAction([computerToolUseItem(toolUseId)], noopLogger);

  it("uses the screenshot MIME type for computer tool_result images", async () => {
    const cuaClient = createClient();
    const screenshotSpy = vi.spyOn(cuaClient, "captureScreenshot");
    screenshotSpy.mockResolvedValueOnce("data:image/jpeg;base64,abcd1234");

    const toolResults = await runComputerTool(cuaClient, "tool-1");

    expect(toolResults).toHaveLength(1);
    const { media_type: mediaType, data: payload } = extractImageSource(
      toolResults[0]!,
    );
    expect(mediaType).toBe("image/jpeg");
    expect(payload).toBe("abcd1234");
  });

  it("falls back to PNG metadata when screenshot is not an image data URL", async () => {
    const cuaClient = createClient();
    const screenshotSpy = vi.spyOn(cuaClient, "captureScreenshot");
    screenshotSpy.mockResolvedValueOnce("raw-base64-payload");

    const toolResults = await runComputerTool(cuaClient, "tool-2");

    const { media_type: mediaType, data: payload } = extractImageSource(
      toolResults[0]!,
    );
    expect(mediaType).toBe("image/png");
    expect(payload).toBe("raw-base64-payload");
  });

  it("uses parsed MIME/data in error tool_result screenshot payloads", async () => {
    const cuaClient = createClient();
    // First capture rejects; the second one resolves with a WebP data URL
    // that the test expects to find in the resulting tool_result.
    const screenshotSpy = vi.spyOn(cuaClient, "captureScreenshot");
    screenshotSpy.mockRejectedValueOnce(new Error("capture failed"));
    screenshotSpy.mockResolvedValueOnce("data:image/webp;base64,errorimg");

    const toolResults = await runComputerTool(cuaClient, "tool-3");

    expect(screenshotSpy).toHaveBeenCalledTimes(2);
    const { media_type: mediaType, data: payload } = extractImageSource(
      toolResults[0]!,
    );
    expect(mediaType).toBe("image/webp");
    expect(payload).toBe("errorimg");

    // The failure message must also be surfaced as a text block.
    const errorTextBlock = expect.objectContaining({
      type: "text",
      text: expect.stringContaining("Error: capture failed"),
    });
    expect(toolResults[0]?.content).toEqual(
      expect.arrayContaining([errorTextBlock]),
    );
  });
});
Loading