browser-use/browser-use

AnthropicMessageSerializer applies cache_control to all blocks instead of only the last, risking Anthropic breakpoint overage

Summary

  • Context: AnthropicMessageSerializer converts browser-use’s internal message format to Anthropic’s API format, handling cache control for prompt caching optimization.

  • Bug: When a message has cache=True, the serializer applies cache_control to every content block (text parts, images, tool calls) instead of only the last block.

  • Actual vs. expected: The Anthropic API allows a maximum of 4 cache breakpoints per request, so a message with cache=True is expected to contribute a single breakpoint, placed on its last content block. The current implementation instead adds cache_control to all blocks within such a message and can easily exceed the limit.

  • Impact: Requests with multiple content blocks can exceed Anthropic’s 4 cache breakpoint limit, potentially causing API errors, unexpected caching behavior, or degraded performance. This is especially problematic for typical agent conversations with system prompts, screenshots, and tool calls.

Code with bug

@staticmethod
def _serialize_content_part_text(
    part: ContentPartTextParam,
    use_cache: bool,
) -> TextBlockParam:
    """Convert a text content part to Anthropic's TextBlockParam.

    Args:
        part: Text content part; its ``text`` is copied into the block.
        use_cache: Forwarded to ``_serialize_cache_control`` to build the
            block's ``cache_control`` value.

    Returns:
        A ``TextBlockParam`` with ``type="text"``.
    """
    # NOTE(review): this helper has no view of the part's position in the
    # message, so it presumably marks the block whenever the caller passes
    # use_cache=True; limiting the marker to one block is up to the caller.
    return TextBlockParam(
        text=part.text,
        type="text",
        cache_control=AnthropicMessageSerializer._serialize_cache_control(use_cache),  # <-- BUG 🔴 Applies cache_control to every text part when use_cache=True
    )


@staticmethod
def _serialize_content(
    content: str | list[ContentPartTextParam | ContentPartImageParam],
    use_cache: bool = False,
) -> str | list[TextBlockParam | ImageBlockParam]:
    """Serialize content to Anthropic format.

    Args:
        content: Either a plain string or a list of text / image parts.
        use_cache: Whether the owning message asked for prompt caching.

    Returns:
        The string unchanged when ``content`` is a string and ``use_cache``
        is false; otherwise a list of serialized blocks.
    """
    if isinstance(content, str):
        if use_cache:
            # A bare string has no field for cache_control, so it is wrapped
            # in a single text block that carries the marker.
            return [
                TextBlockParam(
                    text=content,
                    type="text",
                    cache_control=CacheControlEphemeralParam(type="ephemeral"),
                )
            ]
        else:
            return content

    serialized_blocks: list[TextBlockParam | ImageBlockParam] = []

    # Parts whose type is neither "text" nor "image_url" are skipped.
    for part in content:
        if part.type == "text":
            # The same use_cache flag is handed to every text part, regardless
            # of where the part sits in the list.
            serialized_blocks.append(
                AnthropicMessageSerializer._serialize_content_part_text(
                    part,
                    use_cache,
                )  # <-- BUG 🔴 Passes use_cache=True to all text parts
            )
        elif part.type == "image_url":
            # Image parts are serialized without any cache flag.
            serialized_blocks.append(
                AnthropicMessageSerializer._serialize_content_part_image(part)
            )

    return serialized_blocks


@staticmethod
def _serialize_tool_calls_to_content(
    tool_calls,
    use_cache: bool = False,
) -> list[ToolUseBlockParam]:
    """Convert tool calls to Anthropic's ToolUseBlockParam format.

    Args:
        tool_calls: Iterable of tool calls; each is read for ``id``,
            ``function.name`` and ``function.arguments``.
        use_cache: Forwarded to ``_serialize_cache_control`` for every block.

    Returns:
        One ``ToolUseBlockParam`` per tool call, in input order.
    """
    blocks: list[ToolUseBlockParam] = []

    for tool_call in tool_calls:
        try:
            input_obj = json.loads(tool_call.function.arguments)
        except json.JSONDecodeError:
            # Arguments that are not valid JSON are kept verbatim under an
            # "arguments" key instead of failing the whole serialization.
            input_obj = {"arguments": tool_call.function.arguments}

        blocks.append(
            ToolUseBlockParam(
                id=tool_call.id,
                input=input_obj,
                name=tool_call.function.name,
                type="tool_use",
                cache_control=AnthropicMessageSerializer._serialize_cache_control(use_cache),  # <-- BUG 🔴 Applies cache_control to every tool call when use_cache=True
            )
        )

    return blocks


elif isinstance(message, AssistantMessage):
    # Handle content and tool calls
    blocks: list[TextBlockParam | ToolUseBlockParam] = []

    # Add content blocks if present
    if message.content is not None:
        if isinstance(message.content, str):
            blocks.append(
                TextBlockParam(
                    text=message.content,
                    type="text",
                    cache_control=AnthropicMessageSerializer._serialize_cache_control(
                        message.cache
                    ),  # <-- BUG 🔴 Applies cache_control to single-string content
                )
            )
        else:
            # Process content parts (text and refusal)
            for part in message.content:
                if part.type == "text":
                    blocks.append(
                        AnthropicMessageSerializer._serialize_content_part_text(
                            part,
                            use_cache=message.cache,
                        )  # <-- BUG 🔴 Applies cache_control to all content parts
                    )

    # Add tool use blocks if present
    if message.tool_calls:
        tool_blocks = AnthropicMessageSerializer._serialize_tool_calls_to_content(
            message.tool_calls,
            use_cache=message.cache,
        )
        blocks.extend(tool_blocks)  # <-- BUG 🔴 All tool_use blocks get cache_control

Failing test

from browser_use.llm.anthropic.serializer import AnthropicMessageSerializer
from browser_use.llm.messages import ContentPartTextParam, UserMessage


def count_cache_blocks(content):
    """Return how many dict blocks in ``content`` carry a non-None cache_control.

    Anything that is not a list (a plain string included) counts as zero.
    """
    if not isinstance(content, list):
        return 0

    marked = [
        entry
        for entry in content
        if isinstance(entry, dict) and entry.get("cache_control") is not None
    ]
    return len(marked)


def test_user_message_with_multiple_text_parts():
    """Only the LAST text part of a cached message should carry cache_control."""
    user_msg = UserMessage(
        content=[
            ContentPartTextParam(text="Part 1"),
            ContentPartTextParam(text="Part 2"),
            ContentPartTextParam(text="Part 3"),
        ],
        cache=True,
    )

    serialized = AnthropicMessageSerializer.serialize(user_msg)
    blocks = serialized["content"]

    cache_count = count_cache_blocks(blocks)

    assert (
        cache_count == 1
    ), f"Expected 1 cache_control block, got {cache_count}"

    # A count of 1 alone would also pass if the single breakpoint sat on the
    # first or middle block, so pin its position explicitly.
    assert (
        blocks[-1].get("cache_control") is not None
    ), "Expected cache_control on the last content block"

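Test output (expected while the bug is present: each of the three text parts receives cache_control, so the count is 3):

AssertionError: Expected 1 cache_control block, got 3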


Recommended fix

The fix should ensure that when use_cache=True for a message, only the last content block in that message receives cache_control:

@staticmethod
def _serialize_content(
    content: str | list[ContentPartTextParam | ContentPartImageParam],
    use_cache: bool = False,
) -> str | list[TextBlockParam | ImageBlockParam]:
    """Serialize content to Anthropic format.

    When ``use_cache`` is true, exactly one block of the result carries
    ``cache_control``: the last serialized block, whatever its type.

    Args:
        content: Either a plain string or a list of text / image parts.
        use_cache: Whether the owning message asked for prompt caching.

    Returns:
        The string unchanged when ``content`` is a string and ``use_cache``
        is false; otherwise a list of serialized blocks.
    """
    if isinstance(content, str):
        if use_cache:
            # A bare string has no field for cache_control, so it is wrapped
            # in a single text block that carries the marker.
            return [
                TextBlockParam(
                    text=content,
                    type="text",
                    cache_control=CacheControlEphemeralParam(type="ephemeral"),
                )
            ]
        else:
            return content

    serialized_blocks: list[TextBlockParam | ImageBlockParam] = []

    # Serialize every part without a cache marker first; parts whose type is
    # neither "text" nor "image_url" are skipped.
    for part in content:
        if part.type == "text":
            serialized_blocks.append(
                AnthropicMessageSerializer._serialize_content_part_text(
                    part,
                    use_cache=False,
                )
            )
        elif part.type == "image_url":
            serialized_blocks.append(
                AnthropicMessageSerializer._serialize_content_part_image(part)
            )

    # Mark the last block that was actually emitted. Deciding by input index
    # would drop the breakpoint whenever the final part is an image (or a
    # skipped part), because only text parts received the flag.
    if use_cache and serialized_blocks:
        serialized_blocks[-1]["cache_control"] = CacheControlEphemeralParam(
            type="ephemeral"
        )  # <-- FIX 🟢 Only cache last block

    return serialized_blocks

Similar fixes needed in:

  • _serialize_content_to_str()

  • _serialize_tool_calls_to_content()

  • AssistantMessage serialization logic

Update tests to verify only the last block within each message has cache_control when cache=True.