diff --git a/astrbot/core/tools/computer_tools/fs.py b/astrbot/core/tools/computer_tools/fs.py index 5660022fd0..f5b1f506df 100644 --- a/astrbot/core/tools/computer_tools/fs.py +++ b/astrbot/core/tools/computer_tools/fs.py @@ -38,6 +38,8 @@ from dataclasses import dataclass, field from pathlib import Path +import mcp.types + from astrbot.api import FunctionTool, logger from astrbot.api.event import MessageChain from astrbot.core.agent.run_context import ContextWrapper @@ -215,6 +217,66 @@ def _decode_escaped_text(value: str) -> str: ) +def _provider_supports_image(context: ContextWrapper[AstrAgentContext]) -> bool: + """Check if the current provider supports image modality.""" + try: + umo = context.context.event.unified_msg_origin + provider = context.context.context.get_using_provider(umo=umo) + if provider is None: + return True # Cannot determine, assume supported + modalities = provider.provider_config.get("modalities", []) + return "image" in modalities + except Exception: + return True # Cannot determine, assume supported + + +async def _caption_image_fallback( + context: ContextWrapper[AstrAgentContext], + image_path: str, +) -> ToolExecResult: + """Try to caption an image using the configured image caption provider. + + Returns the caption text or an error message if no caption provider is available. + """ + from astrbot.core.provider.provider import Provider + + umo = context.context.event.unified_msg_origin + cfg = context.context.context.get_config(umo=umo) + provider_settings = cfg.get("provider_settings", {}) + caption_provider_id = provider_settings.get("default_image_caption_provider_id", "") + + if not caption_provider_id: + return ( + "Error: your provider does not support image modality, " + "and no image caption provider is configured. Unable to read image file." + ) + + caption_provider = context.context.context.get_provider_by_id(caption_provider_id) + if caption_provider is None or not isinstance(caption_provider, Provider): + return ( + "Error: your provider does not support image modality, " + f"and the configured image caption provider `{caption_provider_id}` is not available. " + "Unable to read image file." + ) + + caption_prompt = provider_settings.get( + "image_caption_prompt", "Please describe the image." + ) + + try: + llm_resp = await caption_provider.text_chat( + prompt=caption_prompt, + image_urls=[image_path], + ) + caption = (llm_resp.completion_text or "").strip() + if not caption: + return "Error: image caption provider returned an empty description." + return f"[Image description]: {caption}" + except Exception as exc: + logger.error(f"Image captioning failed: {exc}") + return f"Error: failed to generate image description: {exc}" + + @builtin_tool(config=_COMPUTER_RUNTIME_TOOL_CONFIG) @dataclass class FileReadTool(FunctionTool): @@ -281,7 +343,7 @@ async def call( context.context.context, context.context.event.unified_msg_origin, ) - return await read_file_tool_result( + result = await read_file_tool_result( sb, local_mode=local_env, path=normalized_path, @@ -293,6 +355,20 @@ async def call( else None ), ) + + # If the result is an image and the provider doesn't support image modality, + # fall back to image captioning or return an error. + if ( + isinstance(result, mcp.types.CallToolResult) + and result.content + and any( + isinstance(item, mcp.types.ImageContent) for item in result.content + ) + and not _provider_supports_image(context) + ): + return await _caption_image_fallback(context, normalized_path) + + return result except PermissionError as exc: return f"Error: {exc}" except Exception as exc: