feat(core): Parse Multimodal MCP Tool responses (#5529)

Co-authored-by: Luccas Paroni <luccasparoni@google.com>
2025-08-05 16:19:47 -03:00 · 2025-08-05 16:19:47 -03:00 · 2778c7d851
parent b465145229
commit 2778c7d851
4 changed files with 580 additions and 56 deletions
--- a/docs/core/tools-api.md
+++ b/docs/core/tools-api.md
@ -15,9 +15,11 @@ The Gemini CLI core (`packages/core`) features a robust system for defining, reg
  - `execute()`: The core method that performs the tool's action and returns a `ToolResult`.

 - **`ToolResult` (`tools.ts`):** An interface defining the structure of a tool's execution outcome:
-  - `llmContent`: The factual string content to be included in the history sent back to the LLM for context.
+  - `llmContent`: The factual content to be included in the history sent back to the LLM for context. This is a `PartListUnion`: either a simple string, a single `Part`, or an array of `Part` objects and strings for rich content.
  - `returnDisplay`: A user-friendly string (often Markdown) or a special object (like `FileDiff`) for display in the CLI.

+- **Returning Rich Content:** Tools are not limited to returning simple text. Because `llmContent` is a `PartListUnion`, it can be an array containing a mix of `Part` objects (for images, audio, etc.) and `string`s. This allows a single tool execution to return multiple pieces of rich content.
+
 - **Tool Registry (`tool-registry.ts`):** A class (`ToolRegistry`) responsible for:
  - **Registering Tools:** Holding a collection of all available built-in tools (e.g., `ReadFileTool`, `ShellTool`).
  - **Discovering Tools:** It can also discover tools dynamically:
--- a/docs/tools/mcp-server.md
+++ b/docs/tools/mcp-server.md
@ -571,6 +571,56 @@ The MCP integration tracks several states:

 This comprehensive integration makes MCP servers a powerful way to extend the Gemini CLI's capabilities while maintaining security, reliability, and ease of use.

+## Returning Rich Content from Tools
+
+MCP tools are not limited to returning simple text. You can return rich, multi-part content, including text, images, audio, and other binary data in a single tool response. This allows you to build powerful tools that can provide diverse information to the model in a single turn.
+
+All data returned from the tool is processed and sent to the model as context for its next generation, enabling it to reason about or summarize the provided information.
+
+### How It Works
+
+To return rich content, your tool's response must adhere to the MCP specification for a [`CallToolResult`](https://modelcontextprotocol.io/specification/2025-06-18/server/tools#tool-result). The `content` field of the result should be an array of `ContentBlock` objects. The Gemini CLI will correctly process this array, separating text from binary data and packaging it for the model.
+
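+You can mix and match different content block types in the `content` array. Blocks of an unsupported type are not forwarded to the model and appear in the CLI only as `[Unknown content type: <type>]`. The supported block types are: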
+
+- `text`
+- `image`
+- `audio`
+- `resource` (embedded content)
+- `resource_link`
+
+### Example: Returning Text and an Image
+
+Here is an example of a valid JSON response from an MCP tool that returns both a text description and an image:
+
+```json
+{
+  "content": [
+    {
+      "type": "text",
+      "text": "Here is the logo you requested."
+    },
+    {
+      "type": "image",
+      "data": "BASE64_ENCODED_IMAGE_DATA_HERE",
+      "mimeType": "image/png"
+    },
+    {
+      "type": "text",
+      "text": "The logo was created in 2025."
+    }
+  ]
+}
+```
+
+When the Gemini CLI receives this response, it will:
+
+1.  Extract all the text and combine it into a single `functionResponse` part for the model.
+2.  Present the image data as a separate `inlineData` part.
+3.  Display a clean, user-friendly summary in the CLI, one content block per line: text blocks are shown as-is and the image is summarized as `[Image: image/png]`.
+
+This enables you to build sophisticated tools that can provide rich, multi-modal context to the Gemini model.
+
 ## MCP Prompts as Slash Commands

 In addition to tools, MCP servers can expose predefined prompts that can be executed as slash commands within the Gemini CLI. This allows you to create shortcuts for common or complex queries that can be easily invoked by name.
--- a/packages/core/src/tools/mcp-tool.test.ts
+++ b/packages/core/src/tools/mcp-tool.test.ts
@ -131,8 +131,11 @@ describe('DiscoveredMCPTool', () => {
        success: true,
        details: 'executed',
      };
-      const mockFunctionResponseContent: Part[] = [
-        { text: JSON.stringify(mockToolSuccessResultObject) },
+      const mockFunctionResponseContent = [
+        {
+          type: 'text',
+          text: JSON.stringify(mockToolSuccessResultObject),
+        },
      ];
      const mockMcpToolResponseParts: Part[] = [
        {
@ -149,11 +152,13 @@ describe('DiscoveredMCPTool', () => {
      expect(mockCallTool).toHaveBeenCalledWith([
        { name: serverToolName, args: params },
      ]);
-      expect(toolResult.llmContent).toEqual(mockMcpToolResponseParts);

      const stringifiedResponseContent = JSON.stringify(
        mockToolSuccessResultObject,
      );
+      expect(toolResult.llmContent).toEqual([
+        { text: stringifiedResponseContent },
+      ]);
      expect(toolResult.returnDisplay).toBe(stringifiedResponseContent);
    });

@ -170,6 +175,9 @@ describe('DiscoveredMCPTool', () => {
      mockCallTool.mockResolvedValue(mockMcpToolResponsePartsEmpty);
      const toolResult: ToolResult = await tool.execute(params);
      expect(toolResult.returnDisplay).toBe('```json\n[]\n```');
+      expect(toolResult.llmContent).toEqual([
+        { text: '[Error: Could not parse tool response]' },
+      ]);
    });

    it('should propagate rejection if mcpTool.callTool rejects', async () => {
@ -186,6 +194,361 @@ describe('DiscoveredMCPTool', () => {

      await expect(tool.execute(params)).rejects.toThrow(expectedError);
    });
+
+    it('should handle a simple text response correctly', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { query: 'test' };
+      const successMessage = 'This is a success message.';
+
+      // Simulate the response from the GenAI SDK, which wraps the MCP
+      // response in a functionResponse Part.
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              // The `content` array contains MCP ContentBlocks.
+              content: [{ type: 'text', text: successMessage }],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      // 1. Assert that the llmContent sent to the scheduler is a clean Part array.
+      expect(toolResult.llmContent).toEqual([{ text: successMessage }]);
+
+      // 2. Assert that the display output is the simple text message.
+      expect(toolResult.returnDisplay).toBe(successMessage);
+
+      // 3. Verify that the underlying callTool was made correctly.
+      expect(mockCallTool).toHaveBeenCalledWith([
+        { name: serverToolName, args: params },
+      ]);
+    });
+
+    it('should handle an AudioBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'play' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'audio',
+                  data: 'BASE64_AUDIO_DATA',
+                  mimeType: 'audio/mp3',
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        {
+          text: `[Tool '${serverToolName}' provided the following audio data with mime-type: audio/mp3]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'audio/mp3',
+            data: 'BASE64_AUDIO_DATA',
+          },
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe('[Audio: audio/mp3]');
+    });
+
+    it('should handle a ResourceLinkBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { resource: 'get' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'resource_link',
+                  uri: 'file:///path/to/thing',
+                  name: 'resource-name',
+                  title: 'My Resource',
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        {
+          text: 'Resource Link: My Resource at file:///path/to/thing',
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        '[Link to My Resource: file:///path/to/thing]',
+      );
+    });
+
+    it('should handle an embedded text ResourceBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { resource: 'get' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'resource',
+                  resource: {
+                    uri: 'file:///path/to/text.txt',
+                    text: 'This is the text content.',
+                    mimeType: 'text/plain',
+                  },
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        { text: 'This is the text content.' },
+      ]);
+      expect(toolResult.returnDisplay).toBe('This is the text content.');
+    });
+
+    it('should handle an embedded binary ResourceBlock response', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { resource: 'get' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                {
+                  type: 'resource',
+                  resource: {
+                    uri: 'file:///path/to/data.bin',
+                    blob: 'BASE64_BINARY_DATA',
+                    mimeType: 'application/octet-stream',
+                  },
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        {
+          text: `[Tool '${serverToolName}' provided the following embedded resource with mime-type: application/octet-stream]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'application/octet-stream',
+            data: 'BASE64_BINARY_DATA',
+          },
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        '[Embedded Resource: application/octet-stream]',
+      );
+    });
+
+    it('should handle a mix of content block types', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'complex' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                { type: 'text', text: 'First part.' },
+                {
+                  type: 'image',
+                  data: 'BASE64_IMAGE_DATA',
+                  mimeType: 'image/jpeg',
+                },
+                { type: 'text', text: 'Second part.' },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        { text: 'First part.' },
+        {
+          text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'image/jpeg',
+            data: 'BASE64_IMAGE_DATA',
+          },
+        },
+        { text: 'Second part.' },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        'First part.\n[Image: image/jpeg]\nSecond part.',
+      );
+    });
+
+    it('should ignore unknown content block types', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'test' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                { type: 'text', text: 'Valid part.' },
+                { type: 'future_block', data: 'some-data' },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([{ text: 'Valid part.' }]);
+      expect(toolResult.returnDisplay).toBe(
+        'Valid part.\n[Unknown content type: future_block]',
+      );
+    });
+
+    it('should handle a complex mix of content block types', async () => {
+      const tool = new DiscoveredMCPTool(
+        mockCallableToolInstance,
+        serverName,
+        serverToolName,
+        baseDescription,
+        inputSchema,
+      );
+      const params = { action: 'super-complex' };
+      const sdkResponse: Part[] = [
+        {
+          functionResponse: {
+            name: serverToolName,
+            response: {
+              content: [
+                { type: 'text', text: 'Here is a resource.' },
+                {
+                  type: 'resource_link',
+                  uri: 'file:///path/to/resource',
+                  name: 'resource-name',
+                  title: 'My Resource',
+                },
+                {
+                  type: 'resource',
+                  resource: {
+                    uri: 'file:///path/to/text.txt',
+                    text: 'Embedded text content.',
+                    mimeType: 'text/plain',
+                  },
+                },
+                {
+                  type: 'image',
+                  data: 'BASE64_IMAGE_DATA',
+                  mimeType: 'image/jpeg',
+                },
+              ],
+            },
+          },
+        },
+      ];
+      mockCallTool.mockResolvedValue(sdkResponse);
+
+      const toolResult = await tool.execute(params);
+
+      expect(toolResult.llmContent).toEqual([
+        { text: 'Here is a resource.' },
+        {
+          text: 'Resource Link: My Resource at file:///path/to/resource',
+        },
+        { text: 'Embedded text content.' },
+        {
+          text: `[Tool '${serverToolName}' provided the following image data with mime-type: image/jpeg]`,
+        },
+        {
+          inlineData: {
+            mimeType: 'image/jpeg',
+            data: 'BASE64_IMAGE_DATA',
+          },
+        },
+      ]);
+      expect(toolResult.returnDisplay).toBe(
+        'Here is a resource.\n[Link to My Resource: file:///path/to/resource]\nEmbedded text content.\n[Image: image/jpeg]',
+      );
+    });
  });

  describe('shouldConfirmExecute', () => {
--- a/packages/core/src/tools/mcp-tool.ts
+++ b/packages/core/src/tools/mcp-tool.ts
@ -22,6 +22,40 @@ import {

 type ToolParams = Record<string, unknown>;

+// Discriminated union for MCP Content Blocks to ensure type safety.
+type McpTextBlock = {
+  type: 'text';
+  text: string;
+};
+
+type McpMediaBlock = {
+  type: 'image' | 'audio';
+  mimeType: string;
+  data: string;
+};
+
+type McpResourceBlock = {
+  type: 'resource';
+  resource: {
+    text?: string;
+    blob?: string;
+    mimeType?: string;
+  };
+};
+
+type McpResourceLinkBlock = {
+  type: 'resource_link';
+  uri: string;
+  title?: string;
+  name?: string;
+};
+
+type McpContentBlock =
+  | McpTextBlock
+  | McpMediaBlock
+  | McpResourceBlock
+  | McpResourceLinkBlock;
+
 export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> {
  private static readonly allowlist: Set<string> = new Set();

@ -114,70 +148,145 @@ export class DiscoveredMCPTool extends BaseTool<ToolParams, ToolResult> {
      },
    ];

-    const responseParts: Part[] = await this.mcpTool.callTool(functionCalls);
+    const rawResponseParts = await this.mcpTool.callTool(functionCalls);
+    const transformedParts = transformMcpContentToParts(rawResponseParts);

    return {
-      llmContent: responseParts,
-      returnDisplay: getStringifiedResultForDisplay(responseParts),
+      llmContent: transformedParts,
+      returnDisplay: getStringifiedResultForDisplay(rawResponseParts),
    };
  }
 }

-/**
- * Processes an array of `Part` objects, primarily from a tool's execution result,
- * to generate a user-friendly string representation, typically for display in a CLI.
- *
- * The `result` array can contain various types of `Part` objects:
- * 1. `FunctionResponse` parts:
- *    - If the `response.content` of a `FunctionResponse` is an array consisting solely
- *      of `TextPart` objects, their text content is concatenated into a single string.
- *      This is to present simple textual outputs directly.
- *    - If `response.content` is an array but contains other types of `Part` objects (or a mix),
- *      the `content` array itself is preserved. This handles structured data like JSON objects or arrays
- *      returned by a tool.
- *    - If `response.content` is not an array or is missing, the entire `functionResponse`
- *      object is preserved.
- * 2. Other `Part` types (e.g., `TextPart` directly in the `result` array):
- *    - These are preserved as is.
- *
- * All processed parts are then collected into an array, which is JSON.stringify-ed
- * with indentation and wrapped in a markdown JSON code block.
- */
-function getStringifiedResultForDisplay(result: Part[]) {
-  if (!result || result.length === 0) {
-    return '```json\n[]\n```';
+function transformTextBlock(block: McpTextBlock): Part {
+  return { text: block.text };
+}
+
+function transformImageAudioBlock(
+  block: McpMediaBlock,
+  toolName: string,
+): Part[] {
+  return [
+    {
+      text: `[Tool '${toolName}' provided the following ${
+        block.type
+      } data with mime-type: ${block.mimeType}]`,
+    },
+    {
+      inlineData: {
+        mimeType: block.mimeType,
+        data: block.data,
+      },
+    },
+  ];
+}
+
+function transformResourceBlock(
+  block: McpResourceBlock,
+  toolName: string,
+): Part | Part[] | null {
+  const resource = block.resource;
+  if (resource?.text) {
+    return { text: resource.text };
  }
+  if (resource?.blob) {
+    const mimeType = resource.mimeType || 'application/octet-stream';
+    return [
+      {
+        text: `[Tool '${toolName}' provided the following embedded resource with mime-type: ${mimeType}]`,
+      },
+      {
+        inlineData: {
+          mimeType,
+          data: resource.blob,
+        },
+      },
+    ];
+  }
+  return null;
+}

-  const processFunctionResponse = (part: Part) => {
-    if (part.functionResponse) {
-      const responseContent = part.functionResponse.response?.content;
-      if (responseContent && Array.isArray(responseContent)) {
-        // Check if all parts in responseContent are simple TextParts
-        const allTextParts = responseContent.every(
-          (p: Part) => p.text !== undefined,
-        );
-        if (allTextParts) {
-          return responseContent.map((p: Part) => p.text).join('');
-        }
-        // If not all simple text parts, return the array of these content parts for JSON stringification
-        return responseContent;
-      }
-
-      // If no content, or not an array, or not a functionResponse, stringify the whole functionResponse part for inspection
-      return part.functionResponse;
-    }
-    return part; // Fallback for unexpected structure or non-FunctionResponsePart
+function transformResourceLinkBlock(block: McpResourceLinkBlock): Part {
+  return {
+    text: `Resource Link: ${block.title || block.name} at ${block.uri}`,
  };
+}

-  const processedResults =
-    result.length === 1
-      ? processFunctionResponse(result[0])
-      : result.map(processFunctionResponse);
-  if (typeof processedResults === 'string') {
-    return processedResults;
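+/**
+ * Transforms the raw MCP content blocks from the SDK response into a
+ * standard GenAI Part array.
+ *
+ * Only the `functionResponse` of the first element of `sdkResponse` is read.
+ * Blocks of an unrecognized type, and `resource` blocks that carry neither a
+ * non-empty `text` nor a non-empty `blob`, are omitted from the result.
+ *
+ * @param sdkResponse The raw Part[] array from `mcpTool.callTool()`.
+ * @returns A clean Part[] array ready for the scheduler, or a single
+ *   `[Error: Could not parse tool response]` text part when the response
+ *   has no `content` array.
+ */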
+function transformMcpContentToParts(sdkResponse: Part[]): Part[] {
+  const funcResponse = sdkResponse?.[0]?.functionResponse;
+  const mcpContent = funcResponse?.response?.content as McpContentBlock[];
+  const toolName = funcResponse?.name || 'unknown tool';
+
+  if (!Array.isArray(mcpContent)) {
+    return [{ text: '[Error: Could not parse tool response]' }];
  }

-  return '```json\n' + JSON.stringify(processedResults, null, 2) + '\n```';
+  const transformed = mcpContent.flatMap(
+    (block: McpContentBlock): Part | Part[] | null => {
+      switch (block.type) {
+        case 'text':
+          return transformTextBlock(block);
+        case 'image':
+        case 'audio':
+          return transformImageAudioBlock(block, toolName);
+        case 'resource':
+          return transformResourceBlock(block, toolName);
+        case 'resource_link':
+          return transformResourceLinkBlock(block);
+        default:
+          return null;
+      }
+    },
+  );
+
+  return transformed.filter((part): part is Part => part !== null);
+}
+
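+/**
+ * Processes the raw response from the MCP tool to generate a clean,
+ * human-readable string for display in the CLI. Text is presented directly,
+ * non-text content is summarized as a bracketed placeholder, and the
+ * per-block results are joined with newlines.
+ *
+ * @param rawResponse The raw Part[] array from the GenAI SDK; only the
+ *   `functionResponse` of its first element is read.
+ * @returns A formatted string representing the tool's output. When the
+ *   response has no `content` array, the raw response is returned instead
+ *   as pretty-printed JSON inside a markdown code block.
+ */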
+function getStringifiedResultForDisplay(rawResponse: Part[]): string {
+  const mcpContent = rawResponse?.[0]?.functionResponse?.response
+    ?.content as McpContentBlock[];
+
+  if (!Array.isArray(mcpContent)) {
+    return '```json\n' + JSON.stringify(rawResponse, null, 2) + '\n```';
+  }
+
+  const displayParts = mcpContent.map((block: McpContentBlock): string => {
+    switch (block.type) {
+      case 'text':
+        return block.text;
+      case 'image':
+        return `[Image: ${block.mimeType}]`;
+      case 'audio':
+        return `[Audio: ${block.mimeType}]`;
+      case 'resource_link':
+        return `[Link to ${block.title || block.name}: ${block.uri}]`;
+      case 'resource':
+        if (block.resource?.text) {
+          return block.resource.text;
+        }
+        return `[Embedded Resource: ${
+          block.resource?.mimeType || 'unknown type'
+        }]`;
+      default:
+        return `[Unknown content type: ${(block as { type: string }).type}]`;
+    }
+  });
+
+  return displayParts.join('\n');
 }

 /** Visible for testing */