Summarize tool call outputs using tool specific summarizers (#3745)

2025-07-11 09:29:08 -07:00 · 2025-07-11 09:29:08 -07:00 · 23197151c2
parent cdbe2fffd9
commit 23197151c2
9 changed files with 421 additions and 9 deletions
--- a/packages/cli/src/ui/hooks/useToolScheduler.test.ts
+++ b/packages/cli/src/ui/hooks/useToolScheduler.test.ts
@ -128,6 +128,7 @@ describe('useReactToolScheduler in YOLO Mode', () => {
    (mockToolRequiresConfirmation.execute as Mock).mockResolvedValue({
      llmContent: expectedOutput,
      returnDisplay: 'YOLO Formatted tool output',
+      summary: 'YOLO summary',
    } as ToolResult);

    const { result } = renderSchedulerInYoloMode();
@ -280,6 +281,7 @@ describe('useReactToolScheduler', () => {
    (mockTool.execute as Mock).mockResolvedValue({
      llmContent: 'Tool output',
      returnDisplay: 'Formatted tool output',
+      summary: 'Formatted summary',
    } as ToolResult);
    (mockTool.shouldConfirmExecute as Mock).mockResolvedValue(null);

@ -442,6 +444,7 @@ describe('useReactToolScheduler', () => {
    (mockToolRequiresConfirmation.execute as Mock).mockResolvedValue({
      llmContent: expectedOutput,
      returnDisplay: 'Confirmed display',
+      summary: 'Confirmed summary',
    } as ToolResult);

    const { result } = renderScheduler();
@ -608,6 +611,7 @@ describe('useReactToolScheduler', () => {
      resolveExecutePromise({
        llmContent: 'Final output',
        returnDisplay: 'Final display',
+        summary: 'Final summary',
      } as ToolResult);
    });
    await act(async () => {
@ -644,6 +648,7 @@ describe('useReactToolScheduler', () => {
      execute: vi.fn().mockResolvedValue({
        llmContent: 'Output 1',
        returnDisplay: 'Display 1',
+        summary: 'Summary 1',
      } as ToolResult),
      shouldConfirmExecute: vi.fn().mockResolvedValue(null),
    };
@ -654,6 +659,7 @@ describe('useReactToolScheduler', () => {
      execute: vi.fn().mockResolvedValue({
        llmContent: 'Output 2',
        returnDisplay: 'Display 2',
+        summary: 'Summary 2',
      } as ToolResult),
      shouldConfirmExecute: vi.fn().mockResolvedValue(null),
    };
@ -733,7 +739,12 @@ describe('useReactToolScheduler', () => {
    mockToolRegistry.getTool.mockReturnValue(mockTool);
    const longExecutePromise = new Promise<ToolResult>((resolve) =>
      setTimeout(
-        () => resolve({ llmContent: 'done', returnDisplay: 'done display' }),
+        () =>
+          resolve({
+            llmContent: 'done',
+            returnDisplay: 'done display',
+            summary: 'done summary',
+          }),
        50,
      ),
    );
@ -814,6 +825,7 @@ describe('mapToDisplay', () => {
      } as PartUnion,
    ],
    resultDisplay: 'Test display output',
+    summary: 'Test summary',
    error: undefined,
  };

--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@ -419,8 +419,9 @@ export class GeminiClient {
    contents: Content[],
    generationConfig: GenerateContentConfig,
    abortSignal: AbortSignal,
+    model?: string,
  ): Promise<GenerateContentResponse> {
-    const modelToUse = this.config.getModel();
+    const modelToUse = model ?? this.config.getModel();
    const configToUse: GenerateContentConfig = {
      ...this.generateContentConfig,
      ...generationConfig,
--- a/packages/core/src/core/coreToolScheduler.ts
+++ b/packages/core/src/core/coreToolScheduler.ts
@ -646,7 +646,7 @@ export class CoreToolScheduler {

        scheduledCall.tool
          .execute(scheduledCall.request.args, signal, liveOutputCallback)
-          .then((toolResult: ToolResult) => {
+          .then(async (toolResult: ToolResult) => {
            if (signal.aborted) {
              this.setStatusInternal(
                callId,
@ -656,18 +656,42 @@ export class CoreToolScheduler {
              return;
            }

+            let resultForDisplay: ToolResult = toolResult;
+            let summary: string | undefined;
+            if (scheduledCall.tool.summarizer) {
+              try {
+                const toolSignal = new AbortController();
+                summary = await scheduledCall.tool.summarizer(
+                  toolResult,
+                  this.config.getGeminiClient(),
+                  toolSignal.signal,
+                );
+                if (toolSignal.signal.aborted) {
+                  console.debug('aborted summarizing tool result');
+                  return;
+                }
+                if (scheduledCall.tool?.shouldSummarizeDisplay) {
+                  resultForDisplay = {
+                    ...toolResult,
+                    returnDisplay: summary,
+                  };
+                }
+              } catch (e) {
+                console.error('Error summarizing tool result:', e);
+              }
+            }
            const response = convertToFunctionResponse(
              toolName,
              callId,
-              toolResult.llmContent,
+              summary ? [summary] : toolResult.llmContent,
            );
-
            const successResponse: ToolCallResponseInfo = {
              callId,
              responseParts: response,
-              resultDisplay: toolResult.returnDisplay,
+              resultDisplay: resultForDisplay.returnDisplay,
              error: undefined,
            };
+
            this.setStatusInternal(callId, 'success', successResponse);
          })
          .catch((executionError: Error) => {
--- a/packages/core/src/core/nonInteractiveToolExecutor.ts
+++ b/packages/core/src/core/nonInteractiveToolExecutor.ts
@ -68,6 +68,18 @@ export async function executeToolCall(
      // No live output callback for non-interactive mode
    );

+    const tool_output = tool.summarizer
+      ? await tool.summarizer(
+          toolResult,
+          config.getGeminiClient(),
+          effectiveAbortSignal,
+        )
+      : toolResult.llmContent;
+
+    const tool_display = tool.shouldSummarizeDisplay
+      ? (tool_output as string)
+      : toolResult.returnDisplay;
+
    const durationMs = Date.now() - startTime;
    logToolCall(config, {
      'event.name': 'tool_call',
@ -82,13 +94,13 @@ export async function executeToolCall(
    const response = convertToFunctionResponse(
      toolCallRequest.name,
      toolCallRequest.callId,
-      toolResult.llmContent,
+      tool_output,
    );

    return {
      callId: toolCallRequest.callId,
      responseParts: response,
-      resultDisplay: toolResult.returnDisplay,
+      resultDisplay: tool_display,
      error: undefined,
    };
  } catch (e) {
--- a/packages/core/src/tools/shell.ts
+++ b/packages/core/src/tools/shell.ts
@ -27,6 +27,7 @@ export interface ShellToolParams {
  directory?: string;
 }
 import { spawn } from 'child_process';
+import { llmSummarizer } from '../utils/summarizer.js';

 const OUTPUT_UPDATE_INTERVAL_MS = 1000;

@ -73,6 +74,8 @@ Process Group PGID: Process group started or \`(none)\``,
      },
      false, // output is not markdown
      true, // output can be updated
+      llmSummarizer,
+      true, // should summarize display output
    );
  }

@ -487,7 +490,6 @@ Process Group PGID: Process group started or \`(none)\``,
        // returnDisplayMessage will remain empty, which is fine.
      }
    }
-
    return { llmContent, returnDisplay: returnDisplayMessage };
  }
 }
--- a/packages/core/src/tools/tool-registry.ts
+++ b/packages/core/src/tools/tool-registry.ts
@ -11,6 +11,7 @@ import { spawn } from 'node:child_process';
 import { StringDecoder } from 'node:string_decoder';
 import { discoverMcpTools } from './mcp-client.js';
 import { DiscoveredMCPTool } from './mcp-tool.js';
+import { defaultSummarizer } from '../utils/summarizer.js';
 import { parse } from 'shell-quote';

 type ToolParams = Record<string, unknown>;
@ -47,6 +48,7 @@ Signal: Signal number or \`(none)\` if no signal was received.
      parameterSchema,
      false, // isOutputMarkdown
      false, // canUpdateOutput
+      defaultSummarizer,
    );
  }

--- a/packages/core/src/tools/tools.ts
+++ b/packages/core/src/tools/tools.ts
@ -5,6 +5,7 @@
 */

 import { FunctionDeclaration, PartListUnion, Schema } from '@google/genai';
+import { Summarizer, defaultSummarizer } from '../utils/summarizer.js';

 /**
 * Interface representing the base Tool functionality
@ -43,6 +44,16 @@ export interface Tool<
   */
  canUpdateOutput: boolean;

+  /**
+   * A function that summarizes the result of the tool execution.
+   */
+  summarizer?: Summarizer;
+
+  /**
+   * Whether the tool's display output should be summarized
+   */
+  shouldSummarizeDisplay?: boolean;
+
  /**
   * Validates the parameters for the tool
   * Should be called from both `shouldConfirmExecute` and `execute`
@ -98,6 +109,8 @@ export abstract class BaseTool<
   * @param isOutputMarkdown Whether the tool's output should be rendered as markdown
   * @param canUpdateOutput Whether the tool supports live (streaming) output
   * @param parameterSchema JSON Schema defining the parameters
+   * @param summarizer Function to summarize the tool's output
+   * @param shouldSummarizeDisplay Whether the tool's display output should be summarized
   */
  constructor(
    readonly name: string,
@ -106,6 +119,8 @@ export abstract class BaseTool<
    readonly parameterSchema: Schema,
    readonly isOutputMarkdown: boolean = true,
    readonly canUpdateOutput: boolean = false,
+    readonly summarizer: Summarizer = defaultSummarizer,
+    readonly shouldSummarizeDisplay: boolean = false,
  ) {}

  /**
@ -173,6 +188,11 @@ export abstract class BaseTool<
 }

 export interface ToolResult {
+  /**
+   * A short, one-line summary of the tool's action and result.
+   * e.g., "Read 5 files", "Wrote 256 bytes to foo.txt"
+   */
+  summary?: string;
  /**
   * Content meant to be included in LLM history.
   * This should represent the factual outcome of the tool execution.
--- a/packages/core/src/utils/summarizer.test.ts
+++ b/packages/core/src/utils/summarizer.test.ts
@ -0,0 +1,208 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, beforeEach, afterEach, Mock } from 'vitest';
+import { GeminiClient } from '../core/client.js';
+import { Config } from '../config/config.js';
+import {
+  summarizeToolOutput,
+  llmSummarizer,
+  defaultSummarizer,
+} from './summarizer.js';
+import { ToolResult } from '../tools/tools.js';
+
+// Mock GeminiClient and Config constructor
+vi.mock('../core/client.js');
+vi.mock('../config/config.js');
+
+describe('summarizers', () => {
+  let mockGeminiClient: GeminiClient;
+  let MockConfig: Mock;
+  const abortSignal = new AbortController().signal;
+
+  // Runs before every test: builds a GeminiClient from the mocked modules and
+  // silences console.error.
+  beforeEach(() => {
+    MockConfig = vi.mocked(Config);
+    // NOTE(review): '../config/config.js' is mocked via vi.mock above, so these
+    // constructor arguments are presumably placeholders — confirm.
+    const mockConfigInstance = new MockConfig(
+      'test-api-key',
+      'gemini-pro',
+      false,
+      '.',
+      false,
+      undefined,
+      false,
+      undefined,
+      undefined,
+      undefined,
+    );
+
+    mockGeminiClient = new GeminiClient(mockConfigInstance);
+    // Install a fresh vi.fn() so each test can stub the model response itself.
+    (mockGeminiClient.generateContent as Mock) = vi.fn();
+
+    // Replace console.error with a no-op spy; the error-path test asserts on it.
+    vi.spyOn(console, 'error').mockImplementation(() => {});
+  });
+
+  // Runs after every test: clears all mocks and restores the console.error spy
+  // installed in beforeEach.
+  afterEach(() => {
+    vi.clearAllMocks();
+    (console.error as Mock).mockRestore();
+  });
+
+  describe('summarizeToolOutput', () => {
+    // Input shorter than maxLength (2000) must be returned as-is, with no model call.
+    it('should return original text if it is shorter than maxLength', async () => {
+      const shortText = 'This is a short text.';
+      const result = await summarizeToolOutput(
+        shortText,
+        mockGeminiClient,
+        abortSignal,
+        2000,
+      );
+      expect(result).toBe(shortText);
+      expect(mockGeminiClient.generateContent).not.toHaveBeenCalled();
+    });
+
+    // An empty string must be returned as-is, with no model call.
+    it('should return original text if it is empty', async () => {
+      const emptyText = '';
+      const result = await summarizeToolOutput(
+        emptyText,
+        mockGeminiClient,
+        abortSignal,
+        2000,
+      );
+      expect(result).toBe(emptyText);
+      expect(mockGeminiClient.generateContent).not.toHaveBeenCalled();
+    });
+
+    // Input longer than maxLength triggers exactly one model call, and the
+    // text of the stubbed response is returned as the summary.
+    it('should call generateContent if text is longer than maxLength', async () => {
+      // 25 characters repeated 200 times = 5000 characters, above the 2000 limit.
+      const longText = 'This is a very long text.'.repeat(200);
+      const summary = 'This is a summary.';
+      (mockGeminiClient.generateContent as Mock).mockResolvedValue({
+        candidates: [{ content: { parts: [{ text: summary }] } }],
+      });
+
+      const result = await summarizeToolOutput(
+        longText,
+        mockGeminiClient,
+        abortSignal,
+        2000,
+      );
+
+      expect(mockGeminiClient.generateContent).toHaveBeenCalledTimes(1);
+      expect(result).toBe(summary);
+    });
+
+    // When the model call rejects, the original text is returned and the
+    // failure is logged through console.error.
+    it('should return original text if generateContent throws an error', async () => {
+      const longText = 'This is a very long text.'.repeat(200);
+      const error = new Error('API Error');
+      (mockGeminiClient.generateContent as Mock).mockRejectedValue(error);
+
+      const result = await summarizeToolOutput(
+        longText,
+        mockGeminiClient,
+        abortSignal,
+        2000,
+      );
+
+      expect(mockGeminiClient.generateContent).toHaveBeenCalledTimes(1);
+      expect(result).toBe(longText);
+      expect(console.error).toHaveBeenCalledWith(
+        'Failed to summarize tool output.',
+        error,
+      );
+    });
+
+    // Pins the exact prompt text sent to the model, with maxLength (1000) and
+    // the tool output substituted into it.
+    it('should construct the correct prompt for summarization', async () => {
+      const longText = 'This is a very long text.'.repeat(200);
+      const summary = 'This is a summary.';
+      (mockGeminiClient.generateContent as Mock).mockResolvedValue({
+        candidates: [{ content: { parts: [{ text: summary }] } }],
+      });
+
+      await summarizeToolOutput(longText, mockGeminiClient, abortSignal, 1000);
+
+      const expectedPrompt = `Summarize the following tool output to be a maximum of 1000 characters. The summary should be concise and capture the main points of the tool output.
+
+The summarization should be done based on the content that is provided. Here are the basic rules to follow:
+1. If the text is a directory listing or any output that is structural, use the history of the conversation to understand the context. Using this context try to understand what information we need from the tool output and return that as a response.
+2. If the text is text content and there is nothing structural that we need, summarize the text.
+3. If the text is the output of a shell command, use the history of the conversation to understand the context. Using this context try to understand what information we need from the tool output and return a summarization along with the stack trace of any error within the <error></error> tags. The stack trace should be complete and not truncated. If there are warnings, you should include them in the summary within <warning></warning> tags.
+
+
+Text to summarize:
+"${longText}"
+
+Return the summary string which should first contain an overall summarization of text followed by the full stack trace of errors and warnings in the tool output.
+`;
+      // The first positional argument of the first call is the `contents` array.
+      const calledWith = (mockGeminiClient.generateContent as Mock).mock
+        .calls[0];
+      const contents = calledWith[0];
+      expect(contents[0].parts[0].text).toBe(expectedPrompt);
+    });
+  });
+
+  describe('llmSummarizer', () => {
+    // A long string llmContent results in one model call, and the text of the
+    // stubbed response is returned.
+    it('should summarize tool output using summarizeToolOutput', async () => {
+      const toolResult: ToolResult = {
+        llmContent: 'This is a very long text.'.repeat(200),
+        returnDisplay: '',
+      };
+      const summary = 'This is a summary.';
+      (mockGeminiClient.generateContent as Mock).mockResolvedValue({
+        candidates: [{ content: { parts: [{ text: summary }] } }],
+      });
+
+      const result = await llmSummarizer(
+        toolResult,
+        mockGeminiClient,
+        abortSignal,
+      );
+
+      expect(mockGeminiClient.generateContent).toHaveBeenCalledTimes(1);
+      expect(result).toBe(summary);
+    });
+
+    // llmContent given as an array of text parts: the part text must appear,
+    // quoted, inside the prompt sent to the model.
+    it('should handle different llmContent types', async () => {
+      const longText = 'This is a very long text.'.repeat(200);
+      const toolResult: ToolResult = {
+        llmContent: [{ text: longText }],
+        returnDisplay: '',
+      };
+      const summary = 'This is a summary.';
+      (mockGeminiClient.generateContent as Mock).mockResolvedValue({
+        candidates: [{ content: { parts: [{ text: summary }] } }],
+      });
+
+      const result = await llmSummarizer(
+        toolResult,
+        mockGeminiClient,
+        abortSignal,
+      );
+
+      expect(mockGeminiClient.generateContent).toHaveBeenCalledTimes(1);
+      const calledWith = (mockGeminiClient.generateContent as Mock).mock
+        .calls[0];
+      const contents = calledWith[0];
+      expect(contents[0].parts[0].text).toContain(`"${longText}"`);
+      expect(result).toBe(summary);
+    });
+  });
+
+  describe('defaultSummarizer', () => {
+    // defaultSummarizer returns JSON.stringify(llmContent) and never calls the model.
+    it('should stringify the llmContent', async () => {
+      const toolResult: ToolResult = {
+        llmContent: { text: 'some data' },
+        returnDisplay: '',
+      };
+
+      const result = await defaultSummarizer(
+        toolResult,
+        mockGeminiClient,
+        abortSignal,
+      );
+
+      expect(result).toBe(JSON.stringify({ text: 'some data' }));
+      expect(mockGeminiClient.generateContent).not.toHaveBeenCalled();
+    });
+  });
+});
--- a/packages/core/src/utils/summarizer.ts
+++ b/packages/core/src/utils/summarizer.ts
@ -0,0 +1,131 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { ToolResult } from '../tools/tools.js';
+import {
+  Content,
+  GenerateContentConfig,
+  GenerateContentResponse,
+} from '@google/genai';
+import { GeminiClient } from '../core/client.js';
+import { DEFAULT_GEMINI_FLASH_MODEL } from '../config/models.js';
+import { PartListUnion } from '@google/genai';
+
+/**
+ * A function that summarizes the result of a tool execution.
+ *
+ * @param result The result of the tool execution.
+ * @param geminiClient The Gemini client made available to the summarizer.
+ * @param abortSignal The abort signal made available to the summarizer.
+ * @returns A promise that resolves to the summary of the result.
+ */
+export type Summarizer = (
+  result: ToolResult,
+  geminiClient: GeminiClient,
+  abortSignal: AbortSignal,
+) => Promise<string>;
+
+/**
+ * Default summarizer: serializes the tool's `llmContent` with `JSON.stringify`.
+ * No model call is made; the client and abort signal are accepted only to
+ * satisfy the `Summarizer` signature.
+ *
+ * @param result The result of the tool execution.
+ * @param _geminiClient Unused.
+ * @param _abortSignal Unused.
+ * @returns A promise resolving to the JSON form of `result.llmContent`.
+ */
+export const defaultSummarizer: Summarizer = (
+  result,
+  _geminiClient,
+  _abortSignal,
+) => {
+  const serialized = JSON.stringify(result.llmContent);
+  return Promise.resolve(serialized);
+};
+
+// TODO: Move both these functions to utils
+/**
+ * Flattens a part, or a (possibly nested) list of parts, into plain text.
+ *
+ * Strings are returned unchanged, arrays are flattened recursively and
+ * concatenated without a separator, and objects contribute their `text`
+ * field. Anything else (falsy values, parts without `text`) yields ''.
+ *
+ * @param part The part or list of parts to convert.
+ * @returns The concatenated text.
+ */
+function partToString(part: PartListUnion): string {
+  if (typeof part === 'string') {
+    return part;
+  }
+  if (!part) {
+    return '';
+  }
+  if (Array.isArray(part)) {
+    let joined = '';
+    for (const item of part) {
+      joined += partToString(item);
+    }
+    return joined;
+  }
+  return 'text' in part ? (part.text ?? '') : '';
+}
+
+/**
+ * Extracts the text of the first candidate in a model response.
+ *
+ * @param response The response returned by the model.
+ * @returns The concatenated text of the first candidate's parts (parts without
+ *   text are skipped), or `null` when there is no candidate or the first
+ *   candidate has no parts.
+ */
+function getResponseText(response: GenerateContentResponse): string | null {
+  const parts = response.candidates?.[0]?.content?.parts;
+  if (!parts || parts.length === 0) {
+    return null;
+  }
+  let text = '';
+  for (const part of parts) {
+    if (part.text) {
+      text += part.text;
+    }
+  }
+  return text;
+}
+
+// Model and generation config passed to generateContent by summarizeToolOutput.
+const toolOutputSummarizerModel = DEFAULT_GEMINI_FLASH_MODEL;
+const toolOutputSummarizerConfig: GenerateContentConfig = {
+  // Token cap on the generated summary (distinct from the character-based
+  // maxLength that is written into the prompt).
+  maxOutputTokens: 2000,
+};
+
+// Prompt template for summarization. The `{maxLength}` and `{textToSummarize}`
+// placeholders are substituted by summarizeToolOutput before the request is sent.
+const SUMMARIZE_TOOL_OUTPUT_PROMPT = `Summarize the following tool output to be a maximum of {maxLength} characters. The summary should be concise and capture the main points of the tool output.
+
+The summarization should be done based on the content that is provided. Here are the basic rules to follow:
+1. If the text is a directory listing or any output that is structural, use the history of the conversation to understand the context. Using this context try to understand what information we need from the tool output and return that as a response.
+2. If the text is text content and there is nothing structural that we need, summarize the text.
+3. If the text is the output of a shell command, use the history of the conversation to understand the context. Using this context try to understand what information we need from the tool output and return a summarization along with the stack trace of any error within the <error></error> tags. The stack trace should be complete and not truncated. If there are warnings, you should include them in the summary within <warning></warning> tags.
+
+
+Text to summarize:
+"{textToSummarize}"
+
+Return the summary string which should first contain an overall summarization of text followed by the full stack trace of errors and warnings in the tool output.
+`;
+
+/**
+ * Summarizer that flattens the tool's `llmContent` to text and passes it to
+ * `summarizeToolOutput` (which uses its default `maxLength`).
+ *
+ * @param result The result of the tool execution.
+ * @param geminiClient The Gemini client used for the summarization request.
+ * @param abortSignal The abort signal forwarded to the request.
+ * @returns A promise resolving to whatever `summarizeToolOutput` returns.
+ */
+export const llmSummarizer: Summarizer = (
+  result,
+  geminiClient,
+  abortSignal,
+) => {
+  const flattened = partToString(result.llmContent);
+  return summarizeToolOutput(flattened, geminiClient, abortSignal);
+};
+
+/**
+ * Asks the model to summarize tool output that is too long.
+ *
+ * Text that is empty or shorter than `maxLength` is returned unchanged and no
+ * request is made. If the request fails, or the response carries no text, the
+ * original text is returned so the tool output is never lost.
+ *
+ * @param textToSummarize The raw tool output.
+ * @param geminiClient The Gemini client used to issue the request.
+ * @param abortSignal The abort signal forwarded to the request.
+ * @param maxLength Character threshold below which no summary is requested;
+ *   also the maximum summary size stated in the prompt. Defaults to 2000.
+ * @returns The summary, or the original text when no summary was produced.
+ */
+export async function summarizeToolOutput(
+  textToSummarize: string,
+  geminiClient: GeminiClient,
+  abortSignal: AbortSignal,
+  maxLength: number = 2000,
+): Promise<string> {
+  if (!textToSummarize || textToSummarize.length < maxLength) {
+    return textToSummarize;
+  }
+  // Substitute through replacer functions. With a plain replacement string,
+  // sequences such as `$&`, `$'`, `` $` `` or `$$` in the tool output (common
+  // in shell output) are expanded as special replacement patterns and would
+  // corrupt the prompt.
+  const prompt = SUMMARIZE_TOOL_OUTPUT_PROMPT.replace('{maxLength}', () =>
+    String(maxLength),
+  ).replace('{textToSummarize}', () => textToSummarize);
+
+  const contents: Content[] = [{ role: 'user', parts: [{ text: prompt }] }];
+
+  try {
+    const response: GenerateContentResponse =
+      await geminiClient.generateContent(
+        contents,
+        toolOutputSummarizerConfig,
+        abortSignal,
+        toolOutputSummarizerModel,
+      );
+    // An empty or missing response text falls back to the original output.
+    return getResponseText(response) || textToSummarize;
+  } catch (error) {
+    // Best effort: summarization failures must not fail the tool call.
+    console.error('Failed to summarize tool output.', error);
+    return textToSummarize;
+  }
+}