From 387706607dfa372f4f0c6fee14286bf4a290b258 Mon Sep 17 00:00:00 2001 From: Allen Hutchison Date: Fri, 1 Aug 2025 14:33:33 -0700 Subject: [PATCH] fix(tests): refactor integration tests to be less flaky (#4890) Co-authored-by: matt korwel --- integration-tests/file-system.test.js | 77 ++- integration-tests/google_web_search.test.js | 67 ++- integration-tests/list_directory.test.js | 54 +- integration-tests/read_many_files.test.js | 40 +- integration-tests/replace.test.js | 56 +- integration-tests/run-tests.js | 1 + integration-tests/run_shell_command.test.js | 58 +- integration-tests/save_memory.test.js | 28 +- integration-tests/simple-mcp-server.test.js | 227 ++++++-- integration-tests/test-helper.js | 517 +++++++++++++++++- integration-tests/write_file.test.js | 57 +- packages/core/src/utils/bfsFileSearch.test.ts | 6 +- 12 files changed, 1073 insertions(+), 115 deletions(-) diff --git a/integration-tests/file-system.test.js b/integration-tests/file-system.test.js index 87e9efe2..d43f047f 100644 --- a/integration-tests/file-system.test.js +++ b/integration-tests/file-system.test.js @@ -6,25 +6,84 @@ import { strict as assert } from 'assert'; import { test } from 'node:test'; -import { TestRig } from './test-helper.js'; +import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js'; -test('reads a file', (t) => { +test('should be able to read a file', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to read a file'); rig.createFile('test.txt', 'hello world'); - const output = rig.run(`read the file name test.txt`); + const result = await rig.run( + `read the file test.txt and show me its contents`, + ); - assert.ok(output.toLowerCase().includes('hello')); + const foundToolCall = await rig.waitForToolCall('read_file'); + + // Add debugging information + if (!foundToolCall || !result.includes('hello world')) { + printDebugInfo(rig, result, { + 'Found tool call': foundToolCall, + 'Contains hello world': result.includes('hello world'), + }); + } + + assert.ok(foundToolCall, 'Expected to find a read_file tool call'); + + // Validate model output - will throw if no output, warn if missing expected content + validateModelOutput(result, 'hello world', 'File read test'); }); -test('writes a file', (t) => { +test('should be able to write a file', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to write a file'); rig.createFile('test.txt', ''); - rig.run(`edit test.txt to have a hello world message`); + const result = await rig.run(`edit test.txt to have a hello world message`); + + // Accept multiple valid tools for editing files + const foundToolCall = await rig.waitForAnyToolCall([ + 'write_file', + 'edit', + 'replace', + ]); + + // Add debugging information + if (!foundToolCall) { + printDebugInfo(rig, result); + } + + assert.ok( + foundToolCall, + 'Expected to find a write_file, edit, or replace tool call', + ); + + // Validate model output - will throw if no output + validateModelOutput(result, null, 'File write test'); const fileContent = rig.readFile('test.txt'); - assert.ok(fileContent.toLowerCase().includes('hello')); + + // Add debugging for file content + if (!fileContent.toLowerCase().includes('hello')) { + const writeCalls = rig + .readToolLogs() + .filter((t) => t.toolRequest.name === 'write_file') + .map((t) => t.toolRequest.args); + + printDebugInfo(rig, result, { + 'File content mismatch': true, + 'Expected to contain': 'hello', + 'Actual content': fileContent, + 'Write 
tool calls': JSON.stringify(writeCalls), + }); + } + + assert.ok( + fileContent.toLowerCase().includes('hello'), + 'Expected file to contain hello', + ); + + // Log success info if verbose + if (process.env.VERBOSE === 'true') { + console.log('File written successfully with hello message.'); + } }); diff --git a/integration-tests/google_web_search.test.js b/integration-tests/google_web_search.test.js index a8968117..31747421 100644 --- a/integration-tests/google_web_search.test.js +++ b/integration-tests/google_web_search.test.js @@ -6,14 +6,69 @@ import { test } from 'node:test'; import { strict as assert } from 'assert'; -import { TestRig } from './test-helper.js'; +import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js'; -test('should be able to search the web', async (t) => { +test('should be able to search the web', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to search the web'); - const prompt = `what planet do we live on`; - const result = await rig.run(prompt); + let result; + try { + result = await rig.run(`what is the weather in London`); + } catch (error) { + // Network errors can occur in CI environments + if ( + error.message.includes('network') || + error.message.includes('timeout') + ) { + console.warn('Skipping test due to network error:', error.message); + return; // Skip the test + } + throw error; // Re-throw if not a network error + } - assert.ok(result.toLowerCase().includes('earth')); + const foundToolCall = await rig.waitForToolCall('google_web_search'); + + // Add debugging information + if (!foundToolCall) { + const allTools = printDebugInfo(rig, result); + + // Check if the tool call failed due to network issues + const failedSearchCalls = allTools.filter( + (t) => + t.toolRequest.name === 'google_web_search' && !t.toolRequest.success, + ); + if (failedSearchCalls.length > 0) { + console.warn( + 'google_web_search tool was called but failed, possibly due to network issues', + ); + console.warn( + 'Failed calls:', + failedSearchCalls.map((t) => t.toolRequest.args), + ); + return; // Skip the test if network issues + } + } + + assert.ok(foundToolCall, 'Expected to find a call to google_web_search'); + + // Validate model output - will throw if no output, warn if missing expected content + const hasExpectedContent = validateModelOutput( + result, + ['weather', 'london'], + 'Google web search test', + ); + + // If content was missing, log the search queries used + if (!hasExpectedContent) { + const searchCalls = rig + .readToolLogs() + .filter((t) => t.toolRequest.name === 'google_web_search'); + if (searchCalls.length > 0) { + console.warn( + 'Search queries used:', + searchCalls.map((t) => t.toolRequest.args), + ); + } + } }); diff --git a/integration-tests/list_directory.test.js b/integration-tests/list_directory.test.js index af7aae78..16f49f4b 100644 --- a/integration-tests/list_directory.test.js +++ b/integration-tests/list_directory.test.js @@ -6,19 +6,57 @@ import { test } from 'node:test'; import { strict as assert } from 'assert'; -import { TestRig } from './test-helper.js'; +import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js'; +import { existsSync } from 'fs'; +import { join } from 'path'; -test('should be able to list a directory', async (t) => { +test('should be able to list a directory', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to list a directory'); rig.createFile('file1.txt', 'file 1 content'); 
rig.mkdir('subdir'); rig.sync(); - const prompt = `Can you list the files in the current directory. Display them in the style of 'ls'`; - const result = rig.run(prompt); + // Poll for filesystem changes to propagate in containers + await rig.poll( + () => { + // Check if the files exist in the test directory + const file1Path = join(rig.testDir, 'file1.txt'); + const subdirPath = join(rig.testDir, 'subdir'); + return existsSync(file1Path) && existsSync(subdirPath); + }, + 1000, // 1 second max wait + 50, // check every 50ms + ); - const lines = result.split('\n').filter((line) => line.trim() !== ''); - assert.ok(lines.some((line) => line.includes('file1.txt'))); - assert.ok(lines.some((line) => line.includes('subdir'))); + const prompt = `Can you list the files in the current directory. Display them in the style of 'ls'`; + + const result = await rig.run(prompt); + + const foundToolCall = await rig.waitForToolCall('list_directory'); + + // Add debugging information + if ( + !foundToolCall || + !result.includes('file1.txt') || + !result.includes('subdir') + ) { + const allTools = printDebugInfo(rig, result, { + 'Found tool call': foundToolCall, + 'Contains file1.txt': result.includes('file1.txt'), + 'Contains subdir': result.includes('subdir'), + }); + + console.error( + 'List directory calls:', + allTools + .filter((t) => t.toolRequest.name === 'list_directory') + .map((t) => t.toolRequest.args), + ); + } + + assert.ok(foundToolCall, 'Expected to find a list_directory tool call'); + + // Validate model output - will throw if no output, warn if missing expected content + validateModelOutput(result, ['file1.txt', 'subdir'], 'List directory test'); }); diff --git a/integration-tests/read_many_files.test.js b/integration-tests/read_many_files.test.js index 7e770036..74d2f358 100644 --- a/integration-tests/read_many_files.test.js +++ b/integration-tests/read_many_files.test.js @@ -6,17 +6,45 @@ import { test } from 'node:test'; import { strict as assert } from 'assert'; -import { TestRig } from './test-helper.js'; +import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js'; -test.skip('should be able to read multiple files', async (t) => { +test('should be able to read multiple files', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to read multiple files'); rig.createFile('file1.txt', 'file 1 content'); rig.createFile('file2.txt', 'file 2 content'); - const prompt = `Read the files in this directory, list them and print them to the screen`; + const prompt = `Please use read_many_files to read file1.txt and file2.txt and show me what's in them`; + const result = await rig.run(prompt); - assert.ok(result.includes('file 1 content')); - assert.ok(result.includes('file 2 content')); + // Check for either read_many_files or multiple read_file calls + const allTools = rig.readToolLogs(); + const readManyFilesCall = await rig.waitForToolCall('read_many_files'); + const readFileCalls = allTools.filter( + (t) => t.toolRequest.name === 'read_file', + ); + + // Accept either read_many_files OR at least 2 read_file calls + const foundValidPattern = readManyFilesCall || readFileCalls.length >= 2; + + // Add debugging information + if (!foundValidPattern) { + printDebugInfo(rig, result, { + 'read_many_files called': readManyFilesCall, + 'read_file calls': readFileCalls.length, + }); + } + + assert.ok( + foundValidPattern, + 'Expected to find either read_many_files or multiple read_file tool calls', + ); + + // Validate model output - will 
throw if no output, warn if missing expected content + validateModelOutput( + result, + ['file 1 content', 'file 2 content'], + 'Read many files test', + ); }); diff --git a/integration-tests/replace.test.js b/integration-tests/replace.test.js index 060aba55..1ac6f5a4 100644 --- a/integration-tests/replace.test.js +++ b/integration-tests/replace.test.js @@ -6,17 +6,61 @@ import { test } from 'node:test'; import { strict as assert } from 'assert'; -import { TestRig } from './test-helper.js'; +import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js'; -test('should be able to replace content in a file', async (t) => { +test('should be able to replace content in a file', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to replace content in a file'); const fileName = 'file_to_replace.txt'; - rig.createFile(fileName, 'original content'); + const originalContent = 'original content'; + const expectedContent = 'replaced content'; + + rig.createFile(fileName, originalContent); const prompt = `Can you replace 'original' with 'replaced' in the file 'file_to_replace.txt'`; - await rig.run(prompt); + const result = await rig.run(prompt); + + const foundToolCall = await rig.waitForToolCall('replace'); + + // Add debugging information + if (!foundToolCall) { + printDebugInfo(rig, result); + } + + assert.ok(foundToolCall, 'Expected to find a replace tool call'); + + // Validate model output - will throw if no output, warn if missing expected content + validateModelOutput( + result, + ['replaced', 'file_to_replace.txt'], + 'Replace content test', + ); + const newFileContent = rig.readFile(fileName); - assert.strictEqual(newFileContent, 'replaced content'); + + // Add debugging for file content + if (newFileContent !== expectedContent) { + console.error('File content mismatch - Debug info:'); + console.error('Expected:', expectedContent); + console.error('Actual:', newFileContent); + console.error( + 'Tool calls:', + rig.readToolLogs().map((t) => ({ + name: t.toolRequest.name, + args: t.toolRequest.args, + })), + ); + } + + assert.strictEqual( + newFileContent, + expectedContent, + 'File content should be updated correctly', + ); + + // Log success info if verbose + if (process.env.VERBOSE === 'true') { + console.log('File replaced successfully. 
New content:', newFileContent); + } }); diff --git a/integration-tests/run-tests.js b/integration-tests/run-tests.js index 4b4a9a94..05fb349e 100644 --- a/integration-tests/run-tests.js +++ b/integration-tests/run-tests.js @@ -101,6 +101,7 @@ async function main() { KEEP_OUTPUT: keepOutput.toString(), VERBOSE: verbose.toString(), TEST_FILE_NAME: testFileName, + TELEMETRY_LOG_FILE: join(testFileDir, 'telemetry.log'), }, }); diff --git a/integration-tests/run_shell_command.test.js b/integration-tests/run_shell_command.test.js index 52aee194..2a5f9ed4 100644 --- a/integration-tests/run_shell_command.test.js +++ b/integration-tests/run_shell_command.test.js @@ -6,26 +6,58 @@ import { test } from 'node:test'; import { strict as assert } from 'assert'; -import { TestRig } from './test-helper.js'; +import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js'; -test('should be able to run a shell command', async (t) => { +test('should be able to run a shell command', async () => { const rig = new TestRig(); - rig.setup(t.name); - rig.createFile('blah.txt', 'some content'); + await rig.setup('should be able to run a shell command'); - const prompt = `Can you use ls to list the contexts of the current folder`; - const result = rig.run(prompt); + const prompt = `Please run the command "echo hello-world" and show me the output`; - assert.ok(result.includes('blah.txt')); + const result = await rig.run(prompt); + + const foundToolCall = await rig.waitForToolCall('run_shell_command'); + + // Add debugging information + if (!foundToolCall || !result.includes('hello-world')) { + printDebugInfo(rig, result, { + 'Found tool call': foundToolCall, + 'Contains hello-world': result.includes('hello-world'), + }); + } + + assert.ok(foundToolCall, 'Expected to find a run_shell_command tool call'); + + // Validate model output - will throw if no output, warn if missing expected content + // Model often reports exit code instead of showing output + validateModelOutput( + result, + ['hello-world', 'exit code 0'], + 'Shell command test', + ); }); -test('should be able to run a shell command via stdin', async (t) => { +test('should be able to run a shell command via stdin', async () => { const rig = new TestRig(); - rig.setup(t.name); - rig.createFile('blah.txt', 'some content'); + await rig.setup('should be able to run a shell command via stdin'); - const prompt = `Can you use ls to list the contexts of the current folder`; - const result = rig.run({ stdin: prompt }); + const prompt = `Please run the command "echo test-stdin" and show me what it outputs`; - assert.ok(result.includes('blah.txt')); + const result = await rig.run({ stdin: prompt }); + + const foundToolCall = await rig.waitForToolCall('run_shell_command'); + + // Add debugging information + if (!foundToolCall || !result.includes('test-stdin')) { + printDebugInfo(rig, result, { + 'Test type': 'Stdin test', + 'Found tool call': foundToolCall, + 'Contains test-stdin': result.includes('test-stdin'), + }); + } + + assert.ok(foundToolCall, 'Expected to find a run_shell_command tool call'); + + // Validate model output - will throw if no output, warn if missing expected content + validateModelOutput(result, 'test-stdin', 'Shell command stdin test'); }); diff --git a/integration-tests/save_memory.test.js b/integration-tests/save_memory.test.js index 0716f978..3ec641d4 100644 --- a/integration-tests/save_memory.test.js +++ b/integration-tests/save_memory.test.js @@ -6,16 +6,36 @@ import { test } from 'node:test'; import { strict as assert } from 
'assert'; -import { TestRig } from './test-helper.js'; +import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js'; -test('should be able to save to memory', async (t) => { +test('should be able to save to memory', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to save to memory'); const prompt = `remember that my favorite color is blue. what is my favorite color? tell me that and surround it with $ symbol`; const result = await rig.run(prompt); - assert.ok(result.toLowerCase().includes('$blue$')); + const foundToolCall = await rig.waitForToolCall('save_memory'); + + // Add debugging information + if (!foundToolCall || !result.toLowerCase().includes('blue')) { + const allTools = printDebugInfo(rig, result, { + 'Found tool call': foundToolCall, + 'Contains blue': result.toLowerCase().includes('blue'), + }); + + console.error( + 'Memory tool calls:', + allTools + .filter((t) => t.toolRequest.name === 'save_memory') + .map((t) => t.toolRequest.args), + ); + } + + assert.ok(foundToolCall, 'Expected to find a save_memory tool call'); + + // Validate model output - will throw if no output, warn if missing expected content + validateModelOutput(result, 'blue', 'Save memory test'); }); diff --git a/integration-tests/simple-mcp-server.test.js b/integration-tests/simple-mcp-server.test.js index fc88522d..987f69d2 100644 --- a/integration-tests/simple-mcp-server.test.js +++ b/integration-tests/simple-mcp-server.test.js @@ -4,67 +4,208 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { test, describe, before, after } from 'node:test'; +/** + * This test verifies MCP (Model Context Protocol) server integration. + * It uses a minimal MCP server implementation that doesn't require + * external dependencies, making it compatible with Docker sandbox mode. 
+ */ + +import { test, describe, before } from 'node:test'; import { strict as assert } from 'node:assert'; -import { TestRig } from './test-helper.js'; -import { spawn } from 'child_process'; +import { TestRig, validateModelOutput } from './test-helper.js'; import { join } from 'path'; import { fileURLToPath } from 'url'; -import { writeFileSync, unlinkSync } from 'fs'; +import { writeFileSync } from 'fs'; const __dirname = fileURLToPath(new URL('.', import.meta.url)); -const serverScriptPath = join(__dirname, './temp-server.js'); -const serverScript = ` -import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'; -import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; -import { z } from 'zod'; +// Create a minimal MCP server that doesn't require external dependencies +// This implements the MCP protocol directly using Node.js built-ins +const serverScript = `#!/usr/bin/env node +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ -const server = new McpServer({ - name: 'addition-server', - version: '1.0.0', +const readline = require('readline'); +const fs = require('fs'); + +// Debug logging to stderr (only when MCP_DEBUG or VERBOSE is set) +const debugEnabled = process.env.MCP_DEBUG === 'true' || process.env.VERBOSE === 'true'; +function debug(msg) { + if (debugEnabled) { + fs.writeSync(2, \`[MCP-DEBUG] \${msg}\\n\`); + } +} + +debug('MCP server starting...'); + +// Simple JSON-RPC implementation for MCP +class SimpleJSONRPC { + constructor() { + this.handlers = new Map(); + this.rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + terminal: false + }); + + this.rl.on('line', (line) => { + debug(\`Received line: \${line}\`); + try { + const message = JSON.parse(line); + debug(\`Parsed message: \${JSON.stringify(message)}\`); + this.handleMessage(message); + } catch (e) { + debug(\`Parse error: \${e.message}\`); + } + }); + } + + send(message) { + const msgStr = JSON.stringify(message); + debug(\`Sending message: \${msgStr}\`); + process.stdout.write(msgStr + '\\n'); + } + + async handleMessage(message) { + if (message.method && this.handlers.has(message.method)) { + try { + const result = await this.handlers.get(message.method)(message.params || {}); + if (message.id !== undefined) { + this.send({ + jsonrpc: '2.0', + id: message.id, + result + }); + } + } catch (error) { + if (message.id !== undefined) { + this.send({ + jsonrpc: '2.0', + id: message.id, + error: { + code: -32603, + message: error.message + } + }); + } + } + } else if (message.id !== undefined) { + this.send({ + jsonrpc: '2.0', + id: message.id, + error: { + code: -32601, + message: 'Method not found' + } + }); + } + } + + on(method, handler) { + this.handlers.set(method, handler); + } +} + +// Create MCP server +const rpc = new SimpleJSONRPC(); + +// Handle initialize +rpc.on('initialize', async (params) => { + debug('Handling initialize request'); + return { + protocolVersion: '2024-11-05', + capabilities: { + tools: {} + }, + serverInfo: { + name: 'addition-server', + version: '1.0.0' + } + }; }); -server.registerTool( - 'add', - { - title: 'Addition Tool', - description: 'Add two numbers', - inputSchema: { a: z.number(), b: z.number() }, - }, - async ({ a, b }) => ({ - content: [{ type: 'text', text: String(a + b) }], - }), -); +// Handle tools/list +rpc.on('tools/list', async () => { + debug('Handling tools/list request'); + return { + tools: [{ + name: 'add', + description: 'Add two numbers', + inputSchema: { 
+ type: 'object', + properties: { + a: { type: 'number', description: 'First number' }, + b: { type: 'number', description: 'Second number' } + }, + required: ['a', 'b'] + } + }] + }; +}); -const transport = new StdioServerTransport(); -await server.connect(transport); +// Handle tools/call +rpc.on('tools/call', async (params) => { + debug(\`Handling tools/call request for tool: \${params.name}\`); + if (params.name === 'add') { + const { a, b } = params.arguments; + return { + content: [{ + type: 'text', + text: String(a + b) + }] + }; + } + throw new Error('Unknown tool: ' + params.name); +}); + +// Send initialization notification +rpc.send({ + jsonrpc: '2.0', + method: 'initialized' +}); `; describe('simple-mcp-server', () => { const rig = new TestRig(); - let child; - before(() => { - writeFileSync(serverScriptPath, serverScript); - child = spawn('node', [serverScriptPath], { - stdio: ['pipe', 'pipe', 'pipe'], + before(async () => { + // Setup test directory with MCP server configuration + await rig.setup('simple-mcp-server', { + settings: { + mcpServers: { + 'addition-server': { + command: 'node', + args: ['mcp-server.cjs'], + }, + }, + }, }); - child.stderr.on('data', (data) => { - console.error(`stderr: ${data}`); - }); - // Wait for the server to be ready - return new Promise((resolve) => setTimeout(resolve, 2000)); + + // Create server script in the test directory + const testServerPath = join(rig.testDir, 'mcp-server.cjs'); + writeFileSync(testServerPath, serverScript); + + // Make the script executable (though running with 'node' should work anyway) + if (process.platform !== 'win32') { + const { chmodSync } = await import('fs'); + chmodSync(testServerPath, 0o755); + } }); - after(() => { - child.kill(); - unlinkSync(serverScriptPath); - }); + test('should add two numbers', async () => { + // Test directory is already set up in before hook + // Just run the command - MCP server config is in settings.json + const output = await rig.run('add 5 and 10'); - test('should add two numbers', () => { - rig.setup('should add two numbers'); - const output = rig.run('add 5 and 10'); - assert.ok(output.includes('15')); + const foundToolCall = await rig.waitForToolCall('add'); + + assert.ok(foundToolCall, 'Expected to find an add tool call'); + + // Validate model output - will throw if no output, fail if missing expected content + validateModelOutput(output, '15', 'MCP server test'); + assert.ok(output.includes('15'), 'Expected output to contain the sum (15)'); }); }); diff --git a/integration-tests/test-helper.js b/integration-tests/test-helper.js index 7ee3db87..9526ea5f 100644 --- a/integration-tests/test-helper.js +++ b/integration-tests/test-helper.js @@ -4,11 +4,13 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { execSync } from 'child_process'; +import { execSync, spawn } from 'child_process'; +import { parse } from 'shell-quote'; import { mkdirSync, writeFileSync, readFileSync } from 'fs'; import { join, dirname } from 'path'; import { fileURLToPath } from 'url'; import { env } from 'process'; +import { fileExists } from '../scripts/telemetry_utils.js'; const __dirname = dirname(fileURLToPath(import.meta.url)); @@ -19,17 +21,129 @@ function sanitizeTestName(name) { .replace(/-+/g, '-'); } +// Helper to create detailed error messages +export function createToolCallErrorMessage(expectedTools, foundTools, result) { + const expectedStr = Array.isArray(expectedTools) + ? expectedTools.join(' or ') + : expectedTools; + return ( + `Expected to find ${expectedStr} tool call(s). 
` + + `Found: ${foundTools.length > 0 ? foundTools.join(', ') : 'none'}. ` + + `Output preview: ${result ? result.substring(0, 200) + '...' : 'no output'}` + ); +} + +// Helper to print debug information when tests fail +export function printDebugInfo(rig, result, context = {}) { + console.error('Test failed - Debug info:'); + console.error('Result length:', result.length); + console.error('Result (first 500 chars):', result.substring(0, 500)); + console.error( + 'Result (last 500 chars):', + result.substring(result.length - 500), + ); + + // Print any additional context provided + Object.entries(context).forEach(([key, value]) => { + console.error(`${key}:`, value); + }); + + // Check what tools were actually called + const allTools = rig.readToolLogs(); + console.error( + 'All tool calls found:', + allTools.map((t) => t.toolRequest.name), + ); + + return allTools; +} + +// Helper to validate model output and warn about unexpected content +export function validateModelOutput( + result, + expectedContent = null, + testName = '', +) { + // First, check if there's any output at all (this should fail the test if missing) + if (!result || result.trim().length === 0) { + throw new Error('Expected LLM to return some output'); + } + + // If expectedContent is provided, check for it and warn if missing + if (expectedContent) { + const contents = Array.isArray(expectedContent) + ? expectedContent + : [expectedContent]; + const missingContent = contents.filter((content) => { + if (typeof content === 'string') { + return !result.toLowerCase().includes(content.toLowerCase()); + } else if (content instanceof RegExp) { + return !content.test(result); + } + return false; + }); + + if (missingContent.length > 0) { + console.warn( + `Warning: LLM did not include expected content in response: ${missingContent.join(', ')}.`, + 'This is not ideal but not a test failure.', + ); + console.warn( + 'The tool was called successfully, which is the main requirement.', + ); + return false; + } else if (process.env.VERBOSE === 'true') { + console.log(`${testName}: Model output validated successfully.`); + } + return true; + } + + return true; +} + export class TestRig { constructor() { this.bundlePath = join(__dirname, '..', 'bundle/gemini.js'); this.testDir = null; } - setup(testName) { + // Get timeout based on environment + getDefaultTimeout() { + if (env.CI) return 60000; // 1 minute in CI + if (env.GEMINI_SANDBOX) return 30000; // 30s in containers + return 15000; // 15s locally + } + + setup(testName, options = {}) { this.testName = testName; const sanitizedName = sanitizeTestName(testName); this.testDir = join(env.INTEGRATION_TEST_FILE_DIR, sanitizedName); mkdirSync(this.testDir, { recursive: true }); + + // Create a settings file to point the CLI to the local collector + const geminiDir = join(this.testDir, '.gemini'); + mkdirSync(geminiDir, { recursive: true }); + // In sandbox mode, use an absolute path for telemetry inside the container + // The container mounts the test directory at the same path as the host + const telemetryPath = + env.GEMINI_SANDBOX && env.GEMINI_SANDBOX !== 'false' + ? join(this.testDir, 'telemetry.log') // Absolute path in test directory + : env.TELEMETRY_LOG_FILE; // Absolute path for non-sandbox + + const settings = { + telemetry: { + enabled: true, + target: 'local', + otlpEndpoint: '', + outfile: telemetryPath, + }, + sandbox: env.GEMINI_SANDBOX !== 'false' ? 
env.GEMINI_SANDBOX : false, + ...options.settings, // Allow tests to override/add settings + }; + writeFileSync( + join(geminiDir, 'settings.json'), + JSON.stringify(settings, null, 2), + ); } createFile(fileName, content) { @@ -39,7 +153,7 @@ export class TestRig { } mkdir(dir) { - mkdirSync(join(this.testDir, dir)); + mkdirSync(join(this.testDir, dir), { recursive: true }); } sync() { @@ -70,19 +184,88 @@ export class TestRig { command += ` ${args.join(' ')}`; - const output = execSync(command, execOptions); + const commandArgs = parse(command); + const node = commandArgs.shift(); - if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') { - const testId = `${env.TEST_FILE_NAME.replace( - '.test.js', - '', - )}:${this.testName.replace(/ /g, '-')}`; - console.log(`--- TEST: ${testId} ---`); - console.log(output); - console.log(`--- END TEST: ${testId} ---`); + const child = spawn(node, commandArgs, { + cwd: this.testDir, + stdio: 'pipe', + }); + + let stdout = ''; + let stderr = ''; + + // Handle stdin if provided + if (execOptions.input) { + child.stdin.write(execOptions.input); + child.stdin.end(); } - return output; + child.stdout.on('data', (data) => { + stdout += data; + if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') { + process.stdout.write(data); + } + }); + + child.stderr.on('data', (data) => { + stderr += data; + if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') { + process.stderr.write(data); + } + }); + + const promise = new Promise((resolve, reject) => { + child.on('close', (code) => { + if (code === 0) { + // Store the raw stdout for Podman telemetry parsing + this._lastRunStdout = stdout; + + // Filter out telemetry output when running with Podman + // Podman seems to output telemetry to stdout even when writing to file + let result = stdout; + if (env.GEMINI_SANDBOX === 'podman') { + // Remove telemetry JSON objects from output + // They are multi-line JSON objects that start with { and contain telemetry fields + const lines = result.split('\n'); + const filteredLines = []; + let inTelemetryObject = false; + let braceDepth = 0; + + for (const line of lines) { + if (!inTelemetryObject && line.trim() === '{') { + // Check if this might be start of telemetry object + inTelemetryObject = true; + braceDepth = 1; + } else if (inTelemetryObject) { + // Count braces to track nesting + for (const char of line) { + if (char === '{') braceDepth++; + else if (char === '}') braceDepth--; + } + + // Check if we've closed all braces + if (braceDepth === 0) { + inTelemetryObject = false; + // Skip this line (the closing brace) + continue; + } + } else { + // Not in telemetry object, keep the line + filteredLines.push(line); + } + } + + result = filteredLines.join('\n'); + } + resolve(result); + } else { + reject(new Error(`Process exited with code ${code}:\n${stderr}`)); + } + }); + }); + + return promise; } readFile(fileName) { @@ -98,4 +281,312 @@ export class TestRig { } return content; } + + async cleanup() { + // Clean up test directory + if (this.testDir && !env.KEEP_OUTPUT) { + try { + execSync(`rm -rf ${this.testDir}`); + } catch (error) { + // Ignore cleanup errors + if (env.VERBOSE === 'true') { + console.warn('Cleanup warning:', error.message); + } + } + } + } + + async waitForTelemetryReady() { + // In sandbox mode, telemetry is written to a relative path in the test directory + const logFilePath = + env.GEMINI_SANDBOX && env.GEMINI_SANDBOX !== 'false' + ? 
join(this.testDir, 'telemetry.log') + : env.TELEMETRY_LOG_FILE; + + if (!logFilePath) return; + + // Wait for telemetry file to exist and have content + await this.poll( + () => { + if (!fileExists(logFilePath)) return false; + try { + const content = readFileSync(logFilePath, 'utf-8'); + // Check if file has meaningful content (at least one complete JSON object) + return content.includes('"event.name"'); + } catch (_e) { + return false; + } + }, + 2000, // 2 seconds max - reduced since telemetry should flush on exit now + 100, // check every 100ms + ); + } + + async waitForToolCall(toolName, timeout) { + // Use environment-specific timeout + if (!timeout) { + timeout = this.getDefaultTimeout(); + } + + // Wait for telemetry to be ready before polling for tool calls + await this.waitForTelemetryReady(); + + return this.poll( + () => { + const toolLogs = this.readToolLogs(); + return toolLogs.some((log) => log.toolRequest.name === toolName); + }, + timeout, + 100, + ); + } + + async waitForAnyToolCall(toolNames, timeout) { + // Use environment-specific timeout + if (!timeout) { + timeout = this.getDefaultTimeout(); + } + + // Wait for telemetry to be ready before polling for tool calls + await this.waitForTelemetryReady(); + + return this.poll( + () => { + const toolLogs = this.readToolLogs(); + return toolNames.some((name) => + toolLogs.some((log) => log.toolRequest.name === name), + ); + }, + timeout, + 100, + ); + } + + async poll(predicate, timeout, interval) { + const startTime = Date.now(); + let attempts = 0; + while (Date.now() - startTime < timeout) { + attempts++; + const result = predicate(); + if (env.VERBOSE === 'true' && attempts % 5 === 0) { + console.log( + `Poll attempt ${attempts}: ${result ? 'success' : 'waiting...'}`, + ); + } + if (result) { + return true; + } + await new Promise((resolve) => setTimeout(resolve, interval)); + } + if (env.VERBOSE === 'true') { + console.log(`Poll timed out after ${attempts} attempts`); + } + return false; + } + + _parseToolLogsFromStdout(stdout) { + const logs = []; + + // The console output from Podman is JavaScript object notation, not JSON + // Look for tool call events in the output + // Updated regex to handle tool names with hyphens and underscores + const toolCallPattern = + /body:\s*'Tool call:\s*([\w-]+)\..*?Success:\s*(\w+)\..*?Duration:\s*(\d+)ms\.'/g; + const matches = [...stdout.matchAll(toolCallPattern)]; + + for (const match of matches) { + const toolName = match[1]; + const success = match[2] === 'true'; + const duration = parseInt(match[3], 10); + + // Try to find function_args nearby + const matchIndex = match.index || 0; + const contextStart = Math.max(0, matchIndex - 500); + const contextEnd = Math.min(stdout.length, matchIndex + 500); + const context = stdout.substring(contextStart, contextEnd); + + // Look for function_args in the context + let args = '{}'; + const argsMatch = context.match(/function_args:\s*'([^']+)'/); + if (argsMatch) { + args = argsMatch[1]; + } + + // Also try to find function_name to double-check + // Updated regex to handle tool names with hyphens and underscores + const nameMatch = context.match(/function_name:\s*'([\w-]+)'/); + const actualToolName = nameMatch ? 
nameMatch[1] : toolName; + + logs.push({ + timestamp: Date.now(), + toolRequest: { + name: actualToolName, + args: args, + success: success, + duration_ms: duration, + }, + }); + } + + // If no matches found with the simple pattern, try the JSON parsing approach + // in case the format changes + if (logs.length === 0) { + const lines = stdout.split('\n'); + let currentObject = ''; + let inObject = false; + let braceDepth = 0; + + for (const line of lines) { + if (!inObject && line.trim() === '{') { + inObject = true; + braceDepth = 1; + currentObject = line + '\n'; + } else if (inObject) { + currentObject += line + '\n'; + + // Count braces + for (const char of line) { + if (char === '{') braceDepth++; + else if (char === '}') braceDepth--; + } + + // If we've closed all braces, try to parse the object + if (braceDepth === 0) { + inObject = false; + try { + const obj = JSON.parse(currentObject); + + // Check for tool call in different formats + if ( + obj.body && + obj.body.includes('Tool call:') && + obj.attributes + ) { + const bodyMatch = obj.body.match(/Tool call: (\w+)\./); + if (bodyMatch) { + logs.push({ + timestamp: obj.timestamp || Date.now(), + toolRequest: { + name: bodyMatch[1], + args: obj.attributes.function_args || '{}', + success: obj.attributes.success !== false, + duration_ms: obj.attributes.duration_ms || 0, + }, + }); + } + } else if ( + obj.attributes && + obj.attributes['event.name'] === 'gemini_cli.tool_call' + ) { + logs.push({ + timestamp: obj.attributes['event.timestamp'], + toolRequest: { + name: obj.attributes.function_name, + args: obj.attributes.function_args, + success: obj.attributes.success, + duration_ms: obj.attributes.duration_ms, + }, + }); + } + } catch (_e) { + // Not valid JSON + } + currentObject = ''; + } + } + } + } + + return logs; + } + + readToolLogs() { + // For Podman, first check if telemetry file exists and has content + // If not, fall back to parsing from stdout + if (env.GEMINI_SANDBOX === 'podman') { + // Try reading from file first + const logFilePath = join(this.testDir, 'telemetry.log'); + + if (fileExists(logFilePath)) { + try { + const content = readFileSync(logFilePath, 'utf-8'); + if (content && content.includes('"event.name"')) { + // File has content, use normal file parsing + // Continue to the normal file parsing logic below + } else if (this._lastRunStdout) { + // File exists but is empty or doesn't have events, parse from stdout + return this._parseToolLogsFromStdout(this._lastRunStdout); + } + } catch (_e) { + // Error reading file, fall back to stdout + if (this._lastRunStdout) { + return this._parseToolLogsFromStdout(this._lastRunStdout); + } + } + } else if (this._lastRunStdout) { + // No file exists, parse from stdout + return this._parseToolLogsFromStdout(this._lastRunStdout); + } + } + + // In sandbox mode, telemetry is written to a relative path in the test directory + const logFilePath = + env.GEMINI_SANDBOX && env.GEMINI_SANDBOX !== 'false' + ? 
join(this.testDir, 'telemetry.log') + : env.TELEMETRY_LOG_FILE; + + if (!logFilePath) { + console.warn(`TELEMETRY_LOG_FILE environment variable not set`); + return []; + } + + // Check if file exists, if not return empty array (file might not be created yet) + if (!fileExists(logFilePath)) { + return []; + } + + const content = readFileSync(logFilePath, 'utf-8'); + + // Split the content into individual JSON objects + // They are separated by "}\n{" pattern + const jsonObjects = content + .split(/}\s*\n\s*{/) + .map((obj, index, array) => { + // Add back the braces we removed during split + if (index > 0) obj = '{' + obj; + if (index < array.length - 1) obj = obj + '}'; + return obj.trim(); + }) + .filter((obj) => obj); + + const logs = []; + + for (const jsonStr of jsonObjects) { + try { + const logData = JSON.parse(jsonStr); + // Look for tool call logs + if ( + logData.attributes && + logData.attributes['event.name'] === 'gemini_cli.tool_call' + ) { + const toolName = logData.attributes.function_name; + logs.push({ + toolRequest: { + name: toolName, + args: logData.attributes.function_args, + success: logData.attributes.success, + duration_ms: logData.attributes.duration_ms, + }, + }); + } + } catch (_e) { + // Skip objects that aren't valid JSON + if (env.VERBOSE === 'true') { + console.error('Failed to parse telemetry object:', _e.message); + } + } + } + + return logs; + } } diff --git a/integration-tests/write_file.test.js b/integration-tests/write_file.test.js index 46a15f3c..7809161e 100644 --- a/integration-tests/write_file.test.js +++ b/integration-tests/write_file.test.js @@ -6,16 +6,63 @@ import { test } from 'node:test'; import { strict as assert } from 'assert'; -import { TestRig } from './test-helper.js'; +import { + TestRig, + createToolCallErrorMessage, + printDebugInfo, + validateModelOutput, +} from './test-helper.js'; -test('should be able to write a file', async (t) => { +test('should be able to write a file', async () => { const rig = new TestRig(); - rig.setup(t.name); + await rig.setup('should be able to write a file'); const prompt = `show me an example of using the write tool. 
put a dad joke in dad.txt`; - await rig.run(prompt); + const result = await rig.run(prompt); + + const foundToolCall = await rig.waitForToolCall('write_file'); + + // Add debugging information + if (!foundToolCall) { + printDebugInfo(rig, result); + } + + const allTools = rig.readToolLogs(); + assert.ok( + foundToolCall, + createToolCallErrorMessage( + 'write_file', + allTools.map((t) => t.toolRequest.name), + result, + ), + ); + + // Validate model output - will throw if no output, warn if missing expected content + validateModelOutput(result, 'dad.txt', 'Write file test'); + const newFilePath = 'dad.txt'; const newFileContent = rig.readFile(newFilePath); - assert.notEqual(newFileContent, ''); + + // Add debugging for file content + if (newFileContent === '') { + console.error('File was created but is empty'); + console.error( + 'Tool calls:', + rig.readToolLogs().map((t) => ({ + name: t.toolRequest.name, + args: t.toolRequest.args, + })), + ); + } + + assert.notEqual(newFileContent, '', 'Expected file to have content'); + + // Log success info if verbose + if (process.env.VERBOSE === 'true') { + console.log( + 'File created successfully with content:', + newFileContent.substring(0, 100) + '...', + ); + } }); diff --git a/packages/core/src/utils/bfsFileSearch.test.ts b/packages/core/src/utils/bfsFileSearch.test.ts index 3d5a0010..ce19f80e 100644 --- a/packages/core/src/utils/bfsFileSearch.test.ts +++ b/packages/core/src/utils/bfsFileSearch.test.ts @@ -258,10 +258,12 @@ describe('bfsFileSearch', () => { expect(avgDuration).toBeLessThan(2000); // Very generous limit // Ensure consistency across runs (variance should not be too high) - expect(consistencyRatio).toBeLessThan(1.5); // Max variance should be less than 150% of average + // More tolerant in CI environments where performance can be variable + const maxConsistencyRatio = process.env.CI ? 3.0 : 1.5; + expect(consistencyRatio).toBeLessThan(maxConsistencyRatio); // Max variance should be reasonable console.log( - `✅ Performance test passed: avg=${avgDuration.toFixed(2)}ms, consistency=${(consistencyRatio * 100).toFixed(1)}%`, + `✅ Performance test passed: avg=${avgDuration.toFixed(2)}ms, consistency=${(consistencyRatio * 100).toFixed(1)}% (threshold: ${(maxConsistencyRatio * 100).toFixed(0)}%)`, ); }); });
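
Note on the test pattern this patch converges on: every refactored test awaits `rig.setup()`, runs the prompt, confirms the expected tool call landed in the telemetry log with `waitForToolCall`, dumps context via `printDebugInfo` only on failure, and uses `validateModelOutput` so missing phrasing in the model reply warns instead of failing. A minimal sketch of a new test written against these helpers follows; the 'glob' tool name, the prompt, and the expected strings are illustrative assumptions and are not part of this change:

    import { test } from 'node:test';
    import { strict as assert } from 'assert';
    import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';

    test('should be able to find files by pattern', async () => {
      const rig = new TestRig();
      await rig.setup('should be able to find files by pattern');
      rig.createFile('notes.md', 'hello world');

      const result = await rig.run('find all markdown files in this directory');

      // Hard requirement: the tool call must appear in the telemetry log.
      const foundToolCall = await rig.waitForToolCall('glob'); // hypothetical tool name
      if (!foundToolCall) {
        printDebugInfo(rig, result, { 'Found tool call': foundToolCall });
      }
      assert.ok(foundToolCall, 'Expected to find a glob tool call');

      // Soft requirement: warn (not fail) if the model omits the file name.
      validateModelOutput(result, 'notes.md', 'Glob test');
    });

The hard/soft split is the core of the de-flaking approach: tool-call telemetry is deterministic enough to assert on, while the model's exact wording is not.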