fix(tests): refactor integration tests to be less flaky (#4890)

Co-authored-by: matt korwel <matt.korwel@gmail.com>
This commit is contained in:
Allen Hutchison 2025-08-01 14:33:33 -07:00 committed by GitHub
parent dccca91fc9
commit 387706607d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 1073 additions and 115 deletions

View File

@ -6,25 +6,84 @@
import { strict as assert } from 'assert';
import { test } from 'node:test';
import { TestRig } from './test-helper.js';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
test('reads a file', (t) => {
test('should be able to read a file', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to read a file');
rig.createFile('test.txt', 'hello world');
const output = rig.run(`read the file name test.txt`);
const result = await rig.run(
`read the file test.txt and show me its contents`,
);
assert.ok(output.toLowerCase().includes('hello'));
const foundToolCall = await rig.waitForToolCall('read_file');
// Add debugging information
if (!foundToolCall || !result.includes('hello world')) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
'Contains hello world': result.includes('hello world'),
});
}
assert.ok(foundToolCall, 'Expected to find a read_file tool call');
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'hello world', 'File read test');
});
test('writes a file', (t) => {
test('should be able to write a file', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to write a file');
rig.createFile('test.txt', '');
rig.run(`edit test.txt to have a hello world message`);
const result = await rig.run(`edit test.txt to have a hello world message`);
// Accept multiple valid tools for editing files
const foundToolCall = await rig.waitForAnyToolCall([
'write_file',
'edit',
'replace',
]);
// Add debugging information
if (!foundToolCall) {
printDebugInfo(rig, result);
}
assert.ok(
foundToolCall,
'Expected to find a write_file, edit, or replace tool call',
);
// Validate model output - will throw if no output
validateModelOutput(result, null, 'File write test');
const fileContent = rig.readFile('test.txt');
assert.ok(fileContent.toLowerCase().includes('hello'));
// Add debugging for file content
if (!fileContent.toLowerCase().includes('hello')) {
const writeCalls = rig
.readToolLogs()
.filter((t) => t.toolRequest.name === 'write_file')
.map((t) => t.toolRequest.args);
printDebugInfo(rig, result, {
'File content mismatch': true,
'Expected to contain': 'hello',
'Actual content': fileContent,
'Write tool calls': JSON.stringify(writeCalls),
});
}
assert.ok(
fileContent.toLowerCase().includes('hello'),
'Expected file to contain hello',
);
// Log success info if verbose
if (process.env.VERBOSE === 'true') {
console.log('File written successfully with hello message.');
}
});

View File

@ -6,14 +6,69 @@
import { test } from 'node:test';
import { strict as assert } from 'assert';
import { TestRig } from './test-helper.js';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
test('should be able to search the web', async (t) => {
test('should be able to search the web', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to search the web');
const prompt = `what planet do we live on`;
const result = await rig.run(prompt);
let result;
try {
result = await rig.run(`what is the weather in London`);
} catch (error) {
// Network errors can occur in CI environments
if (
error.message.includes('network') ||
error.message.includes('timeout')
) {
console.warn('Skipping test due to network error:', error.message);
return; // Skip the test
}
throw error; // Re-throw if not a network error
}
assert.ok(result.toLowerCase().includes('earth'));
const foundToolCall = await rig.waitForToolCall('google_web_search');
// Add debugging information
if (!foundToolCall) {
const allTools = printDebugInfo(rig, result);
// Check if the tool call failed due to network issues
const failedSearchCalls = allTools.filter(
(t) =>
t.toolRequest.name === 'google_web_search' && !t.toolRequest.success,
);
if (failedSearchCalls.length > 0) {
console.warn(
'google_web_search tool was called but failed, possibly due to network issues',
);
console.warn(
'Failed calls:',
failedSearchCalls.map((t) => t.toolRequest.args),
);
return; // Skip the test if network issues
}
}
assert.ok(foundToolCall, 'Expected to find a call to google_web_search');
// Validate model output - will throw if no output, warn if missing expected content
const hasExpectedContent = validateModelOutput(
result,
['weather', 'london'],
'Google web search test',
);
// If content was missing, log the search queries used
if (!hasExpectedContent) {
const searchCalls = rig
.readToolLogs()
.filter((t) => t.toolRequest.name === 'google_web_search');
if (searchCalls.length > 0) {
console.warn(
'Search queries used:',
searchCalls.map((t) => t.toolRequest.args),
);
}
}
});

View File

@ -6,19 +6,57 @@
import { test } from 'node:test';
import { strict as assert } from 'assert';
import { TestRig } from './test-helper.js';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
import { existsSync } from 'fs';
import { join } from 'path';
test('should be able to list a directory', async (t) => {
test('should be able to list a directory', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to list a directory');
rig.createFile('file1.txt', 'file 1 content');
rig.mkdir('subdir');
rig.sync();
const prompt = `Can you list the files in the current directory. Display them in the style of 'ls'`;
const result = rig.run(prompt);
// Poll for filesystem changes to propagate in containers
await rig.poll(
() => {
// Check if the files exist in the test directory
const file1Path = join(rig.testDir, 'file1.txt');
const subdirPath = join(rig.testDir, 'subdir');
return existsSync(file1Path) && existsSync(subdirPath);
},
1000, // 1 second max wait
50, // check every 50ms
);
const lines = result.split('\n').filter((line) => line.trim() !== '');
assert.ok(lines.some((line) => line.includes('file1.txt')));
assert.ok(lines.some((line) => line.includes('subdir')));
const prompt = `Can you list the files in the current directory. Display them in the style of 'ls'`;
const result = await rig.run(prompt);
const foundToolCall = await rig.waitForToolCall('list_directory');
// Add debugging information
if (
!foundToolCall ||
!result.includes('file1.txt') ||
!result.includes('subdir')
) {
const allTools = printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
'Contains file1.txt': result.includes('file1.txt'),
'Contains subdir': result.includes('subdir'),
});
console.error(
'List directory calls:',
allTools
.filter((t) => t.toolRequest.name === 'list_directory')
.map((t) => t.toolRequest.args),
);
}
assert.ok(foundToolCall, 'Expected to find a list_directory tool call');
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, ['file1.txt', 'subdir'], 'List directory test');
});

View File

@ -6,17 +6,45 @@
import { test } from 'node:test';
import { strict as assert } from 'assert';
import { TestRig } from './test-helper.js';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
test.skip('should be able to read multiple files', async (t) => {
test('should be able to read multiple files', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to read multiple files');
rig.createFile('file1.txt', 'file 1 content');
rig.createFile('file2.txt', 'file 2 content');
const prompt = `Read the files in this directory, list them and print them to the screen`;
const prompt = `Please use read_many_files to read file1.txt and file2.txt and show me what's in them`;
const result = await rig.run(prompt);
assert.ok(result.includes('file 1 content'));
assert.ok(result.includes('file 2 content'));
// Check for either read_many_files or multiple read_file calls
const allTools = rig.readToolLogs();
const readManyFilesCall = await rig.waitForToolCall('read_many_files');
const readFileCalls = allTools.filter(
(t) => t.toolRequest.name === 'read_file',
);
// Accept either read_many_files OR at least 2 read_file calls
const foundValidPattern = readManyFilesCall || readFileCalls.length >= 2;
// Add debugging information
if (!foundValidPattern) {
printDebugInfo(rig, result, {
'read_many_files called': readManyFilesCall,
'read_file calls': readFileCalls.length,
});
}
assert.ok(
foundValidPattern,
'Expected to find either read_many_files or multiple read_file tool calls',
);
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(
result,
['file 1 content', 'file 2 content'],
'Read many files test',
);
});

View File

@ -6,17 +6,61 @@
import { test } from 'node:test';
import { strict as assert } from 'assert';
import { TestRig } from './test-helper.js';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
test('should be able to replace content in a file', async (t) => {
test('should be able to replace content in a file', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to replace content in a file');
const fileName = 'file_to_replace.txt';
rig.createFile(fileName, 'original content');
const originalContent = 'original content';
const expectedContent = 'replaced content';
rig.createFile(fileName, originalContent);
const prompt = `Can you replace 'original' with 'replaced' in the file 'file_to_replace.txt'`;
await rig.run(prompt);
const result = await rig.run(prompt);
const foundToolCall = await rig.waitForToolCall('replace');
// Add debugging information
if (!foundToolCall) {
printDebugInfo(rig, result);
}
assert.ok(foundToolCall, 'Expected to find a replace tool call');
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(
result,
['replaced', 'file_to_replace.txt'],
'Replace content test',
);
const newFileContent = rig.readFile(fileName);
assert.strictEqual(newFileContent, 'replaced content');
// Add debugging for file content
if (newFileContent !== expectedContent) {
console.error('File content mismatch - Debug info:');
console.error('Expected:', expectedContent);
console.error('Actual:', newFileContent);
console.error(
'Tool calls:',
rig.readToolLogs().map((t) => ({
name: t.toolRequest.name,
args: t.toolRequest.args,
})),
);
}
assert.strictEqual(
newFileContent,
expectedContent,
'File content should be updated correctly',
);
// Log success info if verbose
if (process.env.VERBOSE === 'true') {
console.log('File replaced successfully. New content:', newFileContent);
}
});

View File

@ -101,6 +101,7 @@ async function main() {
KEEP_OUTPUT: keepOutput.toString(),
VERBOSE: verbose.toString(),
TEST_FILE_NAME: testFileName,
TELEMETRY_LOG_FILE: join(testFileDir, 'telemetry.log'),
},
});

View File

@ -6,26 +6,58 @@
import { test } from 'node:test';
import { strict as assert } from 'assert';
import { TestRig } from './test-helper.js';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
test('should be able to run a shell command', async (t) => {
test('should be able to run a shell command', async () => {
const rig = new TestRig();
rig.setup(t.name);
rig.createFile('blah.txt', 'some content');
await rig.setup('should be able to run a shell command');
const prompt = `Can you use ls to list the contexts of the current folder`;
const result = rig.run(prompt);
const prompt = `Please run the command "echo hello-world" and show me the output`;
assert.ok(result.includes('blah.txt'));
const result = await rig.run(prompt);
const foundToolCall = await rig.waitForToolCall('run_shell_command');
// Add debugging information
if (!foundToolCall || !result.includes('hello-world')) {
printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
'Contains hello-world': result.includes('hello-world'),
});
}
assert.ok(foundToolCall, 'Expected to find a run_shell_command tool call');
// Validate model output - will throw if no output, warn if missing expected content
// Model often reports exit code instead of showing output
validateModelOutput(
result,
['hello-world', 'exit code 0'],
'Shell command test',
);
});
test('should be able to run a shell command via stdin', async (t) => {
test('should be able to run a shell command via stdin', async () => {
const rig = new TestRig();
rig.setup(t.name);
rig.createFile('blah.txt', 'some content');
await rig.setup('should be able to run a shell command via stdin');
const prompt = `Can you use ls to list the contexts of the current folder`;
const result = rig.run({ stdin: prompt });
const prompt = `Please run the command "echo test-stdin" and show me what it outputs`;
assert.ok(result.includes('blah.txt'));
const result = await rig.run({ stdin: prompt });
const foundToolCall = await rig.waitForToolCall('run_shell_command');
// Add debugging information
if (!foundToolCall || !result.includes('test-stdin')) {
printDebugInfo(rig, result, {
'Test type': 'Stdin test',
'Found tool call': foundToolCall,
'Contains test-stdin': result.includes('test-stdin'),
});
}
assert.ok(foundToolCall, 'Expected to find a run_shell_command tool call');
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'test-stdin', 'Shell command stdin test');
});

View File

@ -6,16 +6,36 @@
import { test } from 'node:test';
import { strict as assert } from 'assert';
import { TestRig } from './test-helper.js';
import { TestRig, printDebugInfo, validateModelOutput } from './test-helper.js';
test('should be able to save to memory', async (t) => {
test('should be able to save to memory', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to save to memory');
const prompt = `remember that my favorite color is blue.
what is my favorite color? tell me that and surround it with $ symbol`;
const result = await rig.run(prompt);
assert.ok(result.toLowerCase().includes('$blue$'));
const foundToolCall = await rig.waitForToolCall('save_memory');
// Add debugging information
if (!foundToolCall || !result.toLowerCase().includes('blue')) {
const allTools = printDebugInfo(rig, result, {
'Found tool call': foundToolCall,
'Contains blue': result.toLowerCase().includes('blue'),
});
console.error(
'Memory tool calls:',
allTools
.filter((t) => t.toolRequest.name === 'save_memory')
.map((t) => t.toolRequest.args),
);
}
assert.ok(foundToolCall, 'Expected to find a save_memory tool call');
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'blue', 'Save memory test');
});

View File

@ -4,67 +4,208 @@
* SPDX-License-Identifier: Apache-2.0
*/
import { test, describe, before, after } from 'node:test';
/**
* This test verifies MCP (Model Context Protocol) server integration.
* It uses a minimal MCP server implementation that doesn't require
* external dependencies, making it compatible with Docker sandbox mode.
*/
import { test, describe, before } from 'node:test';
import { strict as assert } from 'node:assert';
import { TestRig } from './test-helper.js';
import { spawn } from 'child_process';
import { TestRig, validateModelOutput } from './test-helper.js';
import { join } from 'path';
import { fileURLToPath } from 'url';
import { writeFileSync, unlinkSync } from 'fs';
import { writeFileSync } from 'fs';
const __dirname = fileURLToPath(new URL('.', import.meta.url));
const serverScriptPath = join(__dirname, './temp-server.js');
const serverScript = `
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { z } from 'zod';
// Create a minimal MCP server that doesn't require external dependencies
// This implements the MCP protocol directly using Node.js built-ins
const serverScript = `#!/usr/bin/env node
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
const server = new McpServer({
name: 'addition-server',
version: '1.0.0',
const readline = require('readline');
const fs = require('fs');
// Debug logging to stderr (only when MCP_DEBUG or VERBOSE is set)
const debugEnabled = process.env.MCP_DEBUG === 'true' || process.env.VERBOSE === 'true';
function debug(msg) {
if (debugEnabled) {
fs.writeSync(2, \`[MCP-DEBUG] \${msg}\\n\`);
}
}
debug('MCP server starting...');
// Simple JSON-RPC implementation for MCP
class SimpleJSONRPC {
constructor() {
this.handlers = new Map();
this.rl = readline.createInterface({
input: process.stdin,
output: process.stdout,
terminal: false
});
this.rl.on('line', (line) => {
debug(\`Received line: \${line}\`);
try {
const message = JSON.parse(line);
debug(\`Parsed message: \${JSON.stringify(message)}\`);
this.handleMessage(message);
} catch (e) {
debug(\`Parse error: \${e.message}\`);
}
});
}
send(message) {
const msgStr = JSON.stringify(message);
debug(\`Sending message: \${msgStr}\`);
process.stdout.write(msgStr + '\\n');
}
async handleMessage(message) {
if (message.method && this.handlers.has(message.method)) {
try {
const result = await this.handlers.get(message.method)(message.params || {});
if (message.id !== undefined) {
this.send({
jsonrpc: '2.0',
id: message.id,
result
});
}
} catch (error) {
if (message.id !== undefined) {
this.send({
jsonrpc: '2.0',
id: message.id,
error: {
code: -32603,
message: error.message
}
});
}
}
} else if (message.id !== undefined) {
this.send({
jsonrpc: '2.0',
id: message.id,
error: {
code: -32601,
message: 'Method not found'
}
});
}
}
on(method, handler) {
this.handlers.set(method, handler);
}
}
// Create MCP server
const rpc = new SimpleJSONRPC();
// Handle initialize
rpc.on('initialize', async (params) => {
debug('Handling initialize request');
return {
protocolVersion: '2024-11-05',
capabilities: {
tools: {}
},
serverInfo: {
name: 'addition-server',
version: '1.0.0'
}
};
});
server.registerTool(
'add',
{
title: 'Addition Tool',
description: 'Add two numbers',
inputSchema: { a: z.number(), b: z.number() },
},
async ({ a, b }) => ({
content: [{ type: 'text', text: String(a + b) }],
}),
);
// Handle tools/list
rpc.on('tools/list', async () => {
debug('Handling tools/list request');
return {
tools: [{
name: 'add',
description: 'Add two numbers',
inputSchema: {
type: 'object',
properties: {
a: { type: 'number', description: 'First number' },
b: { type: 'number', description: 'Second number' }
},
required: ['a', 'b']
}
}]
};
});
const transport = new StdioServerTransport();
await server.connect(transport);
// Handle tools/call
rpc.on('tools/call', async (params) => {
debug(\`Handling tools/call request for tool: \${params.name}\`);
if (params.name === 'add') {
const { a, b } = params.arguments;
return {
content: [{
type: 'text',
text: String(a + b)
}]
};
}
throw new Error('Unknown tool: ' + params.name);
});
// Send initialization notification
rpc.send({
jsonrpc: '2.0',
method: 'initialized'
});
`;
describe('simple-mcp-server', () => {
const rig = new TestRig();
let child;
before(() => {
writeFileSync(serverScriptPath, serverScript);
child = spawn('node', [serverScriptPath], {
stdio: ['pipe', 'pipe', 'pipe'],
before(async () => {
// Setup test directory with MCP server configuration
await rig.setup('simple-mcp-server', {
settings: {
mcpServers: {
'addition-server': {
command: 'node',
args: ['mcp-server.cjs'],
},
},
},
});
child.stderr.on('data', (data) => {
console.error(`stderr: ${data}`);
});
// Wait for the server to be ready
return new Promise((resolve) => setTimeout(resolve, 2000));
// Create server script in the test directory
const testServerPath = join(rig.testDir, 'mcp-server.cjs');
writeFileSync(testServerPath, serverScript);
// Make the script executable (though running with 'node' should work anyway)
if (process.platform !== 'win32') {
const { chmodSync } = await import('fs');
chmodSync(testServerPath, 0o755);
}
});
after(() => {
child.kill();
unlinkSync(serverScriptPath);
});
test('should add two numbers', async () => {
// Test directory is already set up in before hook
// Just run the command - MCP server config is in settings.json
const output = await rig.run('add 5 and 10');
test('should add two numbers', () => {
rig.setup('should add two numbers');
const output = rig.run('add 5 and 10');
assert.ok(output.includes('15'));
const foundToolCall = await rig.waitForToolCall('add');
assert.ok(foundToolCall, 'Expected to find an add tool call');
// Validate model output - will throw if no output, fail if missing expected content
validateModelOutput(output, '15', 'MCP server test');
assert.ok(output.includes('15'), 'Expected output to contain the sum (15)');
});
});

View File

@ -4,11 +4,13 @@
* SPDX-License-Identifier: Apache-2.0
*/
import { execSync } from 'child_process';
import { execSync, spawn } from 'child_process';
import { parse } from 'shell-quote';
import { mkdirSync, writeFileSync, readFileSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import { env } from 'process';
import { fileExists } from '../scripts/telemetry_utils.js';
const __dirname = dirname(fileURLToPath(import.meta.url));
@ -19,17 +21,129 @@ function sanitizeTestName(name) {
.replace(/-+/g, '-');
}
// Helper to create detailed error messages
export function createToolCallErrorMessage(expectedTools, foundTools, result) {
const expectedStr = Array.isArray(expectedTools)
? expectedTools.join(' or ')
: expectedTools;
return (
`Expected to find ${expectedStr} tool call(s). ` +
`Found: ${foundTools.length > 0 ? foundTools.join(', ') : 'none'}. ` +
`Output preview: ${result ? result.substring(0, 200) + '...' : 'no output'}`
);
}
// Helper to print debug information when tests fail
export function printDebugInfo(rig, result, context = {}) {
console.error('Test failed - Debug info:');
console.error('Result length:', result.length);
console.error('Result (first 500 chars):', result.substring(0, 500));
console.error(
'Result (last 500 chars):',
result.substring(result.length - 500),
);
// Print any additional context provided
Object.entries(context).forEach(([key, value]) => {
console.error(`${key}:`, value);
});
// Check what tools were actually called
const allTools = rig.readToolLogs();
console.error(
'All tool calls found:',
allTools.map((t) => t.toolRequest.name),
);
return allTools;
}
// Helper to validate model output and warn about unexpected content
export function validateModelOutput(
result,
expectedContent = null,
testName = '',
) {
// First, check if there's any output at all (this should fail the test if missing)
if (!result || result.trim().length === 0) {
throw new Error('Expected LLM to return some output');
}
// If expectedContent is provided, check for it and warn if missing
if (expectedContent) {
const contents = Array.isArray(expectedContent)
? expectedContent
: [expectedContent];
const missingContent = contents.filter((content) => {
if (typeof content === 'string') {
return !result.toLowerCase().includes(content.toLowerCase());
} else if (content instanceof RegExp) {
return !content.test(result);
}
return false;
});
if (missingContent.length > 0) {
console.warn(
`Warning: LLM did not include expected content in response: ${missingContent.join(', ')}.`,
'This is not ideal but not a test failure.',
);
console.warn(
'The tool was called successfully, which is the main requirement.',
);
return false;
} else if (process.env.VERBOSE === 'true') {
console.log(`${testName}: Model output validated successfully.`);
}
return true;
}
return true;
}
export class TestRig {
constructor() {
this.bundlePath = join(__dirname, '..', 'bundle/gemini.js');
this.testDir = null;
}
setup(testName) {
// Get timeout based on environment
getDefaultTimeout() {
if (env.CI) return 60000; // 1 minute in CI
if (env.GEMINI_SANDBOX) return 30000; // 30s in containers
return 15000; // 15s locally
}
setup(testName, options = {}) {
this.testName = testName;
const sanitizedName = sanitizeTestName(testName);
this.testDir = join(env.INTEGRATION_TEST_FILE_DIR, sanitizedName);
mkdirSync(this.testDir, { recursive: true });
// Create a settings file to point the CLI to the local collector
const geminiDir = join(this.testDir, '.gemini');
mkdirSync(geminiDir, { recursive: true });
// In sandbox mode, use an absolute path for telemetry inside the container
// The container mounts the test directory at the same path as the host
const telemetryPath =
env.GEMINI_SANDBOX && env.GEMINI_SANDBOX !== 'false'
? join(this.testDir, 'telemetry.log') // Absolute path in test directory
: env.TELEMETRY_LOG_FILE; // Absolute path for non-sandbox
const settings = {
telemetry: {
enabled: true,
target: 'local',
otlpEndpoint: '',
outfile: telemetryPath,
},
sandbox: env.GEMINI_SANDBOX !== 'false' ? env.GEMINI_SANDBOX : false,
...options.settings, // Allow tests to override/add settings
};
writeFileSync(
join(geminiDir, 'settings.json'),
JSON.stringify(settings, null, 2),
);
}
createFile(fileName, content) {
@ -39,7 +153,7 @@ export class TestRig {
}
mkdir(dir) {
mkdirSync(join(this.testDir, dir));
mkdirSync(join(this.testDir, dir), { recursive: true });
}
sync() {
@ -70,19 +184,88 @@ export class TestRig {
command += ` ${args.join(' ')}`;
const output = execSync(command, execOptions);
const commandArgs = parse(command);
const node = commandArgs.shift();
if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') {
const testId = `${env.TEST_FILE_NAME.replace(
'.test.js',
'',
)}:${this.testName.replace(/ /g, '-')}`;
console.log(`--- TEST: ${testId} ---`);
console.log(output);
console.log(`--- END TEST: ${testId} ---`);
const child = spawn(node, commandArgs, {
cwd: this.testDir,
stdio: 'pipe',
});
let stdout = '';
let stderr = '';
// Handle stdin if provided
if (execOptions.input) {
child.stdin.write(execOptions.input);
child.stdin.end();
}
return output;
child.stdout.on('data', (data) => {
stdout += data;
if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') {
process.stdout.write(data);
}
});
child.stderr.on('data', (data) => {
stderr += data;
if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') {
process.stderr.write(data);
}
});
const promise = new Promise((resolve, reject) => {
child.on('close', (code) => {
if (code === 0) {
// Store the raw stdout for Podman telemetry parsing
this._lastRunStdout = stdout;
// Filter out telemetry output when running with Podman
// Podman seems to output telemetry to stdout even when writing to file
let result = stdout;
if (env.GEMINI_SANDBOX === 'podman') {
// Remove telemetry JSON objects from output
// They are multi-line JSON objects that start with { and contain telemetry fields
const lines = result.split('\n');
const filteredLines = [];
let inTelemetryObject = false;
let braceDepth = 0;
for (const line of lines) {
if (!inTelemetryObject && line.trim() === '{') {
// Check if this might be start of telemetry object
inTelemetryObject = true;
braceDepth = 1;
} else if (inTelemetryObject) {
// Count braces to track nesting
for (const char of line) {
if (char === '{') braceDepth++;
else if (char === '}') braceDepth--;
}
// Check if we've closed all braces
if (braceDepth === 0) {
inTelemetryObject = false;
// Skip this line (the closing brace)
continue;
}
} else {
// Not in telemetry object, keep the line
filteredLines.push(line);
}
}
result = filteredLines.join('\n');
}
resolve(result);
} else {
reject(new Error(`Process exited with code ${code}:\n${stderr}`));
}
});
});
return promise;
}
readFile(fileName) {
@ -98,4 +281,312 @@ export class TestRig {
}
return content;
}
async cleanup() {
// Clean up test directory
if (this.testDir && !env.KEEP_OUTPUT) {
try {
execSync(`rm -rf ${this.testDir}`);
} catch (error) {
// Ignore cleanup errors
if (env.VERBOSE === 'true') {
console.warn('Cleanup warning:', error.message);
}
}
}
}
async waitForTelemetryReady() {
// In sandbox mode, telemetry is written to a relative path in the test directory
const logFilePath =
env.GEMINI_SANDBOX && env.GEMINI_SANDBOX !== 'false'
? join(this.testDir, 'telemetry.log')
: env.TELEMETRY_LOG_FILE;
if (!logFilePath) return;
// Wait for telemetry file to exist and have content
await this.poll(
() => {
if (!fileExists(logFilePath)) return false;
try {
const content = readFileSync(logFilePath, 'utf-8');
// Check if file has meaningful content (at least one complete JSON object)
return content.includes('"event.name"');
} catch (_e) {
return false;
}
},
2000, // 2 seconds max - reduced since telemetry should flush on exit now
100, // check every 100ms
);
}
async waitForToolCall(toolName, timeout) {
// Use environment-specific timeout
if (!timeout) {
timeout = this.getDefaultTimeout();
}
// Wait for telemetry to be ready before polling for tool calls
await this.waitForTelemetryReady();
return this.poll(
() => {
const toolLogs = this.readToolLogs();
return toolLogs.some((log) => log.toolRequest.name === toolName);
},
timeout,
100,
);
}
async waitForAnyToolCall(toolNames, timeout) {
// Use environment-specific timeout
if (!timeout) {
timeout = this.getDefaultTimeout();
}
// Wait for telemetry to be ready before polling for tool calls
await this.waitForTelemetryReady();
return this.poll(
() => {
const toolLogs = this.readToolLogs();
return toolNames.some((name) =>
toolLogs.some((log) => log.toolRequest.name === name),
);
},
timeout,
100,
);
}
async poll(predicate, timeout, interval) {
const startTime = Date.now();
let attempts = 0;
while (Date.now() - startTime < timeout) {
attempts++;
const result = predicate();
if (env.VERBOSE === 'true' && attempts % 5 === 0) {
console.log(
`Poll attempt ${attempts}: ${result ? 'success' : 'waiting...'}`,
);
}
if (result) {
return true;
}
await new Promise((resolve) => setTimeout(resolve, interval));
}
if (env.VERBOSE === 'true') {
console.log(`Poll timed out after ${attempts} attempts`);
}
return false;
}
_parseToolLogsFromStdout(stdout) {
const logs = [];
// The console output from Podman is JavaScript object notation, not JSON
// Look for tool call events in the output
// Updated regex to handle tool names with hyphens and underscores
const toolCallPattern =
/body:\s*'Tool call:\s*([\w-]+)\..*?Success:\s*(\w+)\..*?Duration:\s*(\d+)ms\.'/g;
const matches = [...stdout.matchAll(toolCallPattern)];
for (const match of matches) {
const toolName = match[1];
const success = match[2] === 'true';
const duration = parseInt(match[3], 10);
// Try to find function_args nearby
const matchIndex = match.index || 0;
const contextStart = Math.max(0, matchIndex - 500);
const contextEnd = Math.min(stdout.length, matchIndex + 500);
const context = stdout.substring(contextStart, contextEnd);
// Look for function_args in the context
let args = '{}';
const argsMatch = context.match(/function_args:\s*'([^']+)'/);
if (argsMatch) {
args = argsMatch[1];
}
// Also try to find function_name to double-check
// Updated regex to handle tool names with hyphens and underscores
const nameMatch = context.match(/function_name:\s*'([\w-]+)'/);
const actualToolName = nameMatch ? nameMatch[1] : toolName;
logs.push({
timestamp: Date.now(),
toolRequest: {
name: actualToolName,
args: args,
success: success,
duration_ms: duration,
},
});
}
// If no matches found with the simple pattern, try the JSON parsing approach
// in case the format changes
if (logs.length === 0) {
const lines = stdout.split('\n');
let currentObject = '';
let inObject = false;
let braceDepth = 0;
for (const line of lines) {
if (!inObject && line.trim() === '{') {
inObject = true;
braceDepth = 1;
currentObject = line + '\n';
} else if (inObject) {
currentObject += line + '\n';
// Count braces
for (const char of line) {
if (char === '{') braceDepth++;
else if (char === '}') braceDepth--;
}
// If we've closed all braces, try to parse the object
if (braceDepth === 0) {
inObject = false;
try {
const obj = JSON.parse(currentObject);
// Check for tool call in different formats
if (
obj.body &&
obj.body.includes('Tool call:') &&
obj.attributes
) {
const bodyMatch = obj.body.match(/Tool call: (\w+)\./);
if (bodyMatch) {
logs.push({
timestamp: obj.timestamp || Date.now(),
toolRequest: {
name: bodyMatch[1],
args: obj.attributes.function_args || '{}',
success: obj.attributes.success !== false,
duration_ms: obj.attributes.duration_ms || 0,
},
});
}
} else if (
obj.attributes &&
obj.attributes['event.name'] === 'gemini_cli.tool_call'
) {
logs.push({
timestamp: obj.attributes['event.timestamp'],
toolRequest: {
name: obj.attributes.function_name,
args: obj.attributes.function_args,
success: obj.attributes.success,
duration_ms: obj.attributes.duration_ms,
},
});
}
} catch (_e) {
// Not valid JSON
}
currentObject = '';
}
}
}
}
return logs;
}
readToolLogs() {
// For Podman, first check if telemetry file exists and has content
// If not, fall back to parsing from stdout
if (env.GEMINI_SANDBOX === 'podman') {
// Try reading from file first
const logFilePath = join(this.testDir, 'telemetry.log');
if (fileExists(logFilePath)) {
try {
const content = readFileSync(logFilePath, 'utf-8');
if (content && content.includes('"event.name"')) {
// File has content, use normal file parsing
// Continue to the normal file parsing logic below
} else if (this._lastRunStdout) {
// File exists but is empty or doesn't have events, parse from stdout
return this._parseToolLogsFromStdout(this._lastRunStdout);
}
} catch (_e) {
// Error reading file, fall back to stdout
if (this._lastRunStdout) {
return this._parseToolLogsFromStdout(this._lastRunStdout);
}
}
} else if (this._lastRunStdout) {
// No file exists, parse from stdout
return this._parseToolLogsFromStdout(this._lastRunStdout);
}
}
// In sandbox mode, telemetry is written to a relative path in the test directory
const logFilePath =
env.GEMINI_SANDBOX && env.GEMINI_SANDBOX !== 'false'
? join(this.testDir, 'telemetry.log')
: env.TELEMETRY_LOG_FILE;
if (!logFilePath) {
console.warn(`TELEMETRY_LOG_FILE environment variable not set`);
return [];
}
// Check if file exists, if not return empty array (file might not be created yet)
if (!fileExists(logFilePath)) {
return [];
}
const content = readFileSync(logFilePath, 'utf-8');
// Split the content into individual JSON objects
// They are separated by "}\n{" pattern
const jsonObjects = content
.split(/}\s*\n\s*{/)
.map((obj, index, array) => {
// Add back the braces we removed during split
if (index > 0) obj = '{' + obj;
if (index < array.length - 1) obj = obj + '}';
return obj.trim();
})
.filter((obj) => obj);
const logs = [];
for (const jsonStr of jsonObjects) {
try {
const logData = JSON.parse(jsonStr);
// Look for tool call logs
if (
logData.attributes &&
logData.attributes['event.name'] === 'gemini_cli.tool_call'
) {
const toolName = logData.attributes.function_name;
logs.push({
toolRequest: {
name: toolName,
args: logData.attributes.function_args,
success: logData.attributes.success,
duration_ms: logData.attributes.duration_ms,
},
});
}
} catch (_e) {
// Skip objects that aren't valid JSON
if (env.VERBOSE === 'true') {
console.error('Failed to parse telemetry object:', _e.message);
}
}
}
return logs;
}
}

View File

@ -6,16 +6,63 @@
import { test } from 'node:test';
import { strict as assert } from 'assert';
import { TestRig } from './test-helper.js';
import {
TestRig,
createToolCallErrorMessage,
printDebugInfo,
validateModelOutput,
} from './test-helper.js';
test('should be able to write a file', async (t) => {
test('should be able to write a file', async () => {
const rig = new TestRig();
rig.setup(t.name);
await rig.setup('should be able to write a file');
const prompt = `show me an example of using the write tool. put a dad joke in dad.txt`;
await rig.run(prompt);
const result = await rig.run(prompt);
const foundToolCall = await rig.waitForToolCall('write_file');
// Add debugging information
if (!foundToolCall) {
printDebugInfo(rig, result);
}
const allTools = rig.readToolLogs();
assert.ok(
foundToolCall,
createToolCallErrorMessage(
'write_file',
allTools.map((t) => t.toolRequest.name),
result,
),
);
// Validate model output - will throw if no output, warn if missing expected content
validateModelOutput(result, 'dad.txt', 'Write file test');
const newFilePath = 'dad.txt';
const newFileContent = rig.readFile(newFilePath);
assert.notEqual(newFileContent, '');
// Add debugging for file content
if (newFileContent === '') {
console.error('File was created but is empty');
console.error(
'Tool calls:',
rig.readToolLogs().map((t) => ({
name: t.toolRequest.name,
args: t.toolRequest.args,
})),
);
}
assert.notEqual(newFileContent, '', 'Expected file to have content');
// Log success info if verbose
if (process.env.VERBOSE === 'true') {
console.log(
'File created successfully with content:',
newFileContent.substring(0, 100) + '...',
);
}
});

View File

@ -258,10 +258,12 @@ describe('bfsFileSearch', () => {
expect(avgDuration).toBeLessThan(2000); // Very generous limit
// Ensure consistency across runs (variance should not be too high)
expect(consistencyRatio).toBeLessThan(1.5); // Max variance should be less than 150% of average
// More tolerant in CI environments where performance can be variable
const maxConsistencyRatio = process.env.CI ? 3.0 : 1.5;
expect(consistencyRatio).toBeLessThan(maxConsistencyRatio); // Max variance should be reasonable
console.log(
`✅ Performance test passed: avg=${avgDuration.toFixed(2)}ms, consistency=${(consistencyRatio * 100).toFixed(1)}%`,
`✅ Performance test passed: avg=${avgDuration.toFixed(2)}ms, consistency=${(consistencyRatio * 100).toFixed(1)}% (threshold: ${(maxConsistencyRatio * 100).toFixed(0)}%)`,
);
});
});