712 lines
21 KiB
TypeScript
712 lines
21 KiB
TypeScript
/**
|
|
* @license
|
|
* Copyright 2025 Google LLC
|
|
* SPDX-License-Identifier: Apache-2.0
|
|
*/
|
|
|
|
import { execSync, spawn } from 'child_process';
|
|
import { parse } from 'shell-quote';
|
|
import { mkdirSync, writeFileSync, readFileSync } from 'fs';
|
|
import { join, dirname } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import { env } from 'process';
|
|
import fs from 'fs';
|
|
|
|
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
|
|
function sanitizeTestName(name: string) {
|
|
return name
|
|
.toLowerCase()
|
|
.replace(/[^a-z0-9]/g, '-')
|
|
.replace(/-+/g, '-');
|
|
}
|
|
|
|
// Helper to create detailed error messages
|
|
export function createToolCallErrorMessage(
|
|
expectedTools: string | string[],
|
|
foundTools: string[],
|
|
result: string,
|
|
) {
|
|
const expectedStr = Array.isArray(expectedTools)
|
|
? expectedTools.join(' or ')
|
|
: expectedTools;
|
|
return (
|
|
`Expected to find ${expectedStr} tool call(s). ` +
|
|
`Found: ${foundTools.length > 0 ? foundTools.join(', ') : 'none'}. ` +
|
|
`Output preview: ${result ? result.substring(0, 200) + '...' : 'no output'}`
|
|
);
|
|
}
|
|
|
|
// Helper to print debug information when tests fail
|
|
export function printDebugInfo(
|
|
rig: TestRig,
|
|
result: string,
|
|
context: Record<string, unknown> = {},
|
|
) {
|
|
console.error('Test failed - Debug info:');
|
|
console.error('Result length:', result.length);
|
|
console.error('Result (first 500 chars):', result.substring(0, 500));
|
|
console.error(
|
|
'Result (last 500 chars):',
|
|
result.substring(result.length - 500),
|
|
);
|
|
|
|
// Print any additional context provided
|
|
Object.entries(context).forEach(([key, value]) => {
|
|
console.error(`${key}:`, value);
|
|
});
|
|
|
|
// Check what tools were actually called
|
|
const allTools = rig.readToolLogs();
|
|
console.error(
|
|
'All tool calls found:',
|
|
allTools.map((t) => t.toolRequest.name),
|
|
);
|
|
|
|
return allTools;
|
|
}
|
|
|
|
// Helper to validate model output and warn about unexpected content
|
|
export function validateModelOutput(
|
|
result: string,
|
|
expectedContent: string | (string | RegExp)[] | null = null,
|
|
testName = '',
|
|
) {
|
|
// First, check if there's any output at all (this should fail the test if missing)
|
|
if (!result || result.trim().length === 0) {
|
|
throw new Error('Expected LLM to return some output');
|
|
}
|
|
|
|
// If expectedContent is provided, check for it and warn if missing
|
|
if (expectedContent) {
|
|
const contents = Array.isArray(expectedContent)
|
|
? expectedContent
|
|
: [expectedContent];
|
|
const missingContent = contents.filter((content) => {
|
|
if (typeof content === 'string') {
|
|
return !result.toLowerCase().includes(content.toLowerCase());
|
|
} else if (content instanceof RegExp) {
|
|
return !content.test(result);
|
|
}
|
|
return false;
|
|
});
|
|
|
|
if (missingContent.length > 0) {
|
|
console.warn(
|
|
`Warning: LLM did not include expected content in response: ${missingContent.join(
|
|
', ',
|
|
)}.`,
|
|
'This is not ideal but not a test failure.',
|
|
);
|
|
console.warn(
|
|
'The tool was called successfully, which is the main requirement.',
|
|
);
|
|
return false;
|
|
} else if (process.env.VERBOSE === 'true') {
|
|
console.log(`${testName}: Model output validated successfully.`);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
export class TestRig {
|
|
bundlePath: string;
|
|
testDir: string | null;
|
|
testName?: string;
|
|
_lastRunStdout?: string;
|
|
|
|
constructor() {
|
|
this.bundlePath = join(__dirname, '..', 'bundle/gemini.js');
|
|
this.testDir = null;
|
|
}
|
|
|
|
// Get timeout based on environment
|
|
getDefaultTimeout() {
|
|
if (env.CI) return 60000; // 1 minute in CI
|
|
if (env.GEMINI_SANDBOX) return 30000; // 30s in containers
|
|
return 15000; // 15s locally
|
|
}
|
|
|
|
setup(
|
|
testName: string,
|
|
options: { settings?: Record<string, unknown> } = {},
|
|
) {
|
|
this.testName = testName;
|
|
const sanitizedName = sanitizeTestName(testName);
|
|
this.testDir = join(env.INTEGRATION_TEST_FILE_DIR!, sanitizedName);
|
|
mkdirSync(this.testDir, { recursive: true });
|
|
|
|
// Create a settings file to point the CLI to the local collector
|
|
const geminiDir = join(this.testDir, '.gemini');
|
|
mkdirSync(geminiDir, { recursive: true });
|
|
// In sandbox mode, use an absolute path for telemetry inside the container
|
|
// The container mounts the test directory at the same path as the host
|
|
const telemetryPath = join(this.testDir, 'telemetry.log'); // Always use test directory for telemetry
|
|
|
|
const settings = {
|
|
telemetry: {
|
|
enabled: true,
|
|
target: 'local',
|
|
otlpEndpoint: '',
|
|
outfile: telemetryPath,
|
|
},
|
|
sandbox: env.GEMINI_SANDBOX !== 'false' ? env.GEMINI_SANDBOX : false,
|
|
...options.settings, // Allow tests to override/add settings
|
|
};
|
|
writeFileSync(
|
|
join(geminiDir, 'settings.json'),
|
|
JSON.stringify(settings, null, 2),
|
|
);
|
|
}
|
|
|
|
createFile(fileName: string, content: string) {
|
|
const filePath = join(this.testDir!, fileName);
|
|
writeFileSync(filePath, content);
|
|
return filePath;
|
|
}
|
|
|
|
mkdir(dir: string) {
|
|
mkdirSync(join(this.testDir!, dir), { recursive: true });
|
|
}
|
|
|
|
sync() {
|
|
// ensure file system is done before spawning
|
|
execSync('sync', { cwd: this.testDir! });
|
|
}
|
|
|
|
run(
|
|
promptOrOptions: string | { prompt?: string; stdin?: string },
|
|
...args: string[]
|
|
): Promise<string> {
|
|
let command = `node ${this.bundlePath} --yolo`;
|
|
const execOptions: {
|
|
cwd: string;
|
|
encoding: 'utf-8';
|
|
input?: string;
|
|
} = {
|
|
cwd: this.testDir!,
|
|
encoding: 'utf-8',
|
|
};
|
|
|
|
if (typeof promptOrOptions === 'string') {
|
|
command += ` --prompt ${JSON.stringify(promptOrOptions)}`;
|
|
} else if (
|
|
typeof promptOrOptions === 'object' &&
|
|
promptOrOptions !== null
|
|
) {
|
|
if (promptOrOptions.prompt) {
|
|
command += ` --prompt ${JSON.stringify(promptOrOptions.prompt)}`;
|
|
}
|
|
if (promptOrOptions.stdin) {
|
|
execOptions.input = promptOrOptions.stdin;
|
|
}
|
|
}
|
|
|
|
command += ` ${args.join(' ')}`;
|
|
|
|
const commandArgs = parse(command);
|
|
const node = commandArgs.shift() as string;
|
|
|
|
const child = spawn(node, commandArgs as string[], {
|
|
cwd: this.testDir!,
|
|
stdio: 'pipe',
|
|
});
|
|
|
|
let stdout = '';
|
|
let stderr = '';
|
|
|
|
// Handle stdin if provided
|
|
if (execOptions.input) {
|
|
child.stdin!.write(execOptions.input);
|
|
}
|
|
child.stdin!.end();
|
|
|
|
child.stdout!.on('data', (data: Buffer) => {
|
|
stdout += data;
|
|
if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') {
|
|
process.stdout.write(data);
|
|
}
|
|
});
|
|
|
|
child.stderr!.on('data', (data: Buffer) => {
|
|
stderr += data;
|
|
if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') {
|
|
process.stderr.write(data);
|
|
}
|
|
});
|
|
|
|
const promise = new Promise<string>((resolve, reject) => {
|
|
child.on('close', (code: number) => {
|
|
if (code === 0) {
|
|
// Store the raw stdout for Podman telemetry parsing
|
|
this._lastRunStdout = stdout;
|
|
|
|
// Filter out telemetry output when running with Podman
|
|
// Podman seems to output telemetry to stdout even when writing to file
|
|
let result = stdout;
|
|
if (env.GEMINI_SANDBOX === 'podman') {
|
|
// Remove telemetry JSON objects from output
|
|
// They are multi-line JSON objects that start with { and contain telemetry fields
|
|
const lines = result.split('\n');
|
|
const filteredLines = [];
|
|
let inTelemetryObject = false;
|
|
let braceDepth = 0;
|
|
|
|
for (const line of lines) {
|
|
if (!inTelemetryObject && line.trim() === '{') {
|
|
// Check if this might be start of telemetry object
|
|
inTelemetryObject = true;
|
|
braceDepth = 1;
|
|
} else if (inTelemetryObject) {
|
|
// Count braces to track nesting
|
|
for (const char of line) {
|
|
if (char === '{') braceDepth++;
|
|
else if (char === '}') braceDepth--;
|
|
}
|
|
|
|
// Check if we've closed all braces
|
|
if (braceDepth === 0) {
|
|
inTelemetryObject = false;
|
|
// Skip this line (the closing brace)
|
|
continue;
|
|
}
|
|
} else {
|
|
// Not in telemetry object, keep the line
|
|
filteredLines.push(line);
|
|
}
|
|
}
|
|
|
|
result = filteredLines.join('\n');
|
|
}
|
|
// If we have stderr output, include that also
|
|
if (stderr) {
|
|
result += `\n\nStdErr:\n${stderr}`;
|
|
}
|
|
|
|
resolve(result);
|
|
} else {
|
|
reject(new Error(`Process exited with code ${code}:\n${stderr}`));
|
|
}
|
|
});
|
|
});
|
|
|
|
return promise;
|
|
}
|
|
|
|
readFile(fileName: string) {
|
|
const filePath = join(this.testDir!, fileName);
|
|
const content = readFileSync(filePath, 'utf-8');
|
|
if (env.KEEP_OUTPUT === 'true' || env.VERBOSE === 'true') {
|
|
console.log(`--- FILE: ${filePath} ---`);
|
|
console.log(content);
|
|
console.log(`--- END FILE: ${filePath} ---`);
|
|
}
|
|
return content;
|
|
}
|
|
|
|
async cleanup() {
|
|
// Clean up test directory
|
|
if (this.testDir && !env.KEEP_OUTPUT) {
|
|
try {
|
|
execSync(`rm -rf ${this.testDir}`);
|
|
} catch (error) {
|
|
// Ignore cleanup errors
|
|
if (env.VERBOSE === 'true') {
|
|
console.warn('Cleanup warning:', (error as Error).message);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
async waitForTelemetryReady() {
|
|
// Telemetry is always written to the test directory
|
|
const logFilePath = join(this.testDir!, 'telemetry.log');
|
|
|
|
if (!logFilePath) return;
|
|
|
|
// Wait for telemetry file to exist and have content
|
|
await this.poll(
|
|
() => {
|
|
if (!fs.existsSync(logFilePath)) return false;
|
|
try {
|
|
const content = readFileSync(logFilePath, 'utf-8');
|
|
// Check if file has meaningful content (at least one complete JSON object)
|
|
return content.includes('"event.name"');
|
|
} catch {
|
|
return false;
|
|
}
|
|
},
|
|
2000, // 2 seconds max - reduced since telemetry should flush on exit now
|
|
100, // check every 100ms
|
|
);
|
|
}
|
|
|
|
async waitForTelemetryEvent(eventName: string, timeout?: number) {
|
|
if (!timeout) {
|
|
timeout = this.getDefaultTimeout();
|
|
}
|
|
|
|
await this.waitForTelemetryReady();
|
|
|
|
return this.poll(
|
|
() => {
|
|
const logFilePath = join(this.testDir!, 'telemetry.log');
|
|
|
|
if (!logFilePath || !fs.existsSync(logFilePath)) {
|
|
return false;
|
|
}
|
|
|
|
const content = readFileSync(logFilePath, 'utf-8');
|
|
const jsonObjects = content
|
|
.split(/}\n{/)
|
|
.map((obj, index, array) => {
|
|
// Add back the braces we removed during split
|
|
if (index > 0) obj = '{' + obj;
|
|
if (index < array.length - 1) obj = obj + '}';
|
|
return obj.trim();
|
|
})
|
|
.filter((obj) => obj);
|
|
|
|
for (const jsonStr of jsonObjects) {
|
|
try {
|
|
const logData = JSON.parse(jsonStr);
|
|
if (
|
|
logData.attributes &&
|
|
logData.attributes['event.name'] === `gemini_cli.${eventName}`
|
|
) {
|
|
return true;
|
|
}
|
|
} catch {
|
|
// ignore
|
|
}
|
|
}
|
|
return false;
|
|
},
|
|
timeout,
|
|
100,
|
|
);
|
|
}
|
|
|
|
async waitForToolCall(toolName: string, timeout?: number) {
|
|
// Use environment-specific timeout
|
|
if (!timeout) {
|
|
timeout = this.getDefaultTimeout();
|
|
}
|
|
|
|
// Wait for telemetry to be ready before polling for tool calls
|
|
await this.waitForTelemetryReady();
|
|
|
|
return this.poll(
|
|
() => {
|
|
const toolLogs = this.readToolLogs();
|
|
return toolLogs.some((log) => log.toolRequest.name === toolName);
|
|
},
|
|
timeout,
|
|
100,
|
|
);
|
|
}
|
|
|
|
async waitForAnyToolCall(toolNames: string[], timeout?: number) {
|
|
// Use environment-specific timeout
|
|
if (!timeout) {
|
|
timeout = this.getDefaultTimeout();
|
|
}
|
|
|
|
// Wait for telemetry to be ready before polling for tool calls
|
|
await this.waitForTelemetryReady();
|
|
|
|
return this.poll(
|
|
() => {
|
|
const toolLogs = this.readToolLogs();
|
|
return toolNames.some((name) =>
|
|
toolLogs.some((log) => log.toolRequest.name === name),
|
|
);
|
|
},
|
|
timeout,
|
|
100,
|
|
);
|
|
}
|
|
|
|
async poll(
|
|
predicate: () => boolean,
|
|
timeout: number,
|
|
interval: number,
|
|
): Promise<boolean> {
|
|
const startTime = Date.now();
|
|
let attempts = 0;
|
|
while (Date.now() - startTime < timeout) {
|
|
attempts++;
|
|
const result = predicate();
|
|
if (env.VERBOSE === 'true' && attempts % 5 === 0) {
|
|
console.log(
|
|
`Poll attempt ${attempts}: ${result ? 'success' : 'waiting...'}`,
|
|
);
|
|
}
|
|
if (result) {
|
|
return true;
|
|
}
|
|
await new Promise((resolve) => setTimeout(resolve, interval));
|
|
}
|
|
if (env.VERBOSE === 'true') {
|
|
console.log(`Poll timed out after ${attempts} attempts`);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
_parseToolLogsFromStdout(stdout: string) {
|
|
const logs: {
|
|
timestamp: number;
|
|
toolRequest: {
|
|
name: string;
|
|
args: string;
|
|
success: boolean;
|
|
duration_ms: number;
|
|
};
|
|
}[] = [];
|
|
|
|
// The console output from Podman is JavaScript object notation, not JSON
|
|
// Look for tool call events in the output
|
|
// Updated regex to handle tool names with hyphens and underscores
|
|
const toolCallPattern =
|
|
/body:\s*'Tool call:\s*([\w-]+)\..*?Success:\s*(\w+)\..*?Duration:\s*(\d+)ms\.'/g;
|
|
const matches = [...stdout.matchAll(toolCallPattern)];
|
|
|
|
for (const match of matches) {
|
|
const toolName = match[1];
|
|
const success = match[2] === 'true';
|
|
const duration = parseInt(match[3], 10);
|
|
|
|
// Try to find function_args nearby
|
|
const matchIndex = match.index || 0;
|
|
const contextStart = Math.max(0, matchIndex - 500);
|
|
const contextEnd = Math.min(stdout.length, matchIndex + 500);
|
|
const context = stdout.substring(contextStart, contextEnd);
|
|
|
|
// Look for function_args in the context
|
|
let args = '{}';
|
|
const argsMatch = context.match(/function_args:\s*'([^']+)'/);
|
|
if (argsMatch) {
|
|
args = argsMatch[1];
|
|
}
|
|
|
|
// Also try to find function_name to double-check
|
|
// Updated regex to handle tool names with hyphens and underscores
|
|
const nameMatch = context.match(/function_name:\s*'([\w-]+)'/);
|
|
const actualToolName = nameMatch ? nameMatch[1] : toolName;
|
|
|
|
logs.push({
|
|
timestamp: Date.now(),
|
|
toolRequest: {
|
|
name: actualToolName,
|
|
args: args,
|
|
success: success,
|
|
duration_ms: duration,
|
|
},
|
|
});
|
|
}
|
|
|
|
// If no matches found with the simple pattern, try the JSON parsing approach
|
|
// in case the format changes
|
|
if (logs.length === 0) {
|
|
const lines = stdout.split('\n');
|
|
let currentObject = '';
|
|
let inObject = false;
|
|
let braceDepth = 0;
|
|
|
|
for (const line of lines) {
|
|
if (!inObject && line.trim() === '{') {
|
|
inObject = true;
|
|
braceDepth = 1;
|
|
currentObject = line + '\n';
|
|
} else if (inObject) {
|
|
currentObject += line + '\n';
|
|
|
|
// Count braces
|
|
for (const char of line) {
|
|
if (char === '{') braceDepth++;
|
|
else if (char === '}') braceDepth--;
|
|
}
|
|
|
|
// If we've closed all braces, try to parse the object
|
|
if (braceDepth === 0) {
|
|
inObject = false;
|
|
try {
|
|
const obj = JSON.parse(currentObject);
|
|
|
|
// Check for tool call in different formats
|
|
if (
|
|
obj.body &&
|
|
obj.body.includes('Tool call:') &&
|
|
obj.attributes
|
|
) {
|
|
const bodyMatch = obj.body.match(/Tool call: (\w+)\./);
|
|
if (bodyMatch) {
|
|
logs.push({
|
|
timestamp: obj.timestamp || Date.now(),
|
|
toolRequest: {
|
|
name: bodyMatch[1],
|
|
args: obj.attributes.function_args || '{}',
|
|
success: obj.attributes.success !== false,
|
|
duration_ms: obj.attributes.duration_ms || 0,
|
|
},
|
|
});
|
|
}
|
|
} else if (
|
|
obj.attributes &&
|
|
obj.attributes['event.name'] === 'gemini_cli.tool_call'
|
|
) {
|
|
logs.push({
|
|
timestamp: obj.attributes['event.timestamp'],
|
|
toolRequest: {
|
|
name: obj.attributes.function_name,
|
|
args: obj.attributes.function_args,
|
|
success: obj.attributes.success,
|
|
duration_ms: obj.attributes.duration_ms,
|
|
},
|
|
});
|
|
}
|
|
} catch {
|
|
// Not valid JSON
|
|
}
|
|
currentObject = '';
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return logs;
|
|
}
|
|
|
|
readToolLogs() {
|
|
// For Podman, first check if telemetry file exists and has content
|
|
// If not, fall back to parsing from stdout
|
|
if (env.GEMINI_SANDBOX === 'podman') {
|
|
// Try reading from file first
|
|
const logFilePath = join(this.testDir!, 'telemetry.log');
|
|
|
|
if (fs.existsSync(logFilePath)) {
|
|
try {
|
|
const content = readFileSync(logFilePath, 'utf-8');
|
|
if (content && content.includes('"event.name"')) {
|
|
// File has content, use normal file parsing
|
|
// Continue to the normal file parsing logic below
|
|
} else if (this._lastRunStdout) {
|
|
// File exists but is empty or doesn't have events, parse from stdout
|
|
return this._parseToolLogsFromStdout(this._lastRunStdout);
|
|
}
|
|
} catch {
|
|
// Error reading file, fall back to stdout
|
|
if (this._lastRunStdout) {
|
|
return this._parseToolLogsFromStdout(this._lastRunStdout);
|
|
}
|
|
}
|
|
} else if (this._lastRunStdout) {
|
|
// No file exists, parse from stdout
|
|
return this._parseToolLogsFromStdout(this._lastRunStdout);
|
|
}
|
|
}
|
|
|
|
// Telemetry is always written to the test directory
|
|
const logFilePath = join(this.testDir!, 'telemetry.log');
|
|
|
|
if (!logFilePath) {
|
|
console.warn(`TELEMETRY_LOG_FILE environment variable not set`);
|
|
return [];
|
|
}
|
|
|
|
// Check if file exists, if not return empty array (file might not be created yet)
|
|
if (!fs.existsSync(logFilePath)) {
|
|
return [];
|
|
}
|
|
|
|
const content = readFileSync(logFilePath, 'utf-8');
|
|
|
|
// Split the content into individual JSON objects
|
|
// They are separated by "}\n{"
|
|
const jsonObjects = content
|
|
.split(/}\n{/)
|
|
.map((obj, index, array) => {
|
|
// Add back the braces we removed during split
|
|
if (index > 0) obj = '{' + obj;
|
|
if (index < array.length - 1) obj = obj + '}';
|
|
return obj.trim();
|
|
})
|
|
.filter((obj) => obj);
|
|
|
|
const logs: {
|
|
toolRequest: {
|
|
name: string;
|
|
args: string;
|
|
success: boolean;
|
|
duration_ms: number;
|
|
};
|
|
}[] = [];
|
|
|
|
for (const jsonStr of jsonObjects) {
|
|
try {
|
|
const logData = JSON.parse(jsonStr);
|
|
// Look for tool call logs
|
|
if (
|
|
logData.attributes &&
|
|
logData.attributes['event.name'] === 'gemini_cli.tool_call'
|
|
) {
|
|
const toolName = logData.attributes.function_name;
|
|
logs.push({
|
|
toolRequest: {
|
|
name: toolName,
|
|
args: logData.attributes.function_args,
|
|
success: logData.attributes.success,
|
|
duration_ms: logData.attributes.duration_ms,
|
|
},
|
|
});
|
|
}
|
|
} catch (e) {
|
|
// Skip objects that aren't valid JSON
|
|
if (env.VERBOSE === 'true') {
|
|
console.error('Failed to parse telemetry object:', e);
|
|
}
|
|
}
|
|
}
|
|
|
|
return logs;
|
|
}
|
|
|
|
readLastApiRequest(): Record<string, unknown> | null {
|
|
// Telemetry is always written to the test directory
|
|
const logFilePath = join(this.testDir!, 'telemetry.log');
|
|
|
|
if (!logFilePath || !fs.existsSync(logFilePath)) {
|
|
return null;
|
|
}
|
|
|
|
const content = readFileSync(logFilePath, 'utf-8');
|
|
const jsonObjects = content
|
|
.split(/}\n{/)
|
|
.map((obj, index, array) => {
|
|
if (index > 0) obj = '{' + obj;
|
|
if (index < array.length - 1) obj = obj + '}';
|
|
return obj.trim();
|
|
})
|
|
.filter((obj) => obj);
|
|
|
|
let lastApiRequest = null;
|
|
|
|
for (const jsonStr of jsonObjects) {
|
|
try {
|
|
const logData = JSON.parse(jsonStr);
|
|
if (
|
|
logData.attributes &&
|
|
logData.attributes['event.name'] === 'gemini_cli.api_request'
|
|
) {
|
|
lastApiRequest = logData;
|
|
}
|
|
} catch {
|
|
// ignore
|
|
}
|
|
}
|
|
return lastApiRequest;
|
|
}
|
|
}
|