Implement loop check with LLM (#4337)
Co-authored-by: N. Taylor Mullen <ntaylormullen@google.com>
This commit is contained in:
parent
c5761317f4
commit
9dadf22958
|
@ -170,43 +170,45 @@ describe('Gemini Client (client.ts)', () => {
|
|||
getTool: vi.fn().mockReturnValue(null),
|
||||
};
|
||||
const fileService = new FileDiscoveryService('/test/dir');
|
||||
const MockedConfig = vi.mocked(Config, true);
|
||||
const contentGeneratorConfig = {
|
||||
model: 'test-model',
|
||||
apiKey: 'test-key',
|
||||
vertexai: false,
|
||||
authType: AuthType.USE_GEMINI,
|
||||
};
|
||||
MockedConfig.mockImplementation(() => {
|
||||
const mock = {
|
||||
getContentGeneratorConfig: vi
|
||||
.fn()
|
||||
.mockReturnValue(contentGeneratorConfig),
|
||||
getToolRegistry: vi.fn().mockResolvedValue(mockToolRegistry),
|
||||
getModel: vi.fn().mockReturnValue('test-model'),
|
||||
getEmbeddingModel: vi.fn().mockReturnValue('test-embedding-model'),
|
||||
getApiKey: vi.fn().mockReturnValue('test-key'),
|
||||
getVertexAI: vi.fn().mockReturnValue(false),
|
||||
getUserAgent: vi.fn().mockReturnValue('test-agent'),
|
||||
getUserMemory: vi.fn().mockReturnValue(''),
|
||||
getFullContext: vi.fn().mockReturnValue(false),
|
||||
getSessionId: vi.fn().mockReturnValue('test-session-id'),
|
||||
getProxy: vi.fn().mockReturnValue(undefined),
|
||||
getWorkingDir: vi.fn().mockReturnValue('/test/dir'),
|
||||
getFileService: vi.fn().mockReturnValue(fileService),
|
||||
getMaxSessionTurns: vi.fn().mockReturnValue(0),
|
||||
getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
|
||||
setQuotaErrorOccurred: vi.fn(),
|
||||
getNoBrowser: vi.fn().mockReturnValue(false),
|
||||
getIdeMode: vi.fn().mockReturnValue(false),
|
||||
};
|
||||
return mock as unknown as Config;
|
||||
});
|
||||
const mockConfigObject = {
|
||||
getContentGeneratorConfig: vi
|
||||
.fn()
|
||||
.mockReturnValue(contentGeneratorConfig),
|
||||
getToolRegistry: vi.fn().mockResolvedValue(mockToolRegistry),
|
||||
getModel: vi.fn().mockReturnValue('test-model'),
|
||||
getEmbeddingModel: vi.fn().mockReturnValue('test-embedding-model'),
|
||||
getApiKey: vi.fn().mockReturnValue('test-key'),
|
||||
getVertexAI: vi.fn().mockReturnValue(false),
|
||||
getUserAgent: vi.fn().mockReturnValue('test-agent'),
|
||||
getUserMemory: vi.fn().mockReturnValue(''),
|
||||
getFullContext: vi.fn().mockReturnValue(false),
|
||||
getSessionId: vi.fn().mockReturnValue('test-session-id'),
|
||||
getProxy: vi.fn().mockReturnValue(undefined),
|
||||
getWorkingDir: vi.fn().mockReturnValue('/test/dir'),
|
||||
getFileService: vi.fn().mockReturnValue(fileService),
|
||||
getMaxSessionTurns: vi.fn().mockReturnValue(0),
|
||||
getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
|
||||
setQuotaErrorOccurred: vi.fn(),
|
||||
getNoBrowser: vi.fn().mockReturnValue(false),
|
||||
getIdeMode: vi.fn().mockReturnValue(false),
|
||||
getGeminiClient: vi.fn(),
|
||||
};
|
||||
const MockedConfig = vi.mocked(Config, true);
|
||||
MockedConfig.mockImplementation(
|
||||
() => mockConfigObject as unknown as Config,
|
||||
);
|
||||
|
||||
// We can instantiate the client here since Config is mocked
|
||||
// and the constructor will use the mocked GoogleGenAI
|
||||
const mockConfig = new Config({} as never);
|
||||
client = new GeminiClient(mockConfig);
|
||||
client = new GeminiClient(new Config({} as never));
|
||||
mockConfigObject.getGeminiClient.mockReturnValue(client);
|
||||
|
||||
await client.initialize(contentGeneratorConfig);
|
||||
});
|
||||
|
||||
|
@ -655,6 +657,7 @@ describe('Gemini Client (client.ts)', () => {
|
|||
|
||||
const mockGenerator: Partial<ContentGenerator> = {
|
||||
countTokens: vi.fn().mockResolvedValue({ totalTokens: 0 }),
|
||||
generateContent: mockGenerateContentFn,
|
||||
};
|
||||
client['contentGenerator'] = mockGenerator as ContentGenerator;
|
||||
|
||||
|
@ -704,6 +707,7 @@ describe('Gemini Client (client.ts)', () => {
|
|||
|
||||
const mockGenerator: Partial<ContentGenerator> = {
|
||||
countTokens: vi.fn().mockResolvedValue({ totalTokens: 0 }),
|
||||
generateContent: mockGenerateContentFn,
|
||||
};
|
||||
client['contentGenerator'] = mockGenerator as ContentGenerator;
|
||||
|
||||
|
@ -796,6 +800,7 @@ describe('Gemini Client (client.ts)', () => {
|
|||
|
||||
const mockGenerator: Partial<ContentGenerator> = {
|
||||
countTokens: vi.fn().mockResolvedValue({ totalTokens: 0 }),
|
||||
generateContent: mockGenerateContentFn,
|
||||
};
|
||||
client['contentGenerator'] = mockGenerator as ContentGenerator;
|
||||
|
||||
|
@ -857,6 +862,7 @@ describe('Gemini Client (client.ts)', () => {
|
|||
|
||||
const mockGenerator: Partial<ContentGenerator> = {
|
||||
countTokens: vi.fn().mockResolvedValue({ totalTokens: 0 }),
|
||||
generateContent: mockGenerateContentFn,
|
||||
};
|
||||
client['contentGenerator'] = mockGenerator as ContentGenerator;
|
||||
|
||||
|
|
|
@ -324,6 +324,13 @@ This is the cursor position in the file:
|
|||
}
|
||||
|
||||
const turn = new Turn(this.getChat(), prompt_id);
|
||||
|
||||
const loopDetected = await this.loopDetector.turnStarted(signal);
|
||||
if (loopDetected) {
|
||||
yield { type: GeminiEventType.LoopDetected };
|
||||
return turn;
|
||||
}
|
||||
|
||||
const resultStream = turn.run(request, signal);
|
||||
for await (const event of resultStream) {
|
||||
if (this.loopDetector.addAndCheck(event)) {
|
||||
|
|
|
@ -4,16 +4,18 @@
|
|||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
||||
import { LoopDetectionService } from './loopDetectionService.js';
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
import { Config } from '../config/config.js';
|
||||
import { GeminiClient } from '../core/client.js';
|
||||
import {
|
||||
GeminiEventType,
|
||||
ServerGeminiContentEvent,
|
||||
ServerGeminiStreamEvent,
|
||||
ServerGeminiToolCallRequestEvent,
|
||||
} from '../core/turn.js';
|
||||
import { ServerGeminiStreamEvent } from '../core/turn.js';
|
||||
import { Config } from '../config/config.js';
|
||||
import * as loggers from '../telemetry/loggers.js';
|
||||
import { LoopType } from '../telemetry/types.js';
|
||||
import { LoopDetectionService } from './loopDetectionService.js';
|
||||
|
||||
vi.mock('../telemetry/loggers.js', () => ({
|
||||
logLoopDetected: vi.fn(),
|
||||
|
@ -330,3 +332,112 @@ describe('LoopDetectionService', () => {
|
|||
});
|
||||
});
|
||||
});
|
||||
|
||||
/**
 * Tests for the LLM-based loop check of LoopDetectionService.
 *
 * The Gemini client is replaced by a stub whose `generateJson` result is
 * controlled per test; `turnStarted` is driven directly to simulate turns.
 */
describe('LoopDetectionService LLM Checks', () => {
  let service: LoopDetectionService;
  let mockConfig: Config;
  let mockGeminiClient: GeminiClient;
  let abortController: AbortController;

  beforeEach(() => {
    mockGeminiClient = {
      getHistory: vi.fn().mockReturnValue([]),
      generateJson: vi.fn(),
    } as unknown as GeminiClient;

    mockConfig = {
      getGeminiClient: () => mockGeminiClient,
      getDebugMode: () => false,
      getTelemetryEnabled: () => true,
    } as unknown as Config;

    service = new LoopDetectionService(mockConfig);
    abortController = new AbortController();
    vi.clearAllMocks();
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  /** Starts `count` turns in sequence, discarding each turn's result. */
  const advanceTurns = async (count: number) => {
    for (let i = 0; i < count; i++) {
      await service.turnStarted(abortController.signal);
    }
  };

  it('should not trigger LLM check before LLM_CHECK_AFTER_TURNS', async () => {
    await advanceTurns(29);
    expect(mockGeminiClient.generateJson).not.toHaveBeenCalled();
  });

  it('should trigger LLM check on the 30th turn', async () => {
    mockGeminiClient.generateJson = vi
      .fn()
      .mockResolvedValue({ confidence: 0.1 });
    await advanceTurns(30);
    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1);
  });

  it('should detect a cognitive loop when confidence is high', async () => {
    // First check at turn 30
    mockGeminiClient.generateJson = vi
      .fn()
      .mockResolvedValue({ confidence: 0.85, reasoning: 'Repetitive actions' });
    await advanceTurns(30);
    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1);

    // The confidence of 0.85 will result in a low interval.
    // The interval will be: 5 + (15 - 5) * (1 - 0.85) = 5 + 10 * 0.15 = 6.5 -> rounded to 7
    await advanceTurns(6); // advance to turn 36

    mockGeminiClient.generateJson = vi
      .fn()
      .mockResolvedValue({ confidence: 0.95, reasoning: 'Repetitive actions' });
    const finalResult = await service.turnStarted(abortController.signal); // This is turn 37

    expect(finalResult).toBe(true);
    expect(loggers.logLoopDetected).toHaveBeenCalledWith(
      mockConfig,
      expect.objectContaining({
        'event.name': 'loop_detected',
        loop_type: LoopType.LLM_DETECTED_LOOP,
      }),
    );
  });

  it('should not detect a loop when confidence is low', async () => {
    mockGeminiClient.generateJson = vi
      .fn()
      .mockResolvedValue({ confidence: 0.5, reasoning: 'Looks okay' });
    await advanceTurns(29);

    // Turn 30 is the turn that actually runs the LLM check, so assert on it
    // rather than on a later turn where no check happens.
    const result = await service.turnStarted(abortController.signal);

    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1);
    expect(result).toBe(false);
    expect(loggers.logLoopDetected).not.toHaveBeenCalled();
  });

  it('should adjust the check interval based on confidence', async () => {
    // Confidence is 0.0, so interval should be MAX_LLM_CHECK_INTERVAL (15)
    mockGeminiClient.generateJson = vi
      .fn()
      .mockResolvedValue({ confidence: 0.0 });
    await advanceTurns(30); // First check at turn 30
    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1);

    await advanceTurns(14); // Advance to turn 44
    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1);

    await service.turnStarted(abortController.signal); // Turn 45
    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(2);
  });

  it('should handle errors from generateJson gracefully', async () => {
    mockGeminiClient.generateJson = vi
      .fn()
      .mockRejectedValue(new Error('API error'));
    await advanceTurns(29);

    // Turn 30 triggers the failing LLM call; its result is what must be false.
    const result = await service.turnStarted(abortController.signal);

    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1);
    expect(result).toBe(false);
    expect(loggers.logLoopDetected).not.toHaveBeenCalled();
  });
});
|
||||
|
|
|
@ -8,10 +8,40 @@ import { createHash } from 'crypto';
|
|||
import { GeminiEventType, ServerGeminiStreamEvent } from '../core/turn.js';
|
||||
import { logLoopDetected } from '../telemetry/loggers.js';
|
||||
import { LoopDetectedEvent, LoopType } from '../telemetry/types.js';
|
||||
import { Config } from '../config/config.js';
|
||||
import { Config, DEFAULT_GEMINI_FLASH_MODEL } from '../config/config.js';
|
||||
import { SchemaUnion, Type } from '@google/genai';
|
||||
|
||||
const TOOL_CALL_LOOP_THRESHOLD = 5;
|
||||
const CONTENT_LOOP_THRESHOLD = 10;
|
||||
|
||||
/**
|
||||
* The number of recent conversation turns to include in the history when asking the LLM to check for a loop.
|
||||
*/
|
||||
const LLM_LOOP_CHECK_HISTORY_COUNT = 20;
|
||||
|
||||
/**
|
||||
* The number of turns that must pass in a single prompt before the LLM-based loop check is activated.
|
||||
*/
|
||||
const LLM_CHECK_AFTER_TURNS = 30;
|
||||
|
||||
/**
|
||||
* The default interval, in number of turns, at which the LLM-based loop check is performed.
|
||||
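* This value is only the starting interval: after a check that reports a confidence in [0, 1],
* the interval is recomputed between MIN_LLM_CHECK_INTERVAL and MAX_LLM_CHECK_INTERVAL.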
|
||||
*/
|
||||
const DEFAULT_LLM_CHECK_INTERVAL = 3;
|
||||
|
||||
/**
|
||||
* The minimum interval for LLM-based loop checks.
|
||||
* This is used when the confidence of a loop is high, to check more frequently.
|
||||
*/
|
||||
const MIN_LLM_CHECK_INTERVAL = 5;
|
||||
|
||||
/**
|
||||
* The maximum interval for LLM-based loop checks.
|
||||
* This is used when the confidence of a loop is low, to check less frequently.
|
||||
*/
|
||||
const MAX_LLM_CHECK_INTERVAL = 15;
|
||||
|
||||
const SENTENCE_ENDING_PUNCTUATION_REGEX = /[.!?]+(?=\s|$)/;
|
||||
|
||||
/**
|
||||
|
@ -19,6 +49,8 @@ const SENTENCE_ENDING_PUNCTUATION_REGEX = /[.!?]+(?=\s|$)/;
|
|||
* Monitors tool call repetitions and content sentence repetitions.
|
||||
*/
|
||||
export class LoopDetectionService {
|
||||
private readonly config: Config;
|
||||
|
||||
// Tool call tracking
|
||||
private lastToolCallKey: string | null = null;
|
||||
private toolCallRepetitionCount: number = 0;
|
||||
|
@ -27,7 +59,11 @@ export class LoopDetectionService {
|
|||
private lastRepeatedSentence: string = '';
|
||||
private sentenceRepetitionCount: number = 0;
|
||||
private partialContent: string = '';
|
||||
private config: Config;
|
||||
|
||||
// LLM loop check tracking
|
||||
private turnsInCurrentPrompt = 0;
|
||||
private llmCheckInterval = DEFAULT_LLM_CHECK_INTERVAL;
|
||||
private lastCheckTurn = 0;
|
||||
|
||||
constructor(config: Config) {
|
||||
this.config = config;
|
||||
|
@ -58,6 +94,30 @@ export class LoopDetectionService {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Records that a new turn has begun and, when one is due, runs the LLM-based loop check.
 *
 * A check is due once at least `LLM_CHECK_AFTER_TURNS` turns have been counted and at
 * least `llmCheckInterval` turns have passed since the previous check. The turn of the
 * check is remembered before the check is awaited.
 *
 * @param signal - AbortSignal forwarded to the LLM check so it can be cancelled.
 * @returns A promise resolving to `true` when the LLM check reports a loop, otherwise `false`.
 */
async turnStarted(signal: AbortSignal) {
  this.turnsInCurrentPrompt += 1;

  const currentTurn = this.turnsInCurrentPrompt;
  const warmUpFinished = currentTurn >= LLM_CHECK_AFTER_TURNS;
  const intervalElapsed =
    currentTurn - this.lastCheckTurn >= this.llmCheckInterval;

  if (!(warmUpFinished && intervalElapsed)) {
    return false;
  }

  this.lastCheckTurn = currentTurn;
  return await this.checkForLoopWithLLM(signal);
}
|
||||
|
||||
private checkToolCallLoop(toolCall: { name: string; args: object }): boolean {
|
||||
const key = this.getToolCallKey(toolCall);
|
||||
if (this.lastToolCallKey === key) {
|
||||
|
@ -118,12 +178,83 @@ export class LoopDetectionService {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Asks the LLM whether the recent conversation is stuck in an unproductive loop.
 *
 * The last `LLM_LOOP_CHECK_HISTORY_COUNT` history entries plus a diagnostic prompt are
 * sent through the Gemini client's `generateJson`, which is expected to return an object
 * with `reasoning` and `confidence` fields.
 *
 * - A confidence above 0.9 is treated as a loop: the reasoning (if any) is written with
 *   `console.warn`, a `LoopDetectedEvent` is logged and `true` is returned.
 * - Otherwise the next check interval is scaled between `MIN_LLM_CHECK_INTERVAL`
 *   (high confidence) and `MAX_LLM_CHECK_INTERVAL` (low confidence).
 * - A failed request, a missing result, or a missing/NaN confidence is treated as
 *   "no loop" and leaves the current interval untouched.
 *
 * @param signal - AbortSignal forwarded to the LLM request.
 * @returns `true` when a loop is detected, `false` otherwise.
 */
private async checkForLoopWithLLM(signal: AbortSignal): Promise<boolean> {
  // Confidence strictly above this value counts as a detected loop.
  const loopConfidenceThreshold = 0.9;

  const geminiClient = this.config.getGeminiClient();
  const recentHistory = geminiClient
    .getHistory()
    .slice(-LLM_LOOP_CHECK_HISTORY_COUNT);

  const prompt = `You are a sophisticated AI diagnostic agent specializing in identifying when a conversational AI is stuck in an unproductive state. Your task is to analyze the provided conversation history and determine if the assistant has ceased to make meaningful progress.

An unproductive state is characterized by one or more of the following patterns over the last 5 or more assistant turns:

Repetitive Actions: The assistant repeats the same tool calls or conversational responses a decent number of times. This includes simple loops (e.g., tool_A, tool_A, tool_A) and alternating patterns (e.g., tool_A, tool_B, tool_A, tool_B, ...).

Cognitive Loop: The assistant seems unable to determine the next logical step. It might express confusion, repeatedly ask the same questions, or generate responses that don't logically follow from the previous turns, indicating it's stuck and not advancing the task.

Crucially, differentiate between a true unproductive state and legitimate, incremental progress.
For example, a series of 'tool_A' or 'tool_B' tool calls that make small, distinct changes to the same file (like adding docstrings to functions one by one) is considered forward progress and is NOT a loop. A loop would be repeatedly replacing the same text with the same content, or cycling between a small set of files with no net change.

Please analyze the conversation history to determine the possibility that the conversation is stuck in a repetitive, non-productive state.`;
  const contents = [
    ...recentHistory,
    { role: 'user', parts: [{ text: prompt }] },
  ];
  const schema: SchemaUnion = {
    type: Type.OBJECT,
    properties: {
      reasoning: {
        type: Type.STRING,
        description:
          'Your reasoning on if the conversation is looping without forward progress.',
      },
      confidence: {
        type: Type.NUMBER,
        description:
          'A number between 0.0 and 1.0 representing your confidence that the conversation is in an unproductive state.',
      },
    },
    required: ['reasoning', 'confidence'],
  };

  let result;
  try {
    result = await geminiClient.generateJson(
      contents,
      schema,
      signal,
      DEFAULT_GEMINI_FLASH_MODEL,
    );
  } catch (e) {
    // Best effort: a failed check is treated as a non-loop.
    if (this.config.getDebugMode()) {
      console.error(e);
    } else {
      console.debug(e);
    }
    return false;
  }

  // The value comes from a model, so validate it before it drives scheduling.
  // NaN passes a bare `typeof === 'number'` test and would turn the interval
  // into NaN, which makes every later "is a check due" comparison false.
  const reportedConfidence: unknown = result?.confidence;
  if (
    typeof reportedConfidence !== 'number' ||
    Number.isNaN(reportedConfidence)
  ) {
    return false;
  }

  // Clamp to the documented [0, 1] range so the interval stays within
  // [MIN_LLM_CHECK_INTERVAL, MAX_LLM_CHECK_INTERVAL].
  const confidence = Math.min(1, Math.max(0, reportedConfidence));

  if (confidence > loopConfidenceThreshold) {
    if (typeof result.reasoning === 'string' && result.reasoning) {
      console.warn(result.reasoning);
    }
    logLoopDetected(
      this.config,
      new LoopDetectedEvent(LoopType.LLM_DETECTED_LOOP),
    );
    return true;
  }

  // Higher confidence -> shorter interval (check again sooner).
  this.llmCheckInterval = Math.round(
    MIN_LLM_CHECK_INTERVAL +
      (MAX_LLM_CHECK_INTERVAL - MIN_LLM_CHECK_INTERVAL) * (1 - confidence),
  );
  return false;
}
|
||||
|
||||
/**
 * Clears every piece of loop-detection state held by this service:
 * tool-call tracking, sentence tracking and LLM-check tracking.
 */
reset(): void {
  // Order kept as in the original; each helper resets its own group of fields.
  this.resetToolCallCount();
  this.resetSentenceCount();
  this.resetLlmCheckTracking();
}
|
||||
|
||||
private resetToolCallCount(): void {
|
||||
|
@ -136,4 +267,10 @@ export class LoopDetectionService {
|
|||
this.sentenceRepetitionCount = 0;
|
||||
this.partialContent = '';
|
||||
}
|
||||
|
||||
/**
 * Restores the LLM-check bookkeeping to its initial values: no turns counted,
 * no check performed yet, and the default check interval.
 */
private resetLlmCheckTracking(): void {
  this.lastCheckTurn = 0;
  this.turnsInCurrentPrompt = 0;
  this.llmCheckInterval = DEFAULT_LLM_CHECK_INTERVAL;
}
|
||||
}
|
||||
|
|
|
@ -249,6 +249,7 @@ export class FlashFallbackEvent {
|
|||
/**
 * Identifies which kind of loop was reported; a value of this enum is passed
 * to `LoopDetectedEvent` when a loop is logged.
 */
export enum LoopType {
|
||||
CONSECUTIVE_IDENTICAL_TOOL_CALLS = 'consecutive_identical_tool_calls',
|
||||
CHANTING_IDENTICAL_SENTENCES = 'chanting_identical_sentences',
|
||||
// Reported by the LLM-based check in LoopDetectionService when the model's
// confidence that the conversation is unproductive exceeds its threshold.
LLM_DETECTED_LOOP = 'llm_detected_loop',
|
||||
}
|
||||
|
||||
export class LoopDetectedEvent {
|
||||
|
|
Loading…
Reference in New Issue