diff --git a/packages/server/src/core/client.ts b/packages/server/src/core/client.ts index 46af465a..4c20c0ad 100644 --- a/packages/server/src/core/client.ts +++ b/packages/server/src/core/client.ts @@ -21,6 +21,7 @@ import { Config } from '../config/config.js'; import { getCoreSystemPrompt } from './prompts.js'; import { ReadManyFilesTool } from '../tools/read-many-files.js'; import { getResponseText } from '../utils/generateContentResponseUtilities.js'; +import { checkNextSpeaker } from '../utils/nextSpeakerChecker.js'; export class GeminiClient { private client: GoogleGenAI; @@ -103,6 +104,16 @@ export class GeminiClient { .getToolRegistry() .getFunctionDeclarations(); const tools: Tool[] = [{ functionDeclarations: toolDeclarations }]; + const history: Content[] = [ + { + role: 'user', + parts: envParts, + }, + { + role: 'model', + parts: [{ text: 'Got it. Thanks for the context!' }], + }, + ]; try { return this.client.chats.create({ model: this.model, @@ -111,16 +122,7 @@ export class GeminiClient { ...this.generateContentConfig, tools, }, - history: [ - { - role: 'user', - parts: envParts, - }, - { - role: 'model', - parts: [{ text: 'Got it. Thanks for the context!' }], - }, - ], + history, }); } catch (error) { console.error('Error initializing Gemini chat session:', error); @@ -149,10 +151,15 @@ export class GeminiClient { break; } - // What do we do when we have both function responses and confirmations? const fnResponses = turn.getFunctionResponses(); if (fnResponses.length === 0) { - break; // user's turn to respond + const nextSpeakerCheck = await checkNextSpeaker(chat, this); + if (nextSpeakerCheck?.next_speaker === 'model') { + request = [{ text: 'Please continue.' 
}]; + continue; + } else { + break; + } } request = fnResponses; } @@ -167,7 +174,7 @@ export class GeminiClient { ): Promise> { try { const result = await this.client.models.generateContent({ - model: this.model, + model: 'gemini-2.0-flash', config: { ...this.generateContentConfig, systemInstruction: getCoreSystemPrompt(), diff --git a/packages/server/src/utils/nextSpeakerChecker.test.ts b/packages/server/src/utils/nextSpeakerChecker.test.ts new file mode 100644 index 00000000..b8d17875 --- /dev/null +++ b/packages/server/src/utils/nextSpeakerChecker.test.ts @@ -0,0 +1,186 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { describe, it, expect, vi, beforeEach, Mock, afterEach } from 'vitest'; +import { Chat, Content } from '@google/genai'; +import { GeminiClient } from '../core/client.js'; +import { Config } from '../config/config.js'; // Added Config import +import { checkNextSpeaker, NextSpeakerResponse } from './nextSpeakerChecker.js'; + +// Mock GeminiClient and Config constructor +vi.mock('../core/client.js'); +vi.mock('../config/config.js'); + +// Mock @google/genai +const mockGetHistory = vi.fn(); +const mockCreateChat = vi.fn(() => ({ + getHistory: mockGetHistory, +})); + +vi.mock('@google/genai', async () => { + const actualGenAI = + await vi.importActual('@google/genai'); + return { + ...actualGenAI, + GoogleGenAI: vi.fn().mockImplementation(() => ({ + chats: { + create: mockCreateChat, + }, + })), + // Keep Chat constructor mock for type safety if direct instantiation is attempted, + // but primary path is via client.chats.create + Chat: vi.fn().mockImplementation(() => ({ + getHistory: mockGetHistory, + })), + }; +}); + +describe('checkNextSpeaker', () => { + let mockChat: Chat; + let mockGeminiClient: GeminiClient; + let MockConfig: Mock; + + beforeEach(() => { + // Dynamically import and assign the mock + // Must be done within beforeEach or test to ensure mocks are reset + MockConfig = 
vi.mocked(Config); + // Create a mock instance of Config + const mockConfigInstance = new MockConfig( + 'test-api-key', + 'gemini-pro', + false, + '.', + false, + undefined, + false, + undefined, + undefined, + undefined, + ); + // Mock any methods on mockConfigInstance if needed, e.g., mockConfigInstance.getToolRegistry = vi.fn()... + + mockGeminiClient = new GeminiClient(mockConfigInstance); + // Simulate chat creation as done in GeminiClient + mockChat = { getHistory: mockGetHistory } as unknown as Chat; + }); + + afterEach(() => { + vi.clearAllMocks(); + }); + + it('should return null if history is empty', async () => { + (mockChat.getHistory as Mock).mockResolvedValue([]); + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toBeNull(); + expect(mockGeminiClient.generateJson).not.toHaveBeenCalled(); + }); + + it('should return null if the last speaker was the user', async () => { + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'user', parts: [{ text: 'Hello' }] }, + ] as Content[]); + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toBeNull(); + expect(mockGeminiClient.generateJson).not.toHaveBeenCalled(); + }); + + it("should return { next_speaker: 'model' } when model intends to continue", async () => { + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'model', parts: [{ text: 'I will now do something.' 
}] }, + ] as Content[]); + const mockApiResponse: NextSpeakerResponse = { + reasoning: 'Model stated it will do something.', + next_speaker: 'model', + }; + (mockGeminiClient.generateJson as Mock).mockResolvedValue(mockApiResponse); + + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toEqual(mockApiResponse); + expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1); + }); + + it("should return { next_speaker: 'user' } when model asks a question", async () => { + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'model', parts: [{ text: 'What would you like to do?' }] }, + ] as Content[]); + const mockApiResponse: NextSpeakerResponse = { + reasoning: 'Model asked a question.', + next_speaker: 'user', + }; + (mockGeminiClient.generateJson as Mock).mockResolvedValue(mockApiResponse); + + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toEqual(mockApiResponse); + }); + + it("should return { next_speaker: 'user' } when model makes a statement", async () => { + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'model', parts: [{ text: 'This is a statement.' }] }, + ] as Content[]); + const mockApiResponse: NextSpeakerResponse = { + reasoning: 'Model made a statement, awaiting user input.', + next_speaker: 'user', + }; + (mockGeminiClient.generateJson as Mock).mockResolvedValue(mockApiResponse); + + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toEqual(mockApiResponse); + }); + + it('should return null if geminiClient.generateJson throws an error', async () => { + const consoleWarnSpy = vi + .spyOn(console, 'warn') + .mockImplementation(() => {}); + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'model', parts: [{ text: 'Some model output.' 
}] }, + ] as Content[]); + (mockGeminiClient.generateJson as Mock).mockRejectedValue( + new Error('API Error'), + ); + + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toBeNull(); + consoleWarnSpy.mockRestore(); + }); + + it('should return null if geminiClient.generateJson returns invalid JSON (missing next_speaker)', async () => { + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'model', parts: [{ text: 'Some model output.' }] }, + ] as Content[]); + (mockGeminiClient.generateJson as Mock).mockResolvedValue({ + reasoning: 'This is incomplete.', + } as unknown as NextSpeakerResponse); // Type assertion to simulate invalid response + + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toBeNull(); + }); + + it('should return null if geminiClient.generateJson returns a non-string next_speaker', async () => { + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'model', parts: [{ text: 'Some model output.' }] }, + ] as Content[]); + (mockGeminiClient.generateJson as Mock).mockResolvedValue({ + reasoning: 'Model made a statement, awaiting user input.', + next_speaker: 123, // Invalid type + } as unknown as NextSpeakerResponse); + + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toBeNull(); + }); + + it('should return null if geminiClient.generateJson returns an invalid next_speaker string value', async () => { + (mockChat.getHistory as Mock).mockResolvedValue([ + { role: 'model', parts: [{ text: 'Some model output.' 
}] }, + ] as Content[]); + (mockGeminiClient.generateJson as Mock).mockResolvedValue({ + reasoning: 'Model made a statement, awaiting user input.', + next_speaker: 'neither', // Invalid enum value + } as unknown as NextSpeakerResponse); + + const result = await checkNextSpeaker(mockChat, mockGeminiClient); + expect(result).toBeNull(); + }); +}); diff --git a/packages/server/src/utils/nextSpeakerChecker.ts b/packages/server/src/utils/nextSpeakerChecker.ts new file mode 100644 index 00000000..f852879f --- /dev/null +++ b/packages/server/src/utils/nextSpeakerChecker.ts @@ -0,0 +1,97 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { Chat, Content, SchemaUnion, Type } from '@google/genai'; +import { GeminiClient } from '../core/client.js'; + +const CHECK_PROMPT = `Analyze *only* the content and structure of your immediately preceding response (your last turn in the conversation history). Based *strictly* on that response, determine who should logically speak next: the 'user' or the 'model' (you). +**Decision Rules (apply in order):** +1. **Model Continues:** If your last response explicitly states an immediate next action *you* intend to take (e.g., "Next, I will...", "Now I'll process...", "Moving on to analyze...", indicates an intended tool call that didn't execute), OR if the response seems clearly incomplete (cut off mid-thought without a natural conclusion), then the **'model'** should speak next. +2. **Question to User:** If your last response ends with a direct question specifically addressed *to the user*, then the **'user'** should speak next. +3. **Waiting for User:** If your last response completed a thought, statement, or task *and* does not meet the criteria for Rule 1 (Model Continues) or Rule 2 (Question to User), it implies a pause expecting user input or reaction. In this case, the **'user'** should speak next. +**Output Format:** +Respond *only* in JSON format according to the following schema. 
Do not include any text outside the JSON structure.
+\`\`\`json
+{
+  "type": "object",
+  "properties": {
+    "reasoning": {
+      "type": "string",
+      "description": "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn."
+    },
+    "next_speaker": {
+      "type": "string",
+      "enum": ["user", "model"],
+      "description": "Who should speak next based *only* on the preceding turn and the decision rules."
+    }
+  },
+  "required": ["next_speaker", "reasoning"]
+}
+\`\`\`
+`;
+
+const RESPONSE_SCHEMA: SchemaUnion = {
+  type: Type.OBJECT,
+  properties: {
+    reasoning: {
+      type: Type.STRING,
+      description:
+        "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn.",
+    },
+    next_speaker: {
+      type: Type.STRING,
+      enum: ['user', 'model'],
+      description:
+        'Who should speak next based *only* on the preceding turn and the decision rules',
+    },
+  },
+  required: ['reasoning', 'next_speaker'],
+};
+
+export interface NextSpeakerResponse {
+  reasoning: string;
+  next_speaker: 'user' | 'model';
+}
+
+export async function checkNextSpeaker(
+  chat: Chat,
+  geminiClient: GeminiClient,
+): Promise<NextSpeakerResponse | null> {
+  const history = await chat.getHistory();
+  // Ensure there's a model response to analyze
+  if (history.length === 0 || history[history.length - 1].role !== 'model') {
+    // Cannot determine next speaker if the last turn wasn't from the model
+    // or if history is empty.
+    return null;
+  }
+
+  const contents: Content[] = [
+    ...history,
+    { role: 'user', parts: [{ text: CHECK_PROMPT }] },
+  ];
+
+  try {
+    const parsedResponse = (await geminiClient.generateJson(
+      contents,
+      RESPONSE_SCHEMA,
+    )) as unknown as NextSpeakerResponse;
+
+    if (
+      parsedResponse &&
+      parsedResponse.next_speaker &&
+      ['user', 'model'].includes(parsedResponse.next_speaker)
+    ) {
+      return parsedResponse;
+    }
+    return null;
+  } catch (error) {
+    console.warn(
+      'Failed to talk to Gemini endpoint when seeing if conversation should continue.',
+      error,
+    );
+    return null;
+  }
+}