Don't prematurely end convo w/ Gemini.

- There seems to be a root model bug where the model will preemptively bail on conversations without trying harder. Typically the stops are VERY obvious and bug-looking where you need to prmopt the model to "continue". - This PR attempts to fix the above by running a 2.0-flash request (don't need somethign more powerful) at the end of every full interaction to see who should speak (user or model). - Add tests for nextSpeakerChecker Fixes https://b.corp.google.com/issues/416826051
2025-05-10 13:11:03 -07:00 · 2025-05-10 13:11:03 -07:00 · d159a1507e
parent c0eab31c02
commit d159a1507e
3 changed files with 303 additions and 13 deletions
--- a/packages/server/src/core/client.ts
+++ b/packages/server/src/core/client.ts
@ -21,6 +21,7 @@ import { Config } from '../config/config.js';
 import { getCoreSystemPrompt } from './prompts.js';
 import { ReadManyFilesTool } from '../tools/read-many-files.js';
 import { getResponseText } from '../utils/generateContentResponseUtilities.js';
+import { checkNextSpeaker } from '../utils/nextSpeakerChecker.js';

 export class GeminiClient {
  private client: GoogleGenAI;
@ -103,15 +104,7 @@ export class GeminiClient {
      .getToolRegistry()
      .getFunctionDeclarations();
    const tools: Tool[] = [{ functionDeclarations: toolDeclarations }];
-    try {
-      return this.client.chats.create({
-        model: this.model,
-        config: {
-          systemInstruction: getCoreSystemPrompt(),
-          ...this.generateContentConfig,
-          tools,
-        },
-        history: [
+    const history: Content[] = [
      {
        role: 'user',
        parts: envParts,
@ -120,7 +113,16 @@ export class GeminiClient {
        role: 'model',
        parts: [{ text: 'Got it. Thanks for the context!' }],
      },
-        ],
+    ];
+    try {
+      return this.client.chats.create({
+        model: this.model,
+        config: {
+          systemInstruction: getCoreSystemPrompt(),
+          ...this.generateContentConfig,
+          tools,
+        },
+        history,
      });
    } catch (error) {
      console.error('Error initializing Gemini chat session:', error);
@ -149,10 +151,15 @@ export class GeminiClient {
        break;
      }

-      // What do we do when we have both function responses and confirmations?
      const fnResponses = turn.getFunctionResponses();
      if (fnResponses.length === 0) {
-        break; // user's turn to respond
+        const nextSpeakerCheck = await checkNextSpeaker(chat, this);
+        if (nextSpeakerCheck?.next_speaker === 'model') {
+          request = [{ text: 'Please continue.' }];
+          continue;
+        } else {
+          break;
+        }
      }
      request = fnResponses;
    }
@ -167,7 +174,7 @@ export class GeminiClient {
  ): Promise<Record<string, unknown>> {
    try {
      const result = await this.client.models.generateContent({
-        model: this.model,
+        model: 'gemini-2.0-flash',
        config: {
          ...this.generateContentConfig,
          systemInstruction: getCoreSystemPrompt(),
--- a/packages/server/src/utils/nextSpeakerChecker.test.ts
+++ b/packages/server/src/utils/nextSpeakerChecker.test.ts
@ -0,0 +1,186 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, it, expect, vi, beforeEach, Mock, afterEach } from 'vitest';
+import { Chat, Content } from '@google/genai';
+import { GeminiClient } from '../core/client.js';
+import { Config } from '../config/config.js'; // Added Config import
+import { checkNextSpeaker, NextSpeakerResponse } from './nextSpeakerChecker.js';
+
+// Mock GeminiClient and Config constructor
+vi.mock('../core/client.js');
+vi.mock('../config/config.js');
+
+// Mock @google/genai
+const mockGetHistory = vi.fn();
+const mockCreateChat = vi.fn(() => ({
+  getHistory: mockGetHistory,
+}));
+
+vi.mock('@google/genai', async () => {
+  const actualGenAI =
+    await vi.importActual<typeof import('@google/genai')>('@google/genai');
+  return {
+    ...actualGenAI,
+    GoogleGenAI: vi.fn().mockImplementation(() => ({
+      chats: {
+        create: mockCreateChat,
+      },
+    })),
+    // Keep Chat constructor mock for type safety if direct instantiation is attempted,
+    // but primary path is via client.chats.create
+    Chat: vi.fn().mockImplementation(() => ({
+      getHistory: mockGetHistory,
+    })),
+  };
+});
+
+describe('checkNextSpeaker', () => {
+  let mockChat: Chat;
+  let mockGeminiClient: GeminiClient;
+  let MockConfig: Mock;
+
+  beforeEach(() => {
+    // Dynamically import and assign the mock
+    // Must be done within beforeEach or test to ensure mocks are reset
+    MockConfig = vi.mocked(Config);
+    // Create a mock instance of Config
+    const mockConfigInstance = new MockConfig(
+      'test-api-key',
+      'gemini-pro',
+      false,
+      '.',
+      false,
+      undefined,
+      false,
+      undefined,
+      undefined,
+      undefined,
+    );
+    // Mock any methods on mockConfigInstance if needed, e.g., mockConfigInstance.getToolRegistry = vi.fn()...
+
+    mockGeminiClient = new GeminiClient(mockConfigInstance);
+    // Simulate chat creation as done in GeminiClient
+    mockChat = { getHistory: mockGetHistory } as unknown as Chat;
+  });
+
+  afterEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('should return null if history is empty', async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([]);
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toBeNull();
+    expect(mockGeminiClient.generateJson).not.toHaveBeenCalled();
+  });
+
+  it('should return null if the last speaker was the user', async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'user', parts: [{ text: 'Hello' }] },
+    ] as Content[]);
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toBeNull();
+    expect(mockGeminiClient.generateJson).not.toHaveBeenCalled();
+  });
+
+  it("should return { next_speaker: 'model' } when model intends to continue", async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'model', parts: [{ text: 'I will now do something.' }] },
+    ] as Content[]);
+    const mockApiResponse: NextSpeakerResponse = {
+      reasoning: 'Model stated it will do something.',
+      next_speaker: 'model',
+    };
+    (mockGeminiClient.generateJson as Mock).mockResolvedValue(mockApiResponse);
+
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toEqual(mockApiResponse);
+    expect(mockGeminiClient.generateJson).toHaveBeenCalledTimes(1);
+  });
+
+  it("should return { next_speaker: 'user' } when model asks a question", async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'model', parts: [{ text: 'What would you like to do?' }] },
+    ] as Content[]);
+    const mockApiResponse: NextSpeakerResponse = {
+      reasoning: 'Model asked a question.',
+      next_speaker: 'user',
+    };
+    (mockGeminiClient.generateJson as Mock).mockResolvedValue(mockApiResponse);
+
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toEqual(mockApiResponse);
+  });
+
+  it("should return { next_speaker: 'user' } when model makes a statement", async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'model', parts: [{ text: 'This is a statement.' }] },
+    ] as Content[]);
+    const mockApiResponse: NextSpeakerResponse = {
+      reasoning: 'Model made a statement, awaiting user input.',
+      next_speaker: 'user',
+    };
+    (mockGeminiClient.generateJson as Mock).mockResolvedValue(mockApiResponse);
+
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toEqual(mockApiResponse);
+  });
+
+  it('should return null if geminiClient.generateJson throws an error', async () => {
+    const consoleWarnSpy = vi
+      .spyOn(console, 'warn')
+      .mockImplementation(() => {});
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'model', parts: [{ text: 'Some model output.' }] },
+    ] as Content[]);
+    (mockGeminiClient.generateJson as Mock).mockRejectedValue(
+      new Error('API Error'),
+    );
+
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toBeNull();
+    consoleWarnSpy.mockRestore();
+  });
+
+  it('should return null if geminiClient.generateJson returns invalid JSON (missing next_speaker)', async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'model', parts: [{ text: 'Some model output.' }] },
+    ] as Content[]);
+    (mockGeminiClient.generateJson as Mock).mockResolvedValue({
+      reasoning: 'This is incomplete.',
+    } as unknown as NextSpeakerResponse); // Type assertion to simulate invalid response
+
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toBeNull();
+  });
+
+  it('should return null if geminiClient.generateJson returns a non-string next_speaker', async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'model', parts: [{ text: 'Some model output.' }] },
+    ] as Content[]);
+    (mockGeminiClient.generateJson as Mock).mockResolvedValue({
+      reasoning: 'Model made a statement, awaiting user input.',
+      next_speaker: 123, // Invalid type
+    } as unknown as NextSpeakerResponse);
+
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toBeNull();
+  });
+
+  it('should return null if geminiClient.generateJson returns an invalid next_speaker string value', async () => {
+    (mockChat.getHistory as Mock).mockResolvedValue([
+      { role: 'model', parts: [{ text: 'Some model output.' }] },
+    ] as Content[]);
+    (mockGeminiClient.generateJson as Mock).mockResolvedValue({
+      reasoning: 'Model made a statement, awaiting user input.',
+      next_speaker: 'neither', // Invalid enum value
+    } as unknown as NextSpeakerResponse);
+
+    const result = await checkNextSpeaker(mockChat, mockGeminiClient);
+    expect(result).toBeNull();
+  });
+});
--- a/packages/server/src/utils/nextSpeakerChecker.ts
+++ b/packages/server/src/utils/nextSpeakerChecker.ts
@ -0,0 +1,97 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { Chat, Content, SchemaUnion, Type } from '@google/genai';
+import { GeminiClient } from '../core/client.js';
+
+const CHECK_PROMPT = `Analyze *only* the content and structure of your immediately preceding response (your last turn in the conversation history). Based *strictly* on that response, determine who should logically speak next: the 'user' or the 'model' (you).
+**Decision Rules (apply in order):**
+1.  **Model Continues:** If your last response explicitly states an immediate next action *you* intend to take (e.g., "Next, I will...", "Now I'll process...", "Moving on to analyze...", indicates an intended tool call that didn't execute), OR if the response seems clearly incomplete (cut off mid-thought without a natural conclusion), then the **'model'** should speak next.
+2.  **Question to User:** If your last response ends with a direct question specifically addressed *to the user*, then the **'user'** should speak next.
+3.  **Waiting for User:** If your last response completed a thought, statement, or task *and* does not meet the criteria for Rule 1 (Model Continues) or Rule 2 (Question to User), it implies a pause expecting user input or reaction. In this case, the **'user'** should speak next.
+**Output Format:**
+Respond *only* in JSON format according to the following schema. Do not include any text outside the JSON structure.
+\`\`\`json
+{
+  "type": "object",
+  "properties": {
+    "reasoning": {
+        "type": "string",
+        "description": "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn."
+    },
+    "next_speaker": {
+      "type": "string",
+      "enum": ["user", "model"],
+      "description": "Who should speak next based *only* on the preceding turn and the decision rules."
+    }
+  },
+  "required": ["next_speaker", "reasoning"]
+}
+\`\`\`
+`;
+
+const RESPONSE_SCHEMA: SchemaUnion = {
+  type: Type.OBJECT,
+  properties: {
+    reasoning: {
+      type: Type.STRING,
+      description:
+        "Brief explanation justifying the 'next_speaker' choice based *strictly* on the applicable rule and the content/structure of the preceding turn.",
+    },
+    next_speaker: {
+      type: Type.STRING,
+      enum: ['user', 'model'],
+      description:
+        'Who should speak next based *only* on the preceding turn and the decision rules',
+    },
+  },
+  required: ['reasoning', 'next_speaker'],
+};
+
+export interface NextSpeakerResponse {
+  reasoning: string;
+  next_speaker: 'user' | 'model';
+}
+
+export async function checkNextSpeaker(
+  chat: Chat,
+  geminiClient: GeminiClient,
+): Promise<NextSpeakerResponse | null> {
+  const history = await chat.getHistory();
+  // Ensure there's a model response to analyze
+  if (history.length === 0 || history[history.length - 1].role !== 'model') {
+    // Cannot determine next speaker if the last turn wasn't from the model
+    // or if history is empty.
+    return null;
+  }
+
+  const contents: Content[] = [
+    ...history,
+    { role: 'user', parts: [{ text: CHECK_PROMPT }] },
+  ];
+
+  try {
+    const parsedResponse = (await geminiClient.generateJson(
+      contents,
+      RESPONSE_SCHEMA,
+    )) as unknown as NextSpeakerResponse;
+
+    if (
+      parsedResponse &&
+      parsedResponse.next_speaker &&
+      ['user', 'model'].includes(parsedResponse.next_speaker)
+    ) {
+      return parsedResponse;
+    }
+    return null;
+  } catch (error) {
+    console.warn(
+      'Failed to talk to Gemiin endpoint when seeing if conversation should continue.',
+      error,
+    );
+    return null;
+  }
+}