Remove auto-execution on Flash in the event of a 429/Quota failover (#3662)

Co-authored-by: Jenna Inouye <jinouye@google.com>
2025-07-09 13:55:56 -04:00 · 2025-07-09 13:55:56 -04:00 · 8a6509ffeb
parent 01e756481f
commit 8a6509ffeb
14 changed files with 292 additions and 86 deletions
--- a/packages/cli/src/ui/App.tsx
+++ b/packages/cli/src/ui/App.tsx
@ -70,6 +70,7 @@ import { UpdateNotification } from './components/UpdateNotification.js';
 import {
  isProQuotaExceededError,
  isGenericQuotaExceededError,
  UserTierId,
 } from '@google/gemini-cli-core';
 import { checkForUpdates } from './utils/updateCheck.js';
 import ansiEscapes from 'ansi-escapes';
@ -136,6 +137,8 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
  const ctrlDTimerRef = useRef<NodeJS.Timeout | null>(null);
  const [constrainHeight, setConstrainHeight] = useState<boolean>(true);
  const [showPrivacyNotice, setShowPrivacyNotice] = useState<boolean>(false);
  const [modelSwitchedFromQuotaError, setModelSwitchedFromQuotaError] =
    useState<boolean>(false);
  const openPrivacyNotice = useCallback(() => {
    setShowPrivacyNotice(true);
@ -251,23 +254,51 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
    ): Promise<boolean> => {
      let message: string;
      // For quota errors, assume FREE tier (safe default) - only show upgrade messaging to free tier users
      // TODO: Get actual user tier from config when available
      const userTier = undefined; // Defaults to FREE tier behavior
      const isPaidTier =
        userTier === UserTierId.LEGACY || userTier === UserTierId.STANDARD;
      // Check if this is a Pro quota exceeded error
      if (error && isProQuotaExceededError(error)) {
        if (isPaidTier) {
          message = `⚡ You have reached your daily ${currentModel} quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
        } else {
          message = `⚡ You have reached your daily ${currentModel} quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
 ⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
 ⚡ You can switch authentication methods by typing /auth`;
        }
      } else if (error && isGenericQuotaExceededError(error)) {
        if (isPaidTier) {
          message = `⚡ You have reached your daily quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
        } else {
          message = `⚡ You have reached your daily quota limit.
 ⚡ Automatically switching from ${currentModel} to ${fallbackModel} for the remainder of this session.
 ⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
 ⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
 ⚡ You can switch authentication methods by typing /auth`;
        }
      } else {
        if (isPaidTier) {
          // Default fallback message for other cases (like consecutive 429s)
          message = `⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.
 ⚡ Possible reasons for this are that you have received multiple consecutive capacity errors or you have reached your daily ${currentModel} quota limit
 ⚡ To continue accessing the ${currentModel} model today, consider using /auth to switch to using a paid API key from AI Studio at https://aistudio.google.com/apikey`;
        } else {
          // Default fallback message for other cases (like consecutive 429s)
-        message = `⚡ Slow response times detected.
+          message = `⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.  
-⚡ Automatically switching from ${currentModel} to ${fallbackModel} for faster responses for the remainder of this session.`;
+⚡ Possible reasons for this are that you have received multiple consecutive capacity errors or you have reached your daily ${currentModel} quota limit
 ⚡ To increase your limits, upgrade to a Gemini Code Assist Standard or Enterprise plan with higher limits at https://goo.gle/set-up-gemini-code-assist
 ⚡ Or you can utilize a Gemini API Key. See: https://goo.gle/gemini-cli-docs-auth#gemini-api-key
 ⚡ You can switch authentication methods by typing /auth`;
        }
      }
      // Add message to UI history
@ -278,7 +309,14 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
        },
        Date.now(),
      );
-      return true; // Always accept the fallback
+
      // Set the flag to prevent tool continuation
      setModelSwitchedFromQuotaError(true);
      // Set global quota error flag to prevent Flash model calls
      config.setQuotaErrorOccurred(true);
      // Switch model for future use but return false to stop current retry
      config.setModel(fallbackModel);
      return false; // Don't continue with current prompt
    };
    config.setFlashFallbackHandler(flashFallbackHandler);
@ -445,6 +483,8 @@ const App = ({ config, settings, startupWarnings = [] }: AppProps) => {
    getPreferredEditor,
    onAuthError,
    performMemoryRefresh,
    modelSwitchedFromQuotaError,
    setModelSwitchedFromQuotaError,
  );
  pendingHistoryItems.push(...pendingGeminiHistoryItems);
  const { elapsedTime, currentLoadingPhrase } =
--- a/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
+++ b/packages/cli/src/ui/hooks/useGeminiStream.test.tsx
@ -301,6 +301,8 @@ describe('useGeminiStream', () => {
      getUsageStatisticsEnabled: () => true,
      getDebugMode: () => false,
      addHistory: vi.fn(),
      setQuotaErrorOccurred: vi.fn(),
      getQuotaErrorOccurred: vi.fn(() => false),
    } as unknown as Config;
    mockOnDebugMessage = vi.fn();
    mockHandleSlashCommand = vi.fn().mockResolvedValue(false);
@ -386,6 +388,8 @@ describe('useGeminiStream', () => {
          () => 'vscode' as EditorType,
          () => {},
          () => Promise.resolve(),
          false,
          () => {},
        );
      },
      {
@ -518,6 +522,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
        false,
        () => {},
      ),
    );
@ -582,6 +588,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
        false,
        () => {},
      ),
    );
@ -675,6 +683,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
        false,
        () => {},
      ),
    );
@ -775,6 +785,8 @@ describe('useGeminiStream', () => {
        () => 'vscode' as EditorType,
        () => {},
        () => Promise.resolve(),
        false,
        () => {},
      ),
    );
@ -1063,6 +1075,8 @@ describe('useGeminiStream', () => {
          () => 'vscode' as EditorType,
          () => {},
          mockPerformMemoryRefresh,
          false,
          () => {},
        ),
      );
@ -1113,6 +1127,8 @@ describe('useGeminiStream', () => {
          () => 'vscode' as EditorType,
          () => {},
          () => Promise.resolve(),
          false,
          () => {},
        ),
      );
--- a/packages/cli/src/ui/hooks/useGeminiStream.ts
+++ b/packages/cli/src/ui/hooks/useGeminiStream.ts
@ -90,6 +90,8 @@ export const useGeminiStream = (
  getPreferredEditor: () => EditorType | undefined,
  onAuthError: () => void,
  performMemoryRefresh: () => Promise<void>,
  modelSwitchedFromQuotaError: boolean,
  setModelSwitchedFromQuotaError: React.Dispatch<React.SetStateAction<boolean>>,
 ) => {
  const [initError, setInitError] = useState<string | null>(null);
  const abortControllerRef = useRef<AbortController | null>(null);
@ -494,6 +496,12 @@ export const useGeminiStream = (
      const userMessageTimestamp = Date.now();
      setShowHelp(false);
      // Reset quota error flag when starting a new query (not a continuation)
      if (!options?.isContinuation) {
        setModelSwitchedFromQuotaError(false);
        config.setQuotaErrorOccurred(false);
      }
      abortControllerRef.current = new AbortController();
      const abortSignal = abortControllerRef.current.signal;
      turnCancelledRef.current = false;
@ -552,6 +560,7 @@ export const useGeminiStream = (
    [
      streamingState,
      setShowHelp,
      setModelSwitchedFromQuotaError,
      prepareQueryForGemini,
      processGeminiStreamEvents,
      pendingHistoryItemRef,
@ -668,6 +677,12 @@ export const useGeminiStream = (
      );
      markToolsAsSubmitted(callIdsToMarkAsSubmitted);
      // Don't continue if model was switched due to quota error
      if (modelSwitchedFromQuotaError) {
        return;
      }
      submitQuery(mergePartListUnions(responsesToSend), {
        isContinuation: true,
      });
@ -678,6 +693,7 @@ export const useGeminiStream = (
      markToolsAsSubmitted,
      geminiClient,
      performMemoryRefresh,
      modelSwitchedFromQuotaError,
    ],
  );
--- a/packages/cli/src/ui/utils/errorParsing.test.ts
+++ b/packages/cli/src/ui/utils/errorParsing.test.ts
@ -39,7 +39,7 @@ describe('parseAndFormatApiError', () => {
    );
    expect(result).toContain('[API Error: Rate limit exceeded');
    expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
    );
  });
@ -55,7 +55,7 @@ describe('parseAndFormatApiError', () => {
    );
    expect(result).toContain('[API Error: Rate limit exceeded');
    expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
    );
  });
@ -169,7 +169,7 @@ describe('parseAndFormatApiError', () => {
    );
    expect(result).toContain('[API Error: Rate limit exceeded');
    expect(result).toContain(
-      'Slow response times detected. Switching to the gemini-2.5-flash model',
+      'Possible quota limitations in place or slow response times detected. Switching to the gemini-2.5-flash model',
    );
    expect(result).not.toContain(
      'You have reached your daily gemini-2.5-pro quota limit',
@ -262,21 +262,17 @@ describe('parseAndFormatApiError', () => {
    );
  });
-  it('should handle different Gemini version strings in Pro quota exceeded errors', () => {
+  it('should handle different Gemini 2.5 version strings in Pro quota exceeded errors', () => {
-    const errorMessage15 =
+    const errorMessage25 =
-      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 1.5 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
+      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 2.5 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
    const errorMessagePreview =
      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini 2.5-preview Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
    const errorMessageBeta =
      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini beta-3.0 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
    const errorMessageExperimental =
      'got status: 429 Too Many Requests. {"error":{"code":429,"message":"Quota exceeded for quota metric \'Gemini experimental-v2 Pro Requests\' and limit \'RequestsPerDay\' of service \'generativelanguage.googleapis.com\' for consumer \'project_number:123456789\'.","status":"RESOURCE_EXHAUSTED"}}';
-    const result15 = parseAndFormatApiError(
+    const result25 = parseAndFormatApiError(
-      errorMessage15,
+      errorMessage25,
      AuthType.LOGIN_WITH_GOOGLE,
      undefined,
-      'gemini-1.5-pro',
+      'gemini-2.5-pro',
      DEFAULT_GEMINI_FLASH_MODEL,
    );
    const resultPreview = parseAndFormatApiError(
@ -286,45 +282,19 @@ describe('parseAndFormatApiError', () => {
      'gemini-2.5-preview-pro',
      DEFAULT_GEMINI_FLASH_MODEL,
    );
    const resultBeta = parseAndFormatApiError(
      errorMessageBeta,
      AuthType.LOGIN_WITH_GOOGLE,
      undefined,
      'gemini-beta-3.0-pro',
      DEFAULT_GEMINI_FLASH_MODEL,
    );
    const resultExperimental = parseAndFormatApiError(
      errorMessageExperimental,
      AuthType.LOGIN_WITH_GOOGLE,
      undefined,
      'gemini-experimental-v2-pro',
      DEFAULT_GEMINI_FLASH_MODEL,
    );
-    expect(result15).toContain(
+    expect(result25).toContain(
-      'You have reached your daily gemini-1.5-pro quota limit',
+      'You have reached your daily gemini-2.5-pro quota limit',
    );
    expect(resultPreview).toContain(
      'You have reached your daily gemini-2.5-preview-pro quota limit',
    );
-    expect(resultBeta).toContain(
+    expect(result25).toContain(
      'You have reached your daily gemini-beta-3.0-pro quota limit',
    );
    expect(resultExperimental).toContain(
      'You have reached your daily gemini-experimental-v2-pro quota limit',
    );
    expect(result15).toContain(
      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
    );
    expect(resultPreview).toContain(
      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
    );
    expect(resultBeta).toContain(
      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
    );
    expect(resultExperimental).toContain(
      'upgrade to a Gemini Code Assist Standard or Enterprise plan',
    );
  });
  it('should not match non-Pro models with similar version strings', () => {
@ -339,16 +309,6 @@ describe('parseAndFormatApiError', () => {
        "Quota exceeded for quota metric 'Gemini 2.5-preview Flash Requests' and limit",
      ),
    ).toBe(false);
    expect(
      isProQuotaExceededError(
        "Quota exceeded for quota metric 'Gemini beta-3.0 Flash Requests' and limit",
      ),
    ).toBe(false);
    expect(
      isProQuotaExceededError(
        "Quota exceeded for quota metric 'Gemini experimental-v2 Flash Requests' and limit",
      ),
    ).toBe(false);
    // Test other model types
    expect(
--- a/packages/cli/src/ui/utils/errorParsing.ts
+++ b/packages/cli/src/ui/utils/errorParsing.ts
@ -19,7 +19,7 @@ import {
 /**
  * Builds the notice telling the user that the session is switching to
  * `fallbackModel` because of possible quota limits or slow responses.
  * NOTE(review): presumably selected for free-tier Google-login users (inferred
  * from the name) — confirm against getRateLimitMessage.
  *
  * @param fallbackModel Model named in the notice; defaults to
  *   DEFAULT_GEMINI_FLASH_MODEL.
  * @returns The notice text, beginning with a newline.
  */
 const getRateLimitErrorMessageGoogleFree = (
  fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
 const getRateLimitErrorMessageGoogleProQuotaFree = (
  currentModel: string = DEFAULT_GEMINI_MODEL,
@ -34,7 +34,7 @@ const getRateLimitErrorMessageGoogleGenericQuotaFree = () =>
 /**
  * Builds the notice telling the user that the session is switching to
  * `fallbackModel`, followed by a thank-you line for choosing Gemini Code
  * Assist and the Gemini CLI.
  * NOTE(review): presumably selected for paid-tier Google-login users (inferred
  * from the name) — confirm against getRateLimitMessage.
  *
  * @param fallbackModel Model named in the notice; defaults to
  *   DEFAULT_GEMINI_FLASH_MODEL.
  * @returns The notice text, beginning with a newline.
  */
 const getRateLimitErrorMessageGooglePaid = (
  fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session. We appreciate you for choosing Gemini Code Assist and the Gemini CLI.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session. We appreciate you for choosing Gemini Code Assist and the Gemini CLI.`;
 const getRateLimitErrorMessageGoogleProQuotaPaid = (
  currentModel: string = DEFAULT_GEMINI_MODEL,
@ -53,7 +53,7 @@ const RATE_LIMIT_ERROR_MESSAGE_VERTEX =
 /**
  * Builds the generic notice telling the user that the session is switching to
  * `fallbackModel` because of possible quota limits or slow responses.
  * NOTE(review): presumably the fallback when no auth-specific message applies
  * (inferred from the name) — confirm against getRateLimitMessage.
  *
  * @param fallbackModel Model named in the notice; defaults to
  *   DEFAULT_GEMINI_FLASH_MODEL.
  * @returns The notice text, beginning with a newline.
  */
 const getRateLimitErrorMessageDefault = (
  fallbackModel: string = DEFAULT_GEMINI_FLASH_MODEL,
 ) =>
-  `\nSlow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
+  `\nPossible quota limitations in place or slow response times detected. Switching to the ${fallbackModel} model for the rest of this session.`;
 function getRateLimitMessage(
  authType?: AuthType,
--- a/packages/core/src/code_assist/server.ts
+++ b/packages/core/src/code_assist/server.ts
@ -31,7 +31,23 @@ import {
  toCountTokenRequest,
  toGenerateContentRequest,
 } from './converter.js';
-import { PassThrough } from 'node:stream';
+import { Readable } from 'node:stream';
 /**
  * Shape assumed for a non-stream response body when looking for a
  * server-provided error message. Every level is optional because the body is
  * only cast to this type, not validated.
  */
 interface ErrorData {
  error?: {
    message?: string;
  };
 }
 /**
  * Minimal view of the HTTP response that gets attached to a StreamError:
  * the numeric status and the raw, untyped body.
  * NOTE(review): the name suggests it mirrors the gaxios response object —
  * confirm it stays structurally compatible with what requests return here.
  */
 interface GaxiosResponse {
  status: number;
  data: unknown;
 }
 /**
  * Error raised when a streaming response body turns out not to be a readable
  * stream. It optionally carries the HTTP status and the originating response
  * so that code catching it can inspect them.
  */
 interface StreamError extends Error {
  status?: number;
  response?: GaxiosResponse;
 }
 /** HTTP options to be used in each of the requests. */
 export interface HttpOptions {
@ -177,8 +193,45 @@ export class CodeAssistServer implements ContentGenerator {
    });
    return (async function* (): AsyncGenerator<T> {
      // Normalize the response body to a Node.js readable stream so it can be
      // handed to readline. Anything that is not a stream is treated as an
      // error payload and turned into a thrown StreamError.
      let nodeStream: NodeJS.ReadableStream;
      if (res.data instanceof ReadableStream) {
        // Web ReadableStream: bridge it to a Node.js Readable.
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        nodeStream = Readable.fromWeb(res.data as any);
      } else if (
        res.data &&
        typeof (res.data as NodeJS.ReadableStream).on === 'function'
      ) {
        // Already a Node.js stream (duck-typed through its `on` method).
        nodeStream = res.data as NodeJS.ReadableStream;
      } else {
        // Not a stream, so this is most likely an error response. Prefer the
        // server's own message and fall back to a generic hint otherwise.
        let errorMessage =
          'Response data is not a readable stream. This may indicate a server error or quota issue.';
        const payload: unknown = res.data;
        if (typeof payload === 'string') {
          // Plain-text body. This must be tested before the object check: a
          // string never satisfies `typeof === 'object'`, so nesting the test
          // inside that branch made it unreachable.
          if (payload) {
            errorMessage = payload;
          }
        } else if (payload && typeof payload === 'object') {
          // Structured body: look for `error.message`.
          const errorData = payload as ErrorData;
          if (errorData.error?.message) {
            errorMessage = errorData.error.message;
          }
        }
        // Always attach status and response so that callers (e.g. retry logic)
        // can classify the failure from the HTTP context.
        const error: StreamError = new Error(errorMessage);
        error.status = res.status;
        error.response = res;
        throw error;
      }
      const rl = readline.createInterface({
-        input: res.data as PassThrough,
+        input: nodeStream,
        crlfDelay: Infinity, // Recognizes '\r\n' and '\n' as line breaks
      });
--- a/packages/core/src/config/config.ts
+++ b/packages/core/src/config/config.ts
@ -104,7 +104,7 @@ export type FlashFallbackHandler = (
  currentModel: string,
  fallbackModel: string,
  error?: unknown,
-) => Promise<boolean>;
+) => Promise<boolean | string | null>;
 export interface ConfigParameters {
  sessionId: string;
@ -183,6 +183,7 @@ export class Config {
  private readonly listExtensions: boolean;
  private readonly _activeExtensions: ActiveExtension[];
  flashFallbackHandler?: FlashFallbackHandler;
  private quotaErrorOccurred: boolean = false;
  constructor(params: ConfigParameters) {
    this.sessionId = params.sessionId;
@ -304,6 +305,14 @@ export class Config {
    this.flashFallbackHandler = handler;
  }
  /**
   * Sets the quota-error flag stored on this config instance.
   *
   * @param value `true` to record that a quota error occurred, `false` to
   *   clear the flag.
   */
  setQuotaErrorOccurred(value: boolean): void {
    this.quotaErrorOccurred = value;
  }
  /**
   * Reports the current value of the quota-error flag.
   *
   * @returns The value last passed to setQuotaErrorOccurred, or `false` if it
   *   was never set (the field's initial value).
   */
  getQuotaErrorOccurred(): boolean {
    return this.quotaErrorOccurred;
  }
  getEmbeddingModel(): string {
    return this.embeddingModel;
  }
--- a/packages/core/src/core/client.test.ts
+++ b/packages/core/src/core/client.test.ts
@ -178,6 +178,8 @@ describe('Gemini Client (client.ts)', () => {
        getProxy: vi.fn().mockReturnValue(undefined),
        getWorkingDir: vi.fn().mockReturnValue('/test/dir'),
        getFileService: vi.fn().mockReturnValue(fileService),
        getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
        setQuotaErrorOccurred: vi.fn(),
      };
      return mock as unknown as Config;
    });
@ -351,7 +353,7 @@ describe('Gemini Client (client.ts)', () => {
      await client.generateJson(contents, schema, abortSignal);
      expect(mockGenerateContentFn).toHaveBeenCalledWith({
-        model: DEFAULT_GEMINI_FLASH_MODEL,
+        model: 'test-model', // Should use current model from config
        config: {
          abortSignal,
          systemInstruction: getCoreSystemPrompt(''),
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@ -262,6 +262,7 @@ export class GeminiClient {
    request: PartListUnion,
    signal: AbortSignal,
    turns: number = this.MAX_TURNS,
    originalModel?: string,
  ): AsyncGenerator<ServerGeminiStreamEvent, Turn> {
    // Ensure turns never exceeds MAX_TURNS to prevent infinite loops
    const boundedTurns = Math.min(turns, this.MAX_TURNS);
@ -269,6 +270,9 @@ export class GeminiClient {
      return new Turn(this.getChat());
    }
    // Track the original model from the first call to detect model switching
    const initialModel = originalModel || this.config.getModel();
    const compressed = await this.tryCompressChat();
    if (compressed) {
      yield { type: GeminiEventType.ChatCompressed, value: compressed };
@ -279,6 +283,14 @@ export class GeminiClient {
      yield event;
    }
    if (!turn.pendingToolCalls.length && signal && !signal.aborted) {
      // Check if model was switched during the call (likely due to quota error)
      const currentModel = this.config.getModel();
      if (currentModel !== initialModel) {
        // Model was switched (likely due to quota error fallback)
        // Don't continue with recursive call to prevent unwanted Flash execution
        return turn;
      }
      const nextSpeakerCheck = await checkNextSpeaker(
        this.getChat(),
        this,
@ -288,7 +300,12 @@ export class GeminiClient {
        const nextRequest = [{ text: 'Please continue.' }];
        // This recursive call's events will be yielded out, but the final
        // turn object will be from the top-level call.
-        yield* this.sendMessageStream(nextRequest, signal, boundedTurns - 1);
+        yield* this.sendMessageStream(
          nextRequest,
          signal,
          boundedTurns - 1,
          initialModel,
        );
      }
    }
    return turn;
@ -298,9 +315,12 @@ export class GeminiClient {
    contents: Content[],
    schema: SchemaUnion,
    abortSignal: AbortSignal,
-    model: string = DEFAULT_GEMINI_FLASH_MODEL,
+    model?: string,
    config: GenerateContentConfig = {},
  ): Promise<Record<string, unknown>> {
    // Use current model from config instead of hardcoded Flash model
    const modelToUse =
      model || this.config.getModel() || DEFAULT_GEMINI_FLASH_MODEL;
    try {
      const userMemory = this.config.getUserMemory();
      const systemInstruction = getCoreSystemPrompt(userMemory);
@ -312,7 +332,7 @@ export class GeminiClient {
      const apiCall = () =>
        this.getContentGenerator().generateContent({
-          model,
+          model: modelToUse,
          config: {
            ...requestConfig,
            systemInstruction,
@ -585,10 +605,14 @@ export class GeminiClient {
          fallbackModel,
          error,
        );
-        if (accepted) {
+        if (accepted !== false && accepted !== null) {
          this.config.setModel(fallbackModel);
          return fallbackModel;
        }
        // Check if the model was switched manually in the handler
        if (this.config.getModel() === fallbackModel) {
          return null; // Model was switched but don't continue with current prompt
        }
      } catch (error) {
        console.warn('Flash fallback handler failed:', error);
      }
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@ -43,6 +43,8 @@ describe('GeminiChat', () => {
      }),
      getModel: vi.fn().mockReturnValue('gemini-pro'),
      setModel: vi.fn(),
      getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
      setQuotaErrorOccurred: vi.fn(),
      flashFallbackHandler: undefined,
    } as unknown as Config;
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@ -217,10 +217,14 @@ export class GeminiChat {
          fallbackModel,
          error,
        );
-        if (accepted) {
+        if (accepted !== false && accepted !== null) {
          this.config.setModel(fallbackModel);
          return fallbackModel;
        }
        // Check if the model was switched manually in the handler
        if (this.config.getModel() === fallbackModel) {
          return null; // Model was switched but don't continue with current prompt
        }
      } catch (error) {
        console.warn('Flash fallback handler failed:', error);
      }
@ -262,12 +266,25 @@ export class GeminiChat {
    let response: GenerateContentResponse;
    try {
-      const apiCall = () =>
+      const apiCall = () => {
-        this.contentGenerator.generateContent({
+        const modelToUse = this.config.getModel() || DEFAULT_GEMINI_FLASH_MODEL;
-          model: this.config.getModel() || DEFAULT_GEMINI_FLASH_MODEL,
+
        // Prevent Flash model calls immediately after quota error.
        // The model is resolved inside this closure, so every invocation by
        // retryWithBackoff sees whatever model is currently set on config. Once
        // the quota flag is set and that model is Flash, the request is refused
        // with an error instead of being sent.
        if (
          this.config.getQuotaErrorOccurred() &&
          modelToUse === DEFAULT_GEMINI_FLASH_MODEL
        ) {
          throw new Error(
            'Please submit a new query to continue with the Flash model.',
          );
        }
        return this.contentGenerator.generateContent({
          model: modelToUse,
          contents: requestContents,
          config: { ...this.generationConfig, ...params.config },
        });
      };
      response = await retryWithBackoff(apiCall, {
        shouldRetry: (error: Error) => {
@ -354,12 +371,25 @@ export class GeminiChat {
    const startTime = Date.now();
    try {
-      const apiCall = () =>
+      const apiCall = () => {
-        this.contentGenerator.generateContentStream({
+        const modelToUse = this.config.getModel();
-          model: this.config.getModel(),
+
        // Prevent Flash model calls immediately after quota error.
        // The model is resolved on each invocation of this closure; once the
        // quota flag is set and that model is Flash, the streaming request is
        // refused with an error instead of being sent.
        // NOTE(review): unlike the non-streaming apiCall, there is no
        // `|| DEFAULT_GEMINI_FLASH_MODEL` fallback here, so an empty model value
        // is passed through as-is and does not trip this guard — confirm that
        // is intended.
        if (
          this.config.getQuotaErrorOccurred() &&
          modelToUse === DEFAULT_GEMINI_FLASH_MODEL
        ) {
          throw new Error(
            'Please submit a new query to continue with the Flash model.',
          );
        }
        return this.contentGenerator.generateContentStream({
          model: modelToUse,
          contents: requestContents,
          config: { ...this.generationConfig, ...params.config },
        });
      };
      // Note: Retrying streams can be complex. If generateContentStream itself doesn't handle retries
      // for transient issues internally before yielding the async generator, this retry will re-initiate
--- a/packages/core/src/utils/editCorrector.test.ts
+++ b/packages/core/src/utils/editCorrector.test.ts
@ -214,6 +214,8 @@ describe('editCorrector', () => {
        setAlwaysSkipModificationConfirmation: vi.fn((skip: boolean) => {
          configParams.alwaysSkipModificationConfirmation = skip;
        }),
        getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
        setQuotaErrorOccurred: vi.fn(),
      } as unknown as Config;
      callCount = 0;
@ -654,6 +656,8 @@ describe('editCorrector', () => {
        setAlwaysSkipModificationConfirmation: vi.fn((skip: boolean) => {
          configParams.alwaysSkipModificationConfirmation = skip;
        }),
        getQuotaErrorOccurred: vi.fn().mockReturnValue(false),
        setQuotaErrorOccurred: vi.fn(),
      } as unknown as Config;
      callCount = 0;
--- a/packages/core/src/utils/quotaErrorDetection.ts
+++ b/packages/core/src/utils/quotaErrorDetection.ts
@ -41,14 +41,23 @@ export function isProQuotaExceededError(error: unknown): boolean {
  // Check for Pro quota exceeded errors by looking for the specific pattern
  // This will match patterns like:
  // - "Quota exceeded for quota metric 'Gemini 2.5 Pro Requests'"
-  // - "Quota exceeded for quota metric 'Gemini 1.5-preview Pro Requests'"
+  // - "Quota exceeded for quota metric 'Gemini 2.5-preview Pro Requests'"
  // - "Quota exceeded for quota metric 'Gemini beta-3.0 Pro Requests'"
  // - "Quota exceeded for quota metric 'Gemini experimental-v2 Pro Requests'"
  // We use string methods instead of regex to avoid ReDoS vulnerabilities
-  const checkMessage = (message: string): boolean =>
+  const checkMessage = (message: string): boolean => {
    // Pure predicate: true when the text contains both the Gemini quota-metric
    // prefix and the "Pro Requests'" suffix. It intentionally has no side
    // effects: the temporary [DEBUG] console output was removed, including the
    // JSON.stringify dump of the raw error, which throws a TypeError when the
    // error object contains circular references.
    return (
      message.includes("Quota exceeded for quota metric 'Gemini") &&
      message.includes("Pro Requests'")
    );
  };
  if (typeof error === 'string') {
    return checkMessage(error);
@ -62,6 +71,38 @@ export function isProQuotaExceededError(error: unknown): boolean {
    return checkMessage(error.error.message);
  }
  // Check if it's a Gaxios-style error that carries the server payload under
  // `response.data`. The payload may be a raw string or a parsed object of the
  // form { error: { message } }. Any other shape falls through to the final
  // `return false` below.
  if (error && typeof error === 'object' && 'response' in error) {
    const gaxiosError = error as {
      response?: {
        data?: unknown;
      };
    };
    const responseData = gaxiosError.response?.data;
    if (typeof responseData === 'string') {
      // An empty string simply fails the substring check.
      return checkMessage(responseData);
    }
    if (
      typeof responseData === 'object' &&
      responseData !== null &&
      'error' in responseData
    ) {
      const errorData = responseData as {
        error?: { message?: unknown };
      };
      const serverMessage = errorData.error?.message;
      // Only strings can be matched; treat anything else as "no message" so a
      // malformed payload cannot make this predicate throw.
      return checkMessage(typeof serverMessage === 'string' ? serverMessage : '');
    }
  }
  return false;
 }
--- a/packages/core/src/utils/retry.ts
+++ b/packages/core/src/utils/retry.ts
@ -18,7 +18,7 @@ export interface RetryOptions {
  onPersistent429?: (
    authType?: string,
    error?: unknown,
-  ) => Promise<string | null>;
+  ) => Promise<string | boolean | null>;
  authType?: string;
 }
@ -102,13 +102,16 @@ export async function retryWithBackoff<T>(
      ) {
        try {
          const fallbackModel = await onPersistent429(authType, error);
-          if (fallbackModel) {
+          if (fallbackModel !== false && fallbackModel !== null) {
            // Reset attempt counter and try with new model
            attempt = 0;
            consecutive429Count = 0;
            currentDelay = initialDelayMs;
            // With the model updated, we continue to the next attempt
            continue;
          } else {
            // Fallback handler returned null/false, meaning don't continue - stop retry process
            throw error;
          }
        } catch (fallbackError) {
          // If fallback fails, continue with original error
@ -126,13 +129,16 @@ export async function retryWithBackoff<T>(
      ) {
        try {
          const fallbackModel = await onPersistent429(authType, error);
-          if (fallbackModel) {
+          if (fallbackModel !== false && fallbackModel !== null) {
            // Reset attempt counter and try with new model
            attempt = 0;
            consecutive429Count = 0;
            currentDelay = initialDelayMs;
            // With the model updated, we continue to the next attempt
            continue;
          } else {
            // Fallback handler returned null/false, meaning don't continue - stop retry process
            throw error;
          }
        } catch (fallbackError) {
          // If fallback fails, continue with original error
@ -155,13 +161,16 @@ export async function retryWithBackoff<T>(
      ) {
        try {
          const fallbackModel = await onPersistent429(authType, error);
-          if (fallbackModel) {
+          if (fallbackModel !== false && fallbackModel !== null) {
            // Reset attempt counter and try with new model
            attempt = 0;
            consecutive429Count = 0;
            currentDelay = initialDelayMs;
            // With the model updated, we continue to the next attempt
            continue;
          } else {
            // Fallback handler returned null/false, meaning don't continue - stop retry process
            throw error;
          }
        } catch (fallbackError) {
          // If fallback fails, continue with original error