fix: character encoding issues in shell command processor (#1949)

Co-authored-by: Jacob Richman <jacob314@gmail.com> Co-authored-by: Sandy Tao <sandytao520@icloud.com>
2025-07-22 06:26:40 +08:00 · 2025-07-22 06:26:40 +08:00 · 12765eb775
parent 4c3532d2b3
commit 12765eb775
7 changed files with 696 additions and 9 deletions
--- a/package-lock.json
+++ b/package-lock.json
@ -3615,6 +3615,12 @@
        "node": ">=8"
      }
    },
+    "node_modules/chardet": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.0.tgz",
+      "integrity": "sha512-bNFETTG/pM5ryzQ9Ad0lJOTa6HWD/YsScAR3EnCPZRPlQh77JocYktSHOUHelyhm8IARL+o4c4F1bP5KVOjiRA==",
+      "license": "MIT"
+    },
    "node_modules/check-error": {
      "version": "2.1.1",
      "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz",
@ -11813,6 +11819,7 @@
        "@types/glob": "^8.1.0",
        "@types/html-to-text": "^9.0.4",
        "ajv": "^8.17.1",
+        "chardet": "^2.1.0",
        "diff": "^7.0.0",
        "dotenv": "^17.1.0",
        "glob": "^10.4.5",
--- a/packages/cli/src/ui/hooks/shellCommandProcessor.test.ts
+++ b/packages/cli/src/ui/hooks/shellCommandProcessor.test.ts
@ -19,9 +19,11 @@ vi.mock('os', () => ({
  default: {
    platform: () => 'linux',
    tmpdir: () => '/tmp',
+    homedir: () => '/home/user',
  },
  platform: () => 'linux',
  tmpdir: () => '/tmp',
+  homedir: () => '/home/user',
 }));
 vi.mock('@google/gemini-cli-core');
 vi.mock('../utils/textUtils.js', () => ({
--- a/packages/cli/src/ui/hooks/shellCommandProcessor.ts
+++ b/packages/cli/src/ui/hooks/shellCommandProcessor.ts
@ -5,14 +5,18 @@
 */

 import { spawn } from 'child_process';
-import { StringDecoder } from 'string_decoder';
+import { TextDecoder } from 'util';
 import {
  HistoryItemWithoutId,
  IndividualToolCallDisplay,
  ToolCallStatus,
 } from '../types.js';
 import { useCallback } from 'react';
-import { Config, GeminiClient } from '@google/gemini-cli-core';
+import {
+  Config,
+  GeminiClient,
+  getCachedEncodingForBuffer,
+} from '@google/gemini-cli-core';
 import { type PartListUnion } from '@google/genai';
 import { formatMemoryUsage } from '../utils/formatters.js';
 import { isBinary } from '../utils/textUtils.js';
@ -71,8 +75,8 @@ function executeShellCommand(
    });

    // Use decoders to handle multi-byte characters safely (for streaming output).
-    const stdoutDecoder = new StringDecoder('utf8');
-    const stderrDecoder = new StringDecoder('utf8');
+    let stdoutDecoder: TextDecoder | null = null;
+    let stderrDecoder: TextDecoder | null = null;

    let stdout = '';
    let stderr = '';
@ -85,6 +89,12 @@ function executeShellCommand(
    let sniffedBytes = 0;

    const handleOutput = (data: Buffer, stream: 'stdout' | 'stderr') => {
+      if (!stdoutDecoder || !stderrDecoder) {
+        const encoding = getCachedEncodingForBuffer(data);
+        stdoutDecoder = new TextDecoder(encoding);
+        stderrDecoder = new TextDecoder(encoding);
+      }
+
      outputChunks.push(data);

      if (streamToUi && sniffedBytes < MAX_SNIFF_SIZE) {
@ -101,8 +111,8 @@ function executeShellCommand(

      const decodedChunk =
        stream === 'stdout'
-          ? stdoutDecoder.write(data)
-          : stderrDecoder.write(data);
+          ? stdoutDecoder.decode(data, { stream: true })
+          : stderrDecoder.decode(data, { stream: true });
      if (stream === 'stdout') {
        stdout += stripAnsi(decodedChunk);
      } else {
@ -160,8 +170,12 @@ function executeShellCommand(
      abortSignal.removeEventListener('abort', abortHandler);

      // Handle any final bytes lingering in the decoders
-      stdout += stdoutDecoder.end();
-      stderr += stderrDecoder.end();
+      if (stdoutDecoder) {
+        stdout += stdoutDecoder.decode();
+      }
+      if (stderrDecoder) {
+        stderr += stderrDecoder.decode();
+      }

      const finalBuffer = Buffer.concat(outputChunks);

--- a/packages/core/package.json
+++ b/packages/core/package.json
@ -44,7 +44,8 @@
    "simple-git": "^3.28.0",
    "strip-ansi": "^7.1.0",
    "undici": "^7.10.0",
-    "ws": "^8.18.0"
+    "ws": "^8.18.0",
+    "chardet": "^2.1.0"
  },
  "devDependencies": {
    "@types/diff": "^7.0.2",
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@ -35,6 +35,7 @@ export * from './utils/editor.js';
 export * from './utils/quotaErrorDetection.js';
 export * from './utils/fileUtils.js';
 export * from './utils/retry.js';
+export * from './utils/systemEncoding.js';

 // Export services
 export * from './services/fileDiscoveryService.js';
--- a/packages/core/src/utils/systemEncoding.test.ts
+++ b/packages/core/src/utils/systemEncoding.test.ts
@ -0,0 +1,496 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { vi, describe, it, expect, beforeEach, afterEach } from 'vitest';
+import { execSync } from 'child_process';
+import * as os from 'os';
+import { detect as chardetDetect } from 'chardet';
+
+// Mock dependencies
+vi.mock('child_process');
+vi.mock('os');
+vi.mock('chardet');
+
+// Import the functions we want to test after refactoring
+import {
+  getCachedEncodingForBuffer,
+  getSystemEncoding,
+  windowsCodePageToEncoding,
+  detectEncodingFromBuffer,
+  resetEncodingCache,
+} from './systemEncoding.js';
+
+describe('Shell Command Processor - Encoding Functions', () => {
+  let consoleWarnSpy: ReturnType<typeof vi.spyOn>;
+  let mockedExecSync: ReturnType<typeof vi.mocked<typeof execSync>>;
+  let mockedOsPlatform: ReturnType<typeof vi.mocked<() => string>>;
+  let mockedChardetDetect: ReturnType<typeof vi.mocked<typeof chardetDetect>>;
+
+  beforeEach(() => { // Shared setup: fresh spies/mocks, empty cache, no locale variables
+    consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); // silence and record warnings
+    mockedExecSync = vi.mocked(execSync);
+    mockedOsPlatform = vi.mocked(os.platform);
+    mockedChardetDetect = vi.mocked(chardetDetect);
+
+    // Reset the module-level system-encoding cache so each test triggers fresh detection
+    resetEncodingCache();
+
+    // Clear the locale variables that getSystemEncoding reads. NOTE(review): the original values are not restored afterwards.
+    delete process.env.LC_ALL;
+    delete process.env.LC_CTYPE;
+    delete process.env.LANG;
+  });
+
+  afterEach(() => { // Shared teardown
+    vi.restoreAllMocks(); // undo the console.warn spy and any mock implementations set by the test
+    resetEncodingCache(); // do not leak a cached encoding into the next test
+  });
+
+  describe('windowsCodePageToEncoding', () => { // Pins the code page -> encoding name table
+    it('should map common Windows code pages correctly', () => {
+      expect(windowsCodePageToEncoding(437)).toBe('cp437');
+      expect(windowsCodePageToEncoding(850)).toBe('cp850');
+      expect(windowsCodePageToEncoding(65001)).toBe('utf-8');
+      expect(windowsCodePageToEncoding(1252)).toBe('windows-1252');
+      expect(windowsCodePageToEncoding(932)).toBe('shift_jis');
+      expect(windowsCodePageToEncoding(936)).toBe('gb2312');
+      expect(windowsCodePageToEncoding(949)).toBe('euc-kr');
+      expect(windowsCodePageToEncoding(950)).toBe('big5');
+      expect(windowsCodePageToEncoding(1200)).toBe('utf-16le');
+      expect(windowsCodePageToEncoding(1201)).toBe('utf-16be');
+    });
+
+    it('should return null for unmapped code pages and warn', () => {
+      expect(windowsCodePageToEncoding(99999)).toBe(null);
+      expect(consoleWarnSpy).toHaveBeenCalledWith( // exact warning text emitted for an unmapped page
+        'Unable to determine encoding for windows code page 99999.',
+      );
+    });
+
+    it('should handle all Windows-specific code pages', () => {
+      expect(windowsCodePageToEncoding(874)).toBe('windows-874');
+      expect(windowsCodePageToEncoding(1250)).toBe('windows-1250');
+      expect(windowsCodePageToEncoding(1251)).toBe('windows-1251');
+      expect(windowsCodePageToEncoding(1253)).toBe('windows-1253');
+      expect(windowsCodePageToEncoding(1254)).toBe('windows-1254');
+      expect(windowsCodePageToEncoding(1255)).toBe('windows-1255');
+      expect(windowsCodePageToEncoding(1256)).toBe('windows-1256');
+      expect(windowsCodePageToEncoding(1257)).toBe('windows-1257');
+      expect(windowsCodePageToEncoding(1258)).toBe('windows-1258');
+    });
+  });
+
+  describe('detectEncodingFromBuffer', () => { // chardet is mocked; only the wrapper's handling is tested
+    it('should detect encoding using chardet successfully', () => {
+      const buffer = Buffer.from('test content', 'utf8');
+      mockedChardetDetect.mockReturnValue('UTF-8');
+
+      const result = detectEncodingFromBuffer(buffer);
+      expect(result).toBe('utf-8'); // the wrapper lowercases chardet's answer
+      expect(mockedChardetDetect).toHaveBeenCalledWith(buffer);
+    });
+
+    it('should handle chardet returning mixed case encoding', () => {
+      const buffer = Buffer.from('test content', 'utf8');
+      mockedChardetDetect.mockReturnValue('ISO-8859-1');
+
+      const result = detectEncodingFromBuffer(buffer);
+      expect(result).toBe('iso-8859-1');
+    });
+
+    it('should return null when chardet fails', () => {
+      const buffer = Buffer.from('test content', 'utf8');
+      mockedChardetDetect.mockImplementation(() => {
+        throw new Error('Detection failed');
+      });
+
+      const result = detectEncodingFromBuffer(buffer);
+      expect(result).toBe(null); // the exception is caught and reported via console.warn
+      expect(consoleWarnSpy).toHaveBeenCalledWith(
+        'Failed to detect encoding with chardet:',
+        expect.any(Error),
+      );
+    });
+
+    it('should return null when chardet returns null', () => {
+      const buffer = Buffer.from('test content', 'utf8');
+      mockedChardetDetect.mockReturnValue(null);
+
+      const result = detectEncodingFromBuffer(buffer);
+      expect(result).toBe(null);
+    });
+
+    it('should return null when chardet returns non-string', () => {
+      const buffer = Buffer.from('test content', 'utf8');
+      mockedChardetDetect.mockReturnValue([
+        'utf-8',
+        'iso-8859-1',
+      ] as unknown as string); // cast forces a non-string value past the mock's type
+
+      const result = detectEncodingFromBuffer(buffer);
+      expect(result).toBe(null);
+    });
+  });
+
+  describe('getSystemEncoding - Windows', () => { // win32 branch: output of the mocked `chcp` command is parsed
+    beforeEach(() => {
+      mockedOsPlatform.mockReturnValue('win32');
+    });
+
+    it('should parse Windows chcp output correctly', () => {
+      mockedExecSync.mockReturnValue('Active code page: 65001');
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+      expect(mockedExecSync).toHaveBeenCalledWith('chcp', { encoding: 'utf8' });
+    });
+
+    it('should handle different chcp output formats', () => {
+      mockedExecSync.mockReturnValue('Current code page: 1252'); // only the ": <digits>" part is matched
+
+      const result = getSystemEncoding();
+      expect(result).toBe('windows-1252');
+    });
+
+    it('should handle chcp output with extra whitespace', () => {
+      mockedExecSync.mockReturnValue('Active code page:   437   ');
+
+      const result = getSystemEncoding();
+      expect(result).toBe('cp437');
+    });
+
+    it('should return null when chcp command fails', () => {
+      mockedExecSync.mockImplementation(() => {
+        throw new Error('Command failed');
+      });
+
+      const result = getSystemEncoding();
+      expect(result).toBe(null);
+      expect(consoleWarnSpy).toHaveBeenCalledWith(
+        expect.stringContaining(
+          "Failed to get Windows code page using 'chcp' command",
+        ),
+      );
+    });
+
+    it('should return null when chcp output cannot be parsed', () => {
+      mockedExecSync.mockReturnValue('Unexpected output format'); // no ": <digits>" to match
+
+      const result = getSystemEncoding();
+      expect(result).toBe(null);
+      expect(consoleWarnSpy).toHaveBeenCalledWith(
+        expect.stringContaining(
+          "Failed to get Windows code page using 'chcp' command",
+        ),
+      );
+    });
+
+    it('should return null when code page is not a number', () => {
+      mockedExecSync.mockReturnValue('Active code page: abc');
+
+      const result = getSystemEncoding();
+      expect(result).toBe(null);
+      expect(consoleWarnSpy).toHaveBeenCalledWith(
+        expect.stringContaining(
+          "Failed to get Windows code page using 'chcp' command",
+        ),
+      );
+    });
+
+    it('should return null when code page maps to null', () => {
+      mockedExecSync.mockReturnValue('Active code page: 99999');
+
+      const result = getSystemEncoding();
+      expect(result).toBe(null);
+      // The warning here comes from windowsCodePageToEncoding (unknown page), not from chcp parsing
+      expect(consoleWarnSpy).toHaveBeenCalledWith(
+        'Unable to determine encoding for windows code page 99999.',
+      );
+    });
+  });
+
+  describe('getSystemEncoding - Unix-like', () => { // non-win32 branch: locale variables first, then `locale charmap`
+    beforeEach(() => {
+      mockedOsPlatform.mockReturnValue('linux');
+    });
+
+    it('should parse locale from LC_ALL environment variable', () => {
+      process.env.LC_ALL = 'en_US.UTF-8';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+    });
+
+    it('should parse locale from LC_CTYPE when LC_ALL is not set', () => {
+      process.env.LC_CTYPE = 'fr_FR.ISO-8859-1';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('iso-8859-1');
+    });
+
+    it('should parse locale from LANG when LC_ALL and LC_CTYPE are not set', () => {
+      process.env.LANG = 'de_DE.UTF-8';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+    });
+
+    it('should handle locale charmap command when environment variables are empty', () => {
+      mockedExecSync.mockReturnValue('UTF-8\n'); // trailing newline is trimmed by the implementation
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+      expect(mockedExecSync).toHaveBeenCalledWith('locale charmap', {
+        encoding: 'utf8',
+      });
+    });
+
+    it('should handle locale charmap with mixed case', () => {
+      mockedExecSync.mockReturnValue('ISO-8859-1\n');
+
+      const result = getSystemEncoding();
+      expect(result).toBe('iso-8859-1');
+    });
+
+    it('should return null when locale charmap fails', () => {
+      mockedExecSync.mockImplementation(() => {
+        throw new Error('Command failed');
+      });
+
+      const result = getSystemEncoding();
+      expect(result).toBe(null);
+      expect(consoleWarnSpy).toHaveBeenCalledWith(
+        'Failed to get locale charmap.',
+      );
+    });
+
+    it('should handle locale without encoding (no dot)', () => {
+      process.env.LANG = 'C';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('c'); // NOTE(review): 'c' is the lowercased locale name, not a codeset; confirm callers tolerate it
+    });
+
+    it('should handle empty locale environment variables', () => {
+      process.env.LC_ALL = '';
+      process.env.LC_CTYPE = '';
+      process.env.LANG = '';
+      mockedExecSync.mockReturnValue('UTF-8'); // empty strings are falsy, so the charmap fallback runs
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+    });
+
+    it('should return locale as-is when locale format has no dot', () => {
+      process.env.LANG = 'invalid_format';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('invalid_format');
+    });
+
+    it('should prioritize LC_ALL over other environment variables', () => {
+      process.env.LC_ALL = 'en_US.UTF-8';
+      process.env.LC_CTYPE = 'fr_FR.ISO-8859-1';
+      process.env.LANG = 'de_DE.CP1252';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+    });
+
+    it('should prioritize LC_CTYPE over LANG', () => {
+      process.env.LC_CTYPE = 'fr_FR.ISO-8859-1';
+      process.env.LANG = 'de_DE.CP1252';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('iso-8859-1');
+    });
+  });
+
+  describe('getEncodingForBuffer', () => { // exercises getCachedEncodingForBuffer (the title omits "Cached")
+    beforeEach(() => {
+      mockedOsPlatform.mockReturnValue('linux');
+    });
+
+    it('should use cached system encoding on subsequent calls', () => {
+      process.env.LANG = 'en_US.UTF-8';
+      const buffer = Buffer.from('test');
+
+      // First call populates the cache
+      const result1 = getCachedEncodingForBuffer(buffer);
+      expect(result1).toBe('utf-8');
+
+      // Change environment (should not affect cached result)
+      process.env.LANG = 'fr_FR.ISO-8859-1';
+
+      // Second call should use cached value
+      const result2 = getCachedEncodingForBuffer(buffer);
+      expect(result2).toBe('utf-8');
+    });
+
+    it('should fall back to buffer detection when system encoding fails', () => {
+      // No environment variables set, so `locale charmap` is tried and throws
+      mockedExecSync.mockImplementation(() => {
+        throw new Error('locale command failed');
+      });
+
+      const buffer = Buffer.from('test');
+      mockedChardetDetect.mockReturnValue('ISO-8859-1');
+
+      const result = getCachedEncodingForBuffer(buffer);
+      expect(result).toBe('iso-8859-1');
+      expect(mockedChardetDetect).toHaveBeenCalledWith(buffer);
+    });
+
+    it('should fall back to utf-8 when both system and buffer detection fail', () => {
+      // System encoding fails
+      mockedExecSync.mockImplementation(() => {
+        throw new Error('locale command failed');
+      });
+
+      // Buffer detection fails
+      mockedChardetDetect.mockImplementation(() => {
+        throw new Error('chardet failed');
+      });
+
+      const buffer = Buffer.from('test');
+      const result = getCachedEncodingForBuffer(buffer);
+      expect(result).toBe('utf-8');
+    });
+
+    it('should not cache buffer detection results', () => {
+      // System encoding fails initially
+      mockedExecSync.mockImplementation(() => {
+        throw new Error('locale command failed');
+      });
+
+      const buffer1 = Buffer.from('test1');
+      const buffer2 = Buffer.from('test2');
+
+      mockedChardetDetect // one queued answer per call, consumed in order
+        .mockReturnValueOnce('ISO-8859-1')
+        .mockReturnValueOnce('UTF-16');
+
+      const result1 = getCachedEncodingForBuffer(buffer1);
+      const result2 = getCachedEncodingForBuffer(buffer2);
+
+      expect(result1).toBe('iso-8859-1');
+      expect(result2).toBe('utf-16');
+      expect(mockedChardetDetect).toHaveBeenCalledTimes(2);
+    });
+
+    it('should handle Windows system encoding', () => {
+      mockedOsPlatform.mockReturnValue('win32');
+      mockedExecSync.mockReturnValue('Active code page: 1252');
+
+      const buffer = Buffer.from('test');
+      const result = getCachedEncodingForBuffer(buffer);
+
+      expect(result).toBe('windows-1252');
+    });
+
+    it('should cache null system encoding result', () => {
+      // Reset the cache specifically for this test
+      resetEncodingCache();
+
+      // Ensure we're on Unix-like for this test
+      mockedOsPlatform.mockReturnValue('linux');
+
+      // System encoding detection returns null
+      mockedExecSync.mockImplementation(() => {
+        throw new Error('locale command failed');
+      });
+
+      const buffer1 = Buffer.from('test1');
+      const buffer2 = Buffer.from('test2');
+
+      mockedChardetDetect
+        .mockReturnValueOnce('ISO-8859-1')
+        .mockReturnValueOnce('UTF-16');
+
+      // Clear any previous calls from beforeEach setup or previous tests
+      mockedExecSync.mockClear();
+
+      const result1 = getCachedEncodingForBuffer(buffer1);
+      const result2 = getCachedEncodingForBuffer(buffer2);
+
+      // Should call execSync only once due to caching (null result is cached)
+      expect(mockedExecSync).toHaveBeenCalledTimes(1);
+      expect(result1).toBe('iso-8859-1');
+      expect(result2).toBe('utf-16');
+
+      // Call a third time to verify cache is still used
+      const buffer3 = Buffer.from('test3');
+      mockedChardetDetect.mockReturnValueOnce('UTF-32');
+      const result3 = getCachedEncodingForBuffer(buffer3);
+
+      // Still should be only one call to execSync
+      expect(mockedExecSync).toHaveBeenCalledTimes(1);
+      expect(result3).toBe('utf-32');
+    });
+  });
+
+  describe('Cross-platform behavior', () => { // every platform other than 'win32' takes the locale-variable path
+    it('should work correctly on macOS', () => {
+      mockedOsPlatform.mockReturnValue('darwin');
+      process.env.LANG = 'en_US.UTF-8';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+    });
+
+    it('should work correctly on other Unix-like systems', () => {
+      mockedOsPlatform.mockReturnValue('freebsd');
+      process.env.LANG = 'en_US.UTF-8';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+    });
+
+    it('should handle unknown platforms as Unix-like', () => {
+      mockedOsPlatform.mockReturnValue('unknown' as NodeJS.Platform); // cast: not a real platform value
+      process.env.LANG = 'en_US.UTF-8';
+
+      const result = getSystemEncoding();
+      expect(result).toBe('utf-8');
+    });
+  });
+
+  describe('Edge cases and error handling', () => { // unusual buffers passed to getCachedEncodingForBuffer
+    it('should handle empty buffer gracefully', () => {
+      mockedOsPlatform.mockReturnValue('linux');
+      process.env.LANG = 'en_US.UTF-8';
+
+      const buffer = Buffer.alloc(0);
+      const result = getCachedEncodingForBuffer(buffer);
+      expect(result).toBe('utf-8'); // answered from LANG; the buffer is not inspected on this path
+    });
+
+    it('should handle very large buffers', () => {
+      mockedOsPlatform.mockReturnValue('linux');
+      process.env.LANG = 'en_US.UTF-8';
+
+      const buffer = Buffer.alloc(1024 * 1024, 'a'); // 1 MiB of 'a'
+      const result = getCachedEncodingForBuffer(buffer);
+      expect(result).toBe('utf-8');
+    });
+
+    it('should handle Unicode content', () => {
+      mockedOsPlatform.mockReturnValue('linux');
+      const unicodeText = '你好世界 🌍 ñoño';
+
+      // System encoding fails, forcing the buffer-detection fallback
+      mockedExecSync.mockImplementation(() => {
+        throw new Error('locale command failed');
+      });
+
+      mockedChardetDetect.mockReturnValue('UTF-8');
+
+      const buffer = Buffer.from(unicodeText, 'utf8');
+      const result = getCachedEncodingForBuffer(buffer);
+      expect(result).toBe('utf-8');
+    });
+  });
+});
--- a/packages/core/src/utils/systemEncoding.ts
+++ b/packages/core/src/utils/systemEncoding.ts
@ -0,0 +1,166 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { execSync } from 'child_process';
+import os from 'os';
+import { detect as chardetDetect } from 'chardet';
+
+// Cache for system encoding to avoid repeated detection
+// Use undefined to indicate "not yet checked" vs null meaning "checked but failed"
+let cachedSystemEncoding: string | null | undefined = undefined;
+
+/**
+ * Resets the cached system encoding to "not yet checked" (undefined) so the next lookup re-runs detection - useful for testing
+ */
+export function resetEncodingCache(): void {
+  cachedSystemEncoding = undefined;
+}
+
+/**
+ * Returns the system encoding, caching the result to avoid repeated system calls.
+ * If system encoding detection fails, falls back to detecting from the provided buffer.
+ * Only the system result (including a failed, null one) is cached; buffer detection runs on every call.
+ * @param buffer A buffer to use for detecting encoding if system detection fails.
+ * @returns The cached system encoding if one was found, else the buffer-detected encoding, else 'utf-8'.
+ */
+export function getCachedEncodingForBuffer(buffer: Buffer): string {
+  // undefined means detection has not run yet; a null (failed) result is stored too, so this runs once
+  if (cachedSystemEncoding === undefined) {
+    cachedSystemEncoding = getSystemEncoding();
+  }
+
+  // A non-empty cached system encoding always wins over per-buffer detection
+  if (cachedSystemEncoding) {
+    return cachedSystemEncoding;
+  }
+
+  // Otherwise, detect from this specific buffer (don't cache this result); 'utf-8' is the last resort
+  return detectEncodingFromBuffer(buffer) || 'utf-8';
+}
+
+/**
+ * Detects the system encoding based on the platform.
+ * For Windows, it uses the 'chcp' command to get the current code page.
+ * For Unix-like systems, it checks environment variables like LC_ALL, LC_CTYPE, and LANG.
+ * If those are not set, it tries to run 'locale charmap' to get the encoding.
+ * A trailing locale modifier (e.g. "@euro" in "de_DE.ISO-8859-15@euro") is not part of the codeset and is dropped.
+ * @returns The system encoding as a lowercase string, or null if detection fails.
+ */
+export function getSystemEncoding(): string | null {
+  // Windows
+  if (os.platform() === 'win32') {
+    try {
+      const output = execSync('chcp', { encoding: 'utf8' });
+      const match = output.match(/:\s*(\d+)/); // the code page is the digits after the colon
+      if (match) {
+        const codePage = parseInt(match[1], 10);
+        if (!isNaN(codePage)) {
+          return windowsCodePageToEncoding(codePage);
+        }
+      }
+      // Only warn if we can't parse the output format, not if windowsCodePageToEncoding fails
+      throw new Error(
+        `Unable to parse Windows code page from 'chcp' output "${output.trim()}". `,
+      );
+    } catch (error) {
+      console.warn(
+        `Failed to get Windows code page using 'chcp' command: ${error instanceof Error ? error.message : String(error)}. ` +
+          `Will attempt to detect encoding from command output instead.`,
+      );
+    }
+    return null;
+  }
+
+  // Unix-like
+  // Use environment variables LC_ALL, LC_CTYPE, and LANG to determine the
+  // system encoding. However, these environment variables might not always
+  // be set or accurate. Handle cases where none of these variables are set.
+  const env = process.env;
+  let locale = env.LC_ALL || env.LC_CTYPE || env.LANG || '';
+
+  // Fallback to querying the system directly when environment variables are missing
+  if (!locale) {
+    try {
+      locale = execSync('locale charmap', { encoding: 'utf8' })
+        .toString()
+        .trim();
+    } catch (_e) {
+      console.warn('Failed to get locale charmap.');
+      return null;
+    }
+  }
+
+  const match = locale.match(/\.([^@]+)/); // codeset only: "en_US.UTF-8", "de_DE.ISO-8859-15@euro" (modifier excluded)
+  if (match && match[1]) {
+    return match[1].toLowerCase();
+  }
+
+  // No ".codeset" part: 'locale charmap' output such as "UTF-8", or a bare locale such as "C", is returned lowercased as-is
+  if (locale && !locale.includes('.')) {
+    return locale.toLowerCase();
+  }
+
+  return null;
+}
+
+/**
+ * Converts a Windows code page number to a corresponding encoding name.
+ * @param cp The Windows code page number (e.g., 437, 850, etc.)
+ * @returns The corresponding encoding name as a string, or null (after a console warning) if no mapping exists.
+ */
+export function windowsCodePageToEncoding(cp: number): string | null {
+  // Most common mappings; extend as needed. NOTE(review): confirm the consuming decoder accepts every name here (e.g. 'cp437').
+  const map: { [key: number]: string } = {
+    437: 'cp437',
+    850: 'cp850',
+    852: 'cp852',
+    866: 'cp866',
+    874: 'windows-874',
+    932: 'shift_jis',
+    936: 'gb2312',
+    949: 'euc-kr',
+    950: 'big5',
+    1200: 'utf-16le',
+    1201: 'utf-16be',
+    1250: 'windows-1250',
+    1251: 'windows-1251',
+    1252: 'windows-1252',
+    1253: 'windows-1253',
+    1254: 'windows-1254',
+    1255: 'windows-1255',
+    1256: 'windows-1256',
+    1257: 'windows-1257',
+    1258: 'windows-1258',
+    65001: 'utf-8',
+  };
+
+  if (map[cp]) {
+    return map[cp];
+  }
+
+  console.warn(`Unable to determine encoding for windows code page ${cp}.`);
+  return null; // Unmapped code page: the caller decides on a fallback
+}
+
+/**
+ * Attempts to detect encoding from a buffer using chardet.
+ * This is useful when system encoding detection fails.
+ * A chardet exception is caught and logged with console.warn rather than rethrown.
+ * @param buffer The buffer to analyze for encoding.
+ * @returns The detected encoding as a lowercase string, or null if detection fails.
+ */
+export function detectEncodingFromBuffer(buffer: Buffer): string | null {
+  try {
+    const detected = chardetDetect(buffer);
+    if (detected && typeof detected === 'string') { // ignore null, empty and non-string results
+      return detected.toLowerCase();
+    }
+  } catch (error) {
+    console.warn('Failed to detect encoding with chardet:', error);
+  }
+
+  return null;
+}