diff --git a/package-lock.json b/package-lock.json index 475d01e3..e35d9bc6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -3615,6 +3615,12 @@ "node": ">=8" } }, + "node_modules/chardet": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/chardet/-/chardet-2.1.0.tgz", + "integrity": "sha512-bNFETTG/pM5ryzQ9Ad0lJOTa6HWD/YsScAR3EnCPZRPlQh77JocYktSHOUHelyhm8IARL+o4c4F1bP5KVOjiRA==", + "license": "MIT" + }, "node_modules/check-error": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz", @@ -11813,6 +11819,7 @@ "@types/glob": "^8.1.0", "@types/html-to-text": "^9.0.4", "ajv": "^8.17.1", + "chardet": "^2.1.0", "diff": "^7.0.0", "dotenv": "^17.1.0", "glob": "^10.4.5", diff --git a/packages/cli/src/ui/hooks/shellCommandProcessor.test.ts b/packages/cli/src/ui/hooks/shellCommandProcessor.test.ts index 53dcb0d4..5ebf2b1d 100644 --- a/packages/cli/src/ui/hooks/shellCommandProcessor.test.ts +++ b/packages/cli/src/ui/hooks/shellCommandProcessor.test.ts @@ -19,9 +19,11 @@ vi.mock('os', () => ({ default: { platform: () => 'linux', tmpdir: () => '/tmp', + homedir: () => '/home/user', }, platform: () => 'linux', tmpdir: () => '/tmp', + homedir: () => '/home/user', })); vi.mock('@google/gemini-cli-core'); vi.mock('../utils/textUtils.js', () => ({ diff --git a/packages/cli/src/ui/hooks/shellCommandProcessor.ts b/packages/cli/src/ui/hooks/shellCommandProcessor.ts index e04c9f54..5d2b3166 100644 --- a/packages/cli/src/ui/hooks/shellCommandProcessor.ts +++ b/packages/cli/src/ui/hooks/shellCommandProcessor.ts @@ -5,14 +5,18 @@ */ import { spawn } from 'child_process'; -import { StringDecoder } from 'string_decoder'; +import { TextDecoder } from 'util'; import { HistoryItemWithoutId, IndividualToolCallDisplay, ToolCallStatus, } from '../types.js'; import { useCallback } from 'react'; -import { Config, GeminiClient } from '@google/gemini-cli-core'; +import { + Config, + GeminiClient, + getCachedEncodingForBuffer, +} 
from '@google/gemini-cli-core'; import { type PartListUnion } from '@google/genai'; import { formatMemoryUsage } from '../utils/formatters.js'; import { isBinary } from '../utils/textUtils.js'; @@ -71,8 +75,8 @@ function executeShellCommand( }); // Use decoders to handle multi-byte characters safely (for streaming output). - const stdoutDecoder = new StringDecoder('utf8'); - const stderrDecoder = new StringDecoder('utf8'); + let stdoutDecoder: TextDecoder | null = null; + let stderrDecoder: TextDecoder | null = null; let stdout = ''; let stderr = ''; @@ -85,6 +89,12 @@ function executeShellCommand( let sniffedBytes = 0; const handleOutput = (data: Buffer, stream: 'stdout' | 'stderr') => { + if (!stdoutDecoder || !stderrDecoder) { + const encoding = getCachedEncodingForBuffer(data); + stdoutDecoder = new TextDecoder(encoding); + stderrDecoder = new TextDecoder(encoding); + } + outputChunks.push(data); if (streamToUi && sniffedBytes < MAX_SNIFF_SIZE) { @@ -101,8 +111,8 @@ function executeShellCommand( const decodedChunk = stream === 'stdout' - ? stdoutDecoder.write(data) - : stderrDecoder.write(data); + ? 
stdoutDecoder.decode(data, { stream: true }) + : stderrDecoder.decode(data, { stream: true }); if (stream === 'stdout') { stdout += stripAnsi(decodedChunk); } else { @@ -160,8 +170,12 @@ function executeShellCommand( abortSignal.removeEventListener('abort', abortHandler); // Handle any final bytes lingering in the decoders - stdout += stdoutDecoder.end(); - stderr += stderrDecoder.end(); + if (stdoutDecoder) { + stdout += stdoutDecoder.decode(); + } + if (stderrDecoder) { + stderr += stderrDecoder.decode(); + } const finalBuffer = Buffer.concat(outputChunks); diff --git a/packages/core/package.json b/packages/core/package.json index f19b4145..40f10aa0 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -44,7 +44,8 @@ "simple-git": "^3.28.0", "strip-ansi": "^7.1.0", "undici": "^7.10.0", - "ws": "^8.18.0" + "ws": "^8.18.0", + "chardet": "^2.1.0" }, "devDependencies": { "@types/diff": "^7.0.2", diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 24d8bdb2..f560afb4 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -35,6 +35,7 @@ export * from './utils/editor.js'; export * from './utils/quotaErrorDetection.js'; export * from './utils/fileUtils.js'; export * from './utils/retry.js'; +export * from './utils/systemEncoding.js'; // Export services export * from './services/fileDiscoveryService.js'; diff --git a/packages/core/src/utils/systemEncoding.test.ts b/packages/core/src/utils/systemEncoding.test.ts new file mode 100644 index 00000000..9375db9e --- /dev/null +++ b/packages/core/src/utils/systemEncoding.test.ts @@ -0,0 +1,496 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { vi, describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { execSync } from 'child_process'; +import * as os from 'os'; +import { detect as chardetDetect } from 'chardet'; + +// Mock dependencies +vi.mock('child_process'); +vi.mock('os'); 
+vi.mock('chardet'); + +// Import the functions we want to test after refactoring +import { + getCachedEncodingForBuffer, + getSystemEncoding, + windowsCodePageToEncoding, + detectEncodingFromBuffer, + resetEncodingCache, +} from './systemEncoding.js'; + +describe('Shell Command Processor - Encoding Functions', () => { + let consoleWarnSpy: ReturnType; + let mockedExecSync: ReturnType>; + let mockedOsPlatform: ReturnType string>>; + let mockedChardetDetect: ReturnType>; + + beforeEach(() => { + consoleWarnSpy = vi.spyOn(console, 'warn').mockImplementation(() => {}); + mockedExecSync = vi.mocked(execSync); + mockedOsPlatform = vi.mocked(os.platform); + mockedChardetDetect = vi.mocked(chardetDetect); + + // Reset the encoding cache before each test + resetEncodingCache(); + + // Clear environment variables that might affect tests + delete process.env.LC_ALL; + delete process.env.LC_CTYPE; + delete process.env.LANG; + }); + + afterEach(() => { + vi.restoreAllMocks(); + resetEncodingCache(); + }); + + describe('windowsCodePageToEncoding', () => { + it('should map common Windows code pages correctly', () => { + expect(windowsCodePageToEncoding(437)).toBe('cp437'); + expect(windowsCodePageToEncoding(850)).toBe('cp850'); + expect(windowsCodePageToEncoding(65001)).toBe('utf-8'); + expect(windowsCodePageToEncoding(1252)).toBe('windows-1252'); + expect(windowsCodePageToEncoding(932)).toBe('shift_jis'); + expect(windowsCodePageToEncoding(936)).toBe('gb2312'); + expect(windowsCodePageToEncoding(949)).toBe('euc-kr'); + expect(windowsCodePageToEncoding(950)).toBe('big5'); + expect(windowsCodePageToEncoding(1200)).toBe('utf-16le'); + expect(windowsCodePageToEncoding(1201)).toBe('utf-16be'); + }); + + it('should return null for unmapped code pages and warn', () => { + expect(windowsCodePageToEncoding(99999)).toBe(null); + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'Unable to determine encoding for windows code page 99999.', + ); + }); + + it('should handle all Windows-specific 
code pages', () => { + expect(windowsCodePageToEncoding(874)).toBe('windows-874'); + expect(windowsCodePageToEncoding(1250)).toBe('windows-1250'); + expect(windowsCodePageToEncoding(1251)).toBe('windows-1251'); + expect(windowsCodePageToEncoding(1253)).toBe('windows-1253'); + expect(windowsCodePageToEncoding(1254)).toBe('windows-1254'); + expect(windowsCodePageToEncoding(1255)).toBe('windows-1255'); + expect(windowsCodePageToEncoding(1256)).toBe('windows-1256'); + expect(windowsCodePageToEncoding(1257)).toBe('windows-1257'); + expect(windowsCodePageToEncoding(1258)).toBe('windows-1258'); + }); + }); + + describe('detectEncodingFromBuffer', () => { + it('should detect encoding using chardet successfully', () => { + const buffer = Buffer.from('test content', 'utf8'); + mockedChardetDetect.mockReturnValue('UTF-8'); + + const result = detectEncodingFromBuffer(buffer); + expect(result).toBe('utf-8'); + expect(mockedChardetDetect).toHaveBeenCalledWith(buffer); + }); + + it('should handle chardet returning mixed case encoding', () => { + const buffer = Buffer.from('test content', 'utf8'); + mockedChardetDetect.mockReturnValue('ISO-8859-1'); + + const result = detectEncodingFromBuffer(buffer); + expect(result).toBe('iso-8859-1'); + }); + + it('should return null when chardet fails', () => { + const buffer = Buffer.from('test content', 'utf8'); + mockedChardetDetect.mockImplementation(() => { + throw new Error('Detection failed'); + }); + + const result = detectEncodingFromBuffer(buffer); + expect(result).toBe(null); + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'Failed to detect encoding with chardet:', + expect.any(Error), + ); + }); + + it('should return null when chardet returns null', () => { + const buffer = Buffer.from('test content', 'utf8'); + mockedChardetDetect.mockReturnValue(null); + + const result = detectEncodingFromBuffer(buffer); + expect(result).toBe(null); + }); + + it('should return null when chardet returns non-string', () => { + const buffer = 
Buffer.from('test content', 'utf8'); + mockedChardetDetect.mockReturnValue([ + 'utf-8', + 'iso-8859-1', + ] as unknown as string); + + const result = detectEncodingFromBuffer(buffer); + expect(result).toBe(null); + }); + }); + + describe('getSystemEncoding - Windows', () => { + beforeEach(() => { + mockedOsPlatform.mockReturnValue('win32'); + }); + + it('should parse Windows chcp output correctly', () => { + mockedExecSync.mockReturnValue('Active code page: 65001'); + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + expect(mockedExecSync).toHaveBeenCalledWith('chcp', { encoding: 'utf8' }); + }); + + it('should handle different chcp output formats', () => { + mockedExecSync.mockReturnValue('Current code page: 1252'); + + const result = getSystemEncoding(); + expect(result).toBe('windows-1252'); + }); + + it('should handle chcp output with extra whitespace', () => { + mockedExecSync.mockReturnValue('Active code page: 437 '); + + const result = getSystemEncoding(); + expect(result).toBe('cp437'); + }); + + it('should return null when chcp command fails', () => { + mockedExecSync.mockImplementation(() => { + throw new Error('Command failed'); + }); + + const result = getSystemEncoding(); + expect(result).toBe(null); + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining( + "Failed to get Windows code page using 'chcp' command", + ), + ); + }); + + it('should return null when chcp output cannot be parsed', () => { + mockedExecSync.mockReturnValue('Unexpected output format'); + + const result = getSystemEncoding(); + expect(result).toBe(null); + expect(consoleWarnSpy).toHaveBeenCalledWith( + expect.stringContaining( + "Failed to get Windows code page using 'chcp' command", + ), + ); + }); + + it('should return null when code page is not a number', () => { + mockedExecSync.mockReturnValue('Active code page: abc'); + + const result = getSystemEncoding(); + expect(result).toBe(null); + expect(consoleWarnSpy).toHaveBeenCalledWith( + 
expect.stringContaining( + "Failed to get Windows code page using 'chcp' command", + ), + ); + }); + + it('should return null when code page maps to null', () => { + mockedExecSync.mockReturnValue('Active code page: 99999'); + + const result = getSystemEncoding(); + expect(result).toBe(null); + // Should warn about unknown code page from windowsCodePageToEncoding + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'Unable to determine encoding for windows code page 99999.', + ); + }); + }); + + describe('getSystemEncoding - Unix-like', () => { + beforeEach(() => { + mockedOsPlatform.mockReturnValue('linux'); + }); + + it('should parse locale from LC_ALL environment variable', () => { + process.env.LC_ALL = 'en_US.UTF-8'; + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + }); + + it('should parse locale from LC_CTYPE when LC_ALL is not set', () => { + process.env.LC_CTYPE = 'fr_FR.ISO-8859-1'; + + const result = getSystemEncoding(); + expect(result).toBe('iso-8859-1'); + }); + + it('should parse locale from LANG when LC_ALL and LC_CTYPE are not set', () => { + process.env.LANG = 'de_DE.UTF-8'; + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + }); + + it('should handle locale charmap command when environment variables are empty', () => { + mockedExecSync.mockReturnValue('UTF-8\n'); + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + expect(mockedExecSync).toHaveBeenCalledWith('locale charmap', { + encoding: 'utf8', + }); + }); + + it('should handle locale charmap with mixed case', () => { + mockedExecSync.mockReturnValue('ISO-8859-1\n'); + + const result = getSystemEncoding(); + expect(result).toBe('iso-8859-1'); + }); + + it('should return null when locale charmap fails', () => { + mockedExecSync.mockImplementation(() => { + throw new Error('Command failed'); + }); + + const result = getSystemEncoding(); + expect(result).toBe(null); + expect(consoleWarnSpy).toHaveBeenCalledWith( + 'Failed to get 
locale charmap.', + ); + }); + + it('should handle locale without encoding (no dot)', () => { + process.env.LANG = 'C'; + + const result = getSystemEncoding(); + expect(result).toBe('c'); + }); + + it('should handle empty locale environment variables', () => { + process.env.LC_ALL = ''; + process.env.LC_CTYPE = ''; + process.env.LANG = ''; + mockedExecSync.mockReturnValue('UTF-8'); + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + }); + + it('should return locale as-is when locale format has no dot', () => { + process.env.LANG = 'invalid_format'; + + const result = getSystemEncoding(); + expect(result).toBe('invalid_format'); + }); + + it('should prioritize LC_ALL over other environment variables', () => { + process.env.LC_ALL = 'en_US.UTF-8'; + process.env.LC_CTYPE = 'fr_FR.ISO-8859-1'; + process.env.LANG = 'de_DE.CP1252'; + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + }); + + it('should prioritize LC_CTYPE over LANG', () => { + process.env.LC_CTYPE = 'fr_FR.ISO-8859-1'; + process.env.LANG = 'de_DE.CP1252'; + + const result = getSystemEncoding(); + expect(result).toBe('iso-8859-1'); + }); + }); + + describe('getEncodingForBuffer', () => { + beforeEach(() => { + mockedOsPlatform.mockReturnValue('linux'); + }); + + it('should use cached system encoding on subsequent calls', () => { + process.env.LANG = 'en_US.UTF-8'; + const buffer = Buffer.from('test'); + + // First call + const result1 = getCachedEncodingForBuffer(buffer); + expect(result1).toBe('utf-8'); + + // Change environment (should not affect cached result) + process.env.LANG = 'fr_FR.ISO-8859-1'; + + // Second call should use cached value + const result2 = getCachedEncodingForBuffer(buffer); + expect(result2).toBe('utf-8'); + }); + + it('should fall back to buffer detection when system encoding fails', () => { + // No environment variables set + mockedExecSync.mockImplementation(() => { + throw new Error('locale command failed'); + }); + + const buffer = 
Buffer.from('test'); + mockedChardetDetect.mockReturnValue('ISO-8859-1'); + + const result = getCachedEncodingForBuffer(buffer); + expect(result).toBe('iso-8859-1'); + expect(mockedChardetDetect).toHaveBeenCalledWith(buffer); + }); + + it('should fall back to utf-8 when both system and buffer detection fail', () => { + // System encoding fails + mockedExecSync.mockImplementation(() => { + throw new Error('locale command failed'); + }); + + // Buffer detection fails + mockedChardetDetect.mockImplementation(() => { + throw new Error('chardet failed'); + }); + + const buffer = Buffer.from('test'); + const result = getCachedEncodingForBuffer(buffer); + expect(result).toBe('utf-8'); + }); + + it('should not cache buffer detection results', () => { + // System encoding fails initially + mockedExecSync.mockImplementation(() => { + throw new Error('locale command failed'); + }); + + const buffer1 = Buffer.from('test1'); + const buffer2 = Buffer.from('test2'); + + mockedChardetDetect + .mockReturnValueOnce('ISO-8859-1') + .mockReturnValueOnce('UTF-16'); + + const result1 = getCachedEncodingForBuffer(buffer1); + const result2 = getCachedEncodingForBuffer(buffer2); + + expect(result1).toBe('iso-8859-1'); + expect(result2).toBe('utf-16'); + expect(mockedChardetDetect).toHaveBeenCalledTimes(2); + }); + + it('should handle Windows system encoding', () => { + mockedOsPlatform.mockReturnValue('win32'); + mockedExecSync.mockReturnValue('Active code page: 1252'); + + const buffer = Buffer.from('test'); + const result = getCachedEncodingForBuffer(buffer); + + expect(result).toBe('windows-1252'); + }); + + it('should cache null system encoding result', () => { + // Reset the cache specifically for this test + resetEncodingCache(); + + // Ensure we're on Unix-like for this test + mockedOsPlatform.mockReturnValue('linux'); + + // System encoding detection returns null + mockedExecSync.mockImplementation(() => { + throw new Error('locale command failed'); + }); + + const buffer1 = 
Buffer.from('test1'); + const buffer2 = Buffer.from('test2'); + + mockedChardetDetect + .mockReturnValueOnce('ISO-8859-1') + .mockReturnValueOnce('UTF-16'); + + // Clear any previous calls from beforeEach setup or previous tests + mockedExecSync.mockClear(); + + const result1 = getCachedEncodingForBuffer(buffer1); + const result2 = getCachedEncodingForBuffer(buffer2); + + // Should call execSync only once due to caching (null result is cached) + expect(mockedExecSync).toHaveBeenCalledTimes(1); + expect(result1).toBe('iso-8859-1'); + expect(result2).toBe('utf-16'); + + // Call a third time to verify cache is still used + const buffer3 = Buffer.from('test3'); + mockedChardetDetect.mockReturnValueOnce('UTF-32'); + const result3 = getCachedEncodingForBuffer(buffer3); + + // Still should be only one call to execSync + expect(mockedExecSync).toHaveBeenCalledTimes(1); + expect(result3).toBe('utf-32'); + }); + }); + + describe('Cross-platform behavior', () => { + it('should work correctly on macOS', () => { + mockedOsPlatform.mockReturnValue('darwin'); + process.env.LANG = 'en_US.UTF-8'; + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + }); + + it('should work correctly on other Unix-like systems', () => { + mockedOsPlatform.mockReturnValue('freebsd'); + process.env.LANG = 'en_US.UTF-8'; + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + }); + + it('should handle unknown platforms as Unix-like', () => { + mockedOsPlatform.mockReturnValue('unknown' as NodeJS.Platform); + process.env.LANG = 'en_US.UTF-8'; + + const result = getSystemEncoding(); + expect(result).toBe('utf-8'); + }); + }); + + describe('Edge cases and error handling', () => { + it('should handle empty buffer gracefully', () => { + mockedOsPlatform.mockReturnValue('linux'); + process.env.LANG = 'en_US.UTF-8'; + + const buffer = Buffer.alloc(0); + const result = getCachedEncodingForBuffer(buffer); + expect(result).toBe('utf-8'); + }); + + it('should handle very 
large buffers', () => { + mockedOsPlatform.mockReturnValue('linux'); + process.env.LANG = 'en_US.UTF-8'; + + const buffer = Buffer.alloc(1024 * 1024, 'a'); + const result = getCachedEncodingForBuffer(buffer); + expect(result).toBe('utf-8'); + }); + + it('should handle Unicode content', () => { + mockedOsPlatform.mockReturnValue('linux'); + const unicodeText = '你好世界 🌍 ñoño'; + + // System encoding fails + mockedExecSync.mockImplementation(() => { + throw new Error('locale command failed'); + }); + + mockedChardetDetect.mockReturnValue('UTF-8'); + + const buffer = Buffer.from(unicodeText, 'utf8'); + const result = getCachedEncodingForBuffer(buffer); + expect(result).toBe('utf-8'); + }); + }); +}); diff --git a/packages/core/src/utils/systemEncoding.ts b/packages/core/src/utils/systemEncoding.ts new file mode 100644 index 00000000..f162c223 --- /dev/null +++ b/packages/core/src/utils/systemEncoding.ts @@ -0,0 +1,166 @@ +/** + * @license + * Copyright 2025 Google LLC + * SPDX-License-Identifier: Apache-2.0 + */ + +import { execSync } from 'child_process'; +import os from 'os'; +import { detect as chardetDetect } from 'chardet'; + +// Cache for system encoding to avoid repeated detection +// Use undefined to indicate "not yet checked" vs null meaning "checked but failed" +let cachedSystemEncoding: string | null | undefined = undefined; + +/** + * Reset the encoding cache - useful for testing + */ +export function resetEncodingCache(): void { + cachedSystemEncoding = undefined; +} + +/** + * Returns the system encoding, caching the result to avoid repeated system calls. + * If system encoding detection fails, falls back to detecting from the provided buffer. + * Note: Only the system encoding is cached - buffer-based detection runs for each buffer + * since different buffers may have different encodings. + * @param buffer A buffer to use for detecting encoding if system detection fails. 
/**
 * Picks the text encoding to use when decoding `buffer`.
 *
 * The system encoding is looked up once through `getSystemEncoding()` and the
 * outcome is stored in the module-level `cachedSystemEncoding` (a failed
 * lookup is stored as `null`, so it is not retried). Detection from the buffer
 * itself is never cached: it runs again for every buffer passed in.
 *
 * @param buffer Bytes to inspect when no system encoding is available.
 * @returns The cached system encoding when there is one, otherwise the
 *   encoding detected from `buffer`, otherwise `'utf-8'`.
 */
export function getCachedEncodingForBuffer(buffer: Buffer): string {
  // `undefined` means "not looked up yet"; `null` means "looked up and failed".
  if (cachedSystemEncoding === undefined) {
    cachedSystemEncoding = getSystemEncoding();
  }

  if (cachedSystemEncoding) {
    return cachedSystemEncoding;
  }

  // Per-buffer fallback; deliberately not written back to the cache.
  return detectEncodingFromBuffer(buffer) || 'utf-8';
}

/**
 * Detects the system encoding for the current platform.
 *
 * - Windows: runs `chcp` and maps the reported code page through
 *   `windowsCodePageToEncoding()`.
 * - Everything else: reads the first non-empty value of `LC_ALL`, `LC_CTYPE`
 *   and `LANG` and extracts its codeset; if all three are unset or empty it
 *   runs `locale charmap` and uses that output as the encoding name.
 *
 * @returns The lowercase encoding name, or `null` when detection fails.
 */
export function getSystemEncoding(): string | null {
  // Windows
  if (os.platform() === 'win32') {
    try {
      const output = execSync('chcp', { encoding: 'utf8' });
      const match = output.match(/:\s*(\d+)/);
      if (match) {
        const codePage = parseInt(match[1], 10);
        if (!isNaN(codePage)) {
          // An unmapped code page returns null here without the parse warning below.
          return windowsCodePageToEncoding(codePage);
        }
      }
      // Only warn if we can't parse the output format, not if windowsCodePageToEncoding fails
      throw new Error(
        `Unable to parse Windows code page from 'chcp' output "${output.trim()}". `,
      );
    } catch (error) {
      console.warn(
        `Failed to get Windows code page using 'chcp' command: ${error instanceof Error ? error.message : String(error)}. ` +
          `Will attempt to detect encoding from command output instead.`,
      );
    }
    return null;
  }

  // Unix-like
  // `||` (not `??`) on purpose: a variable that is set but empty must fall
  // through to the next one.
  const env = process.env;
  const locale = env.LC_ALL || env.LC_CTYPE || env.LANG || '';
  if (locale) {
    return codesetFromLocaleName(locale);
  }

  // No locale variables: ask the system directly. The output of
  // `locale charmap` is already an encoding name, so it must not be parsed as
  // a locale name (charmaps such as "ANSI_X3.4-1968" contain a dot and would
  // otherwise be truncated to "4-1968").
  try {
    const charmap = execSync('locale charmap', { encoding: 'utf8' })
      .toString()
      .trim();
    return charmap ? charmap.toLowerCase() : null;
  } catch (_e) {
    console.warn('Failed to get locale charmap.');
    return null;
  }
}

/**
 * Extracts the codeset from a locale name of the form
 * `language[_territory][.codeset][@modifier]`.
 *
 * @param locale A non-empty locale name, e.g. `en_US.UTF-8` or `sr_RS.UTF-8@latin`.
 * @returns The lowercase codeset without any `@modifier`; the whole lowercase
 *   name when it contains no dot; `null` when the dot is not followed by a codeset.
 */
function codesetFromLocaleName(locale: string): string | null {
  // Stop at '@' so a trailing modifier is not glued onto the codeset.
  const match = locale.match(/\.([^@]+)/);
  if (match) {
    return match[1].toLowerCase();
  }

  // NOTE(review): a name without a codeset (e.g. "C" -> "c") is returned as-is
  // because existing callers and tests rely on it, but it is not an encoding
  // label — confirm that consumers guard `new TextDecoder(...)` against it.
  if (!locale.includes('.')) {
    return locale.toLowerCase();
  }

  return null;
}

// Most common code page mappings; extend as needed.
// NOTE(review): 'cp437', 'cp850' and 'cp852' do not appear in the WHATWG
// Encoding label list — confirm how consumers that build a TextDecoder from
// these names handle them.
const WINDOWS_CODE_PAGE_ENCODINGS: Readonly<Record<number, string>> = {
  437: 'cp437',
  850: 'cp850',
  852: 'cp852',
  866: 'cp866',
  874: 'windows-874',
  932: 'shift_jis',
  936: 'gb2312',
  949: 'euc-kr',
  950: 'big5',
  1200: 'utf-16le',
  1201: 'utf-16be',
  1250: 'windows-1250',
  1251: 'windows-1251',
  1252: 'windows-1252',
  1253: 'windows-1253',
  1254: 'windows-1254',
  1255: 'windows-1255',
  1256: 'windows-1256',
  1257: 'windows-1257',
  1258: 'windows-1258',
  65001: 'utf-8',
};

/**
 * Converts a Windows code page number to the corresponding encoding name.
 * Logs a warning when the code page has no mapping.
 *
 * @param cp The Windows code page number (e.g., 437, 850, etc.)
 * @returns The encoding name, or `null` if no mapping exists.
 */
export function windowsCodePageToEncoding(cp: number): string | null {
  const encoding = WINDOWS_CODE_PAGE_ENCODINGS[cp];
  if (encoding) {
    return encoding;
  }

  console.warn(`Unable to determine encoding for windows code page ${cp}.`);
  return null; // Return null if no mapping found
}

/**
 * Attempts to detect the encoding of a buffer using chardet.
 * Used when system encoding detection fails. Errors thrown by chardet are
 * logged and reported as a failed detection rather than propagated.
 *
 * @param buffer The buffer to analyze for encoding.
 * @returns The detected encoding as a lowercase string, or `null` if chardet
 *   throws or does not return a non-empty string.
 */
export function detectEncodingFromBuffer(buffer: Buffer): string | null {
  try {
    const detected = chardetDetect(buffer);
    if (detected && typeof detected === 'string') {
      return detected.toLowerCase();
    }
  } catch (error) {
    console.warn('Failed to detect encoding with chardet:', error);
  }

  return null;
}