gemini-cli/packages/core/src/utils/fileUtils.ts

373 lines
12 KiB
TypeScript

/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import fs from 'node:fs';
import path from 'node:path';
import { PartUnion } from '@google/genai';
import mime from 'mime-types';
// Constants for text file processing
const DEFAULT_MAX_LINES_TEXT_FILE = 2000;
const MAX_LINE_LENGTH_TEXT_FILE = 2000;
// Default values for encoding and separator format
export const DEFAULT_ENCODING: BufferEncoding = 'utf-8';
/**
* Looks up the specific MIME type for a file path.
* @param filePath Path to the file.
* @returns The specific MIME type string (e.g., 'text/python', 'application/javascript') or undefined if not found or ambiguous.
*/
export function getSpecificMimeType(filePath: string): string | undefined {
const lookedUpMime = mime.lookup(filePath);
return typeof lookedUpMime === 'string' ? lookedUpMime : undefined;
}
/**
* Checks if a path is within a given root directory.
* @param pathToCheck The absolute path to check.
* @param rootDirectory The absolute root directory.
* @returns True if the path is within the root directory, false otherwise.
*/
export function isWithinRoot(
pathToCheck: string,
rootDirectory: string,
): boolean {
const normalizedPathToCheck = path.resolve(pathToCheck);
const normalizedRootDirectory = path.resolve(rootDirectory);
// Ensure the rootDirectory path ends with a separator for correct startsWith comparison,
// unless it's the root path itself (e.g., '/' or 'C:\').
const rootWithSeparator =
normalizedRootDirectory === path.sep ||
normalizedRootDirectory.endsWith(path.sep)
? normalizedRootDirectory
: normalizedRootDirectory + path.sep;
return (
normalizedPathToCheck === normalizedRootDirectory ||
normalizedPathToCheck.startsWith(rootWithSeparator)
);
}
/**
* Determines if a file is likely binary based on content sampling.
* @param filePath Path to the file.
* @returns Promise that resolves to true if the file appears to be binary.
*/
export async function isBinaryFile(filePath: string): Promise<boolean> {
let fileHandle: fs.promises.FileHandle | undefined;
try {
fileHandle = await fs.promises.open(filePath, 'r');
// Read up to 4KB or file size, whichever is smaller
const stats = await fileHandle.stat();
const fileSize = stats.size;
if (fileSize === 0) {
// Empty file is not considered binary for content checking
return false;
}
const bufferSize = Math.min(4096, fileSize);
const buffer = Buffer.alloc(bufferSize);
const result = await fileHandle.read(buffer, 0, buffer.length, 0);
const bytesRead = result.bytesRead;
if (bytesRead === 0) return false;
let nonPrintableCount = 0;
for (let i = 0; i < bytesRead; i++) {
if (buffer[i] === 0) return true; // Null byte is a strong indicator
if (buffer[i] < 9 || (buffer[i] > 13 && buffer[i] < 32)) {
nonPrintableCount++;
}
}
// If >30% non-printable characters, consider it binary
return nonPrintableCount / bytesRead > 0.3;
} catch (error) {
// Log error for debugging while maintaining existing behavior
console.warn(
`Failed to check if file is binary: ${filePath}`,
error instanceof Error ? error.message : String(error),
);
// If any error occurs (e.g. file not found, permissions),
// treat as not binary here; let higher-level functions handle existence/access errors.
return false;
} finally {
// Safely close the file handle if it was successfully opened
if (fileHandle) {
try {
await fileHandle.close();
} catch (closeError) {
// Log close errors for debugging while continuing with cleanup
console.warn(
`Failed to close file handle for: ${filePath}`,
closeError instanceof Error ? closeError.message : String(closeError),
);
// The important thing is that we attempted to clean up
}
}
}
}
/**
* Detects the type of file based on extension and content.
* @param filePath Path to the file.
* @returns Promise that resolves to 'text', 'image', 'pdf', 'audio', 'video', 'binary' or 'svg'.
*/
export async function detectFileType(
filePath: string,
): Promise<'text' | 'image' | 'pdf' | 'audio' | 'video' | 'binary' | 'svg'> {
const ext = path.extname(filePath).toLowerCase();
// The mimetype for "ts" is MPEG transport stream (a video format) but we want
// to assume these are typescript files instead.
if (ext === '.ts') {
return 'text';
}
if (ext === '.svg') {
return 'svg';
}
const lookedUpMimeType = mime.lookup(filePath); // Returns false if not found, or the mime type string
if (lookedUpMimeType) {
if (lookedUpMimeType.startsWith('image/')) {
return 'image';
}
if (lookedUpMimeType.startsWith('audio/')) {
return 'audio';
}
if (lookedUpMimeType.startsWith('video/')) {
return 'video';
}
if (lookedUpMimeType === 'application/pdf') {
return 'pdf';
}
}
// Stricter binary check for common non-text extensions before content check
// These are often not well-covered by mime-types or might be misidentified.
if (
[
'.zip',
'.tar',
'.gz',
'.exe',
'.dll',
'.so',
'.class',
'.jar',
'.war',
'.7z',
'.doc',
'.docx',
'.xls',
'.xlsx',
'.ppt',
'.pptx',
'.odt',
'.ods',
'.odp',
'.bin',
'.dat',
'.obj',
'.o',
'.a',
'.lib',
'.wasm',
'.pyc',
'.pyo',
].includes(ext)
) {
return 'binary';
}
// Fall back to content-based check if mime type wasn't conclusive for image/pdf
// and it's not a known binary extension.
if (await isBinaryFile(filePath)) {
return 'binary';
}
return 'text';
}
export interface ProcessedFileReadResult {
llmContent: PartUnion; // string for text, Part for image/pdf/unreadable binary
returnDisplay: string;
error?: string; // Optional error message for the LLM if file processing failed
isTruncated?: boolean; // For text files, indicates if content was truncated
originalLineCount?: number; // For text files
linesShown?: [number, number]; // For text files [startLine, endLine] (1-based for display)
}
/**
* Reads and processes a single file, handling text, images, and PDFs.
* @param filePath Absolute path to the file.
* @param rootDirectory Absolute path to the project root for relative path display.
* @param offset Optional offset for text files (0-based line number).
* @param limit Optional limit for text files (number of lines to read).
* @returns ProcessedFileReadResult object.
*/
export async function processSingleFileContent(
filePath: string,
rootDirectory: string,
offset?: number,
limit?: number,
): Promise<ProcessedFileReadResult> {
try {
if (!fs.existsSync(filePath)) {
// Sync check is acceptable before async read
return {
llmContent: '',
returnDisplay: 'File not found.',
error: `File not found: ${filePath}`,
};
}
const stats = await fs.promises.stat(filePath);
if (stats.isDirectory()) {
return {
llmContent: '',
returnDisplay: 'Path is a directory.',
error: `Path is a directory, not a file: ${filePath}`,
};
}
const fileSizeInBytes = stats.size;
// 20MB limit
const maxFileSize = 20 * 1024 * 1024;
if (fileSizeInBytes > maxFileSize) {
throw new Error(
`File size exceeds the 20MB limit: ${filePath} (${(
fileSizeInBytes /
(1024 * 1024)
).toFixed(2)}MB)`,
);
}
const fileType = await detectFileType(filePath);
const relativePathForDisplay = path
.relative(rootDirectory, filePath)
.replace(/\\/g, '/');
switch (fileType) {
case 'binary': {
return {
llmContent: `Cannot display content of binary file: ${relativePathForDisplay}`,
returnDisplay: `Skipped binary file: ${relativePathForDisplay}`,
};
}
case 'svg': {
const SVG_MAX_SIZE_BYTES = 1 * 1024 * 1024;
if (stats.size > SVG_MAX_SIZE_BYTES) {
return {
llmContent: `Cannot display content of SVG file larger than 1MB: ${relativePathForDisplay}`,
returnDisplay: `Skipped large SVG file (>1MB): ${relativePathForDisplay}`,
};
}
const content = await fs.promises.readFile(filePath, 'utf8');
return {
llmContent: content,
returnDisplay: `Read SVG as text: ${relativePathForDisplay}`,
};
}
case 'text': {
const content = await fs.promises.readFile(filePath, 'utf8');
const lines = content.split('\n');
const originalLineCount = lines.length;
const startLine = offset || 0;
const effectiveLimit =
limit === undefined ? DEFAULT_MAX_LINES_TEXT_FILE : limit;
// Ensure endLine does not exceed originalLineCount
const endLine = Math.min(startLine + effectiveLimit, originalLineCount);
// Ensure selectedLines doesn't try to slice beyond array bounds if startLine is too high
const actualStartLine = Math.min(startLine, originalLineCount);
const selectedLines = lines.slice(actualStartLine, endLine);
let linesWereTruncatedInLength = false;
const formattedLines = selectedLines.map((line) => {
if (line.length > MAX_LINE_LENGTH_TEXT_FILE) {
linesWereTruncatedInLength = true;
return (
line.substring(0, MAX_LINE_LENGTH_TEXT_FILE) + '... [truncated]'
);
}
return line;
});
const contentRangeTruncated =
startLine > 0 || endLine < originalLineCount;
const isTruncated = contentRangeTruncated || linesWereTruncatedInLength;
let llmTextContent = '';
if (contentRangeTruncated) {
llmTextContent += `[File content truncated: showing lines ${actualStartLine + 1}-${endLine} of ${originalLineCount} total lines. Use offset/limit parameters to view more.]\n`;
} else if (linesWereTruncatedInLength) {
llmTextContent += `[File content partially truncated: some lines exceeded maximum length of ${MAX_LINE_LENGTH_TEXT_FILE} characters.]\n`;
}
llmTextContent += formattedLines.join('\n');
// By default, return nothing to streamline the common case of a successful read_file.
let returnDisplay = '';
if (contentRangeTruncated) {
returnDisplay = `Read lines ${
actualStartLine + 1
}-${endLine} of ${originalLineCount} from ${relativePathForDisplay}`;
if (linesWereTruncatedInLength) {
returnDisplay += ' (some lines were shortened)';
}
} else if (linesWereTruncatedInLength) {
returnDisplay = `Read all ${originalLineCount} lines from ${relativePathForDisplay} (some lines were shortened)`;
}
return {
llmContent: llmTextContent,
returnDisplay,
isTruncated,
originalLineCount,
linesShown: [actualStartLine + 1, endLine],
};
}
case 'image':
case 'pdf':
case 'audio':
case 'video': {
const contentBuffer = await fs.promises.readFile(filePath);
const base64Data = contentBuffer.toString('base64');
return {
llmContent: {
inlineData: {
data: base64Data,
mimeType: mime.lookup(filePath) || 'application/octet-stream',
},
},
returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`,
};
}
default: {
// Should not happen with current detectFileType logic
const exhaustiveCheck: never = fileType;
return {
llmContent: `Unhandled file type: ${exhaustiveCheck}`,
returnDisplay: `Skipped unhandled file type: ${relativePathForDisplay}`,
error: `Unhandled file type for ${filePath}`,
};
}
}
} catch (error) {
const errorMessage = error instanceof Error ? error.message : String(error);
const displayPath = path
.relative(rootDirectory, filePath)
.replace(/\\/g, '/');
return {
llmContent: `Error reading file ${displayPath}: ${errorMessage}`,
returnDisplay: `Error reading file ${displayPath}: ${errorMessage}`,
error: `Error reading file ${filePath}: ${errorMessage}`,
};
}
}