Add concatenation tool (#130)

* Adding a tool inspired by files-to-prompt that will recursively read through all the files in a directory (guarded by targetDir) and concatenate those files for the model. Ignores common build artifacts and non-text files.

* Migrated glob logic to fast-glob. Buffed the tool description to give more guidance to the model. Incorporated review feedback.

* lint and error checking.
This commit is contained in:
Allen Hutchison 2025-04-23 17:25:47 -07:00 committed by GitHub
parent d771dcbdb9
commit cf92ffab34
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 388 additions and 0 deletions

View File

@ -17,6 +17,7 @@ import { EditTool } from '../tools/edit.js';
import { TerminalTool } from '../tools/terminal.js';
import { WriteFileTool } from '../tools/write-file.js';
import { WebFetchTool } from '../tools/web-fetch.js';
import { ReadManyFilesTool } from '../tools/read-many-files.js';
const DEFAULT_PASSTHROUGH_COMMANDS = ['ls', 'git', 'npm'];
@ -130,6 +131,7 @@ function createToolRegistry(config: Config): ToolRegistry {
new TerminalTool(targetDir, config),
new WriteFileTool(targetDir),
new WebFetchTool(), // Note: WebFetchTool takes no arguments
new ReadManyFilesTool(targetDir),
];
for (const tool of tools) {
registry.registerTool(tool);

View File

@ -0,0 +1,386 @@
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { BaseTool, ToolResult } from './tools.js';
import { SchemaValidator } from '../utils/schemaValidator.js';
import { getErrorMessage } from '../utils/errors.js';
import * as fs from 'fs/promises';
import * as path from 'path';
import fg from 'fast-glob';
/**
* Parameters accepted by ReadManyFilesTool.
*
* `paths` and `include` are concatenated into a single list of search
* patterns; `exclude` (plus the built-in defaults, unless disabled) becomes
* the ignore list for that search.
*/
export interface ReadManyFilesParams {
/**
* Required. File paths or glob patterns to search for, interpreted
* relative to the tool's configured target directory.
* Example: ["src/** /*.ts", "README.md"]
* (the space after `**` only keeps this comment from being closed early;
* omit it in real patterns).
*/
paths: string[];
/**
* Optional. Additional glob patterns to search for.
* These are appended to `paths` before the search runs, so they widen
* the set of matched files rather than filtering it.
* Example: ["*.ts", "src/** /*.md"]
*/
include?: string[];
/**
* Optional. Glob patterns for files/directories to leave out.
* Passed to the glob search as ignore patterns, after the default
* excludes when those are enabled.
* Example: ["*.log", "dist/**"]
*/
exclude?: string[];
/**
* Optional. Intended to toggle recursive directory search.
* NOTE: `execute` does not read this flag; recursion is determined solely
* by the use of `**` in the supplied glob patterns.
*/
recursive?: boolean;
/**
* Optional. Whether to apply the built-in default exclusion patterns.
* Treated as true when omitted.
*/
useDefaultExcludes?: boolean;
}
/**
* Default exclusion patterns for commonly ignored directories and binary file types.
* These are glob ignore patterns; they are applied whenever `useDefaultExcludes`
* is not explicitly set to false, and any caller-supplied `exclude` patterns are
* appended after them.
* TODO(adh): Consider making this configurable or extendable through a command line argument.
* TODO(adh): Look into sharing this list with the glob tool.
*/
const DEFAULT_EXCLUDES: string[] = [
'**/node_modules/**',
'**/.git/**',
'**/.vscode/**',
'**/.idea/**',
'**/dist/**',
'**/build/**',
'**/coverage/**',
'**/__pycache__/**',
'**/*.pyc',
'**/*.pyo',
'**/*.bin',
'**/*.exe',
'**/*.dll',
'**/*.so',
'**/*.dylib',
'**/*.class',
'**/*.jar',
'**/*.war',
'**/*.zip',
'**/*.tar',
'**/*.gz',
'**/*.bz2',
'**/*.rar',
'**/*.7z',
'**/*.png',
'**/*.jpg',
'**/*.jpeg',
'**/*.gif',
'**/*.bmp',
'**/*.tiff',
'**/*.ico',
'**/*.pdf',
'**/*.doc',
'**/*.docx',
'**/*.xls',
'**/*.xlsx',
'**/*.ppt',
'**/*.pptx',
'**/*.odt',
'**/*.ods',
'**/*.odp',
'**/*.DS_Store',
'**/.env',
];
// Default values for encoding and separator format.
// DEFAULT_ENCODING is the encoding used to decode each file's bytes into text.
const DEFAULT_ENCODING: BufferEncoding = 'utf-8';
// Header line emitted before each file's content; the `{filePath}` placeholder
// is replaced with the file's path relative to the target directory.
const DEFAULT_OUTPUT_SEPARATOR_FORMAT: string = '--- {filePath} ---';
/**
 * Tool that finds multiple text files below a target directory using glob
 * patterns, reads them, and concatenates their contents into one string.
 *
 * Each file's content is preceded by a separator line built from
 * `DEFAULT_OUTPUT_SEPARATOR_FORMAT`. Files whose first 1 KB contains a NUL
 * byte are treated as binary and skipped; files that cannot be read are
 * skipped and reported instead of aborting the whole operation.
 *
 * It is intended to run in an environment with access to the local file
 * system (e.g., a Node.js backend).
 */
export class ReadManyFilesTool extends BaseTool<
  ReadManyFilesParams,
  ToolResult
> {
  /** Name under which the tool is registered. */
  static readonly Name: string = 'readManyFiles';

  /** Absolute, resolved root directory that every read is confined to. */
  readonly targetDir: string;

  /**
   * Creates an instance of ReadManyFilesTool.
   * @param targetDir The root directory within which this tool is allowed to
   *   operate. It is resolved to an absolute path, and all patterns provided
   *   in `params` are interpreted relative to it.
   */
  constructor(targetDir: string) {
    const parameterSchema: Record<string, unknown> = {
      type: 'object',
      properties: {
        paths: {
          type: 'array',
          items: { type: 'string' },
          description:
            "Required. An array of glob patterns or paths relative to the tool's target directory. Examples: ['src/**/*.ts'], ['README.md', 'docs/']",
        },
        include: {
          type: 'array',
          items: { type: 'string' },
          description:
            'Optional. Additional glob patterns to include. These are merged with `paths`. Example: ["*.test.ts"] to specifically add test files if they were broadly excluded.',
          default: [],
        },
        exclude: {
          type: 'array',
          items: { type: 'string' },
          description:
            'Optional. Glob patterns for files/directories to exclude. Added to default excludes if useDefaultExcludes is true. Example: ["**/*.log", "temp/"]',
          default: [],
        },
        recursive: {
          type: 'boolean',
          description:
            'Optional. Whether to search recursively (primarily controlled by `**` in glob patterns). Defaults to true.',
          default: true,
        },
        useDefaultExcludes: {
          type: 'boolean',
          description:
            'Optional. Whether to apply a list of default exclusion patterns (e.g., node_modules, .git, binary files). Defaults to true.',
          default: true,
        },
      },
      required: ['paths'],
    };
    super(
      ReadManyFilesTool.Name,
      'Read Many Files',
      `Reads content from multiple text files specified by paths or glob patterns within a configured target directory and concatenates them into a single string.
This tool is useful when you need to understand or analyze a collection of files, such as:
- Getting an overview of a codebase or parts of it (e.g., all TypeScript files in the 'src' directory).
- Finding where specific functionality is implemented if the user asks broad questions about code.
- Reviewing documentation files (e.g., all Markdown files in the 'docs' directory).
- Gathering context from multiple configuration files.
- When the user asks to "read all files in X directory" or "show me the content of all Y files".
Use this tool when the user's query implies needing the content of several files simultaneously for context, analysis, or summarization.
It uses default UTF-8 encoding and a '--- {filePath} ---' separator between file contents.
Ensure paths are relative to the target directory. Glob patterns like 'src/**/*.js' are supported.
Avoid using for single files if a more specific single-file reading tool is available, unless the user specifically requests to process a list containing just one file via this tool.
This tool should NOT be used for binary files; it attempts to skip them.
Default excludes apply to common non-text files and large dependency directories unless 'useDefaultExcludes' is false.`,
      parameterSchema,
    );
    this.targetDir = path.resolve(targetDir);
  }

  /**
   * Checks the supplied parameters before execution.
   *
   * Runs the JSON-schema check first (when a schema is present), then
   * verifies that `paths` is an array of non-blank strings and that
   * `include` / `exclude`, if given, are arrays of strings.
   *
   * @param params Parameters to validate.
   * @returns A human-readable error message, or `null` when the parameters
   *   are acceptable.
   */
  validateParams(params: ReadManyFilesParams): string | null {
    if (
      this.schema.parameters &&
      !SchemaValidator.validate(
        this.schema.parameters as Record<string, unknown>,
        params,
      )
    ) {
      if (
        !params.paths ||
        !Array.isArray(params.paths) ||
        params.paths.length === 0
      ) {
        return 'The "paths" parameter is required and must be a non-empty array of strings/glob patterns.';
      }
      return 'Parameters failed schema validation. Ensure "paths" is a non-empty array and other parameters match their expected types.';
    }
    // The schema check is skipped when no schema is attached, so make sure
    // `paths` is iterable before looping over it instead of throwing.
    if (!Array.isArray(params.paths)) {
      return 'The "paths" parameter is required and must be a non-empty array of strings/glob patterns.';
    }
    for (const p of params.paths) {
      if (typeof p !== 'string' || p.trim() === '') {
        return 'Each item in "paths" must be a non-empty string/glob pattern.';
      }
    }
    if (
      params.include &&
      (!Array.isArray(params.include) ||
        !params.include.every((item) => typeof item === 'string'))
    ) {
      return 'If provided, "include" must be an array of strings/glob patterns.';
    }
    if (
      params.exclude &&
      (!Array.isArray(params.exclude) ||
        !params.exclude.every((item) => typeof item === 'string'))
    ) {
      return 'If provided, "exclude" must be an array of strings/glob patterns.';
    }
    return null;
  }

  /**
   * Builds a one-line summary of what a call with `params` will do: the
   * search patterns, a preview of at most two exclusion patterns, the
   * encoding and an example separator.
   *
   * @param params Parameters the tool is about to be executed with.
   * @returns The summary text.
   */
  getDescription(params: ReadManyFilesParams): string {
    const allPatterns = [...params.paths, ...(params.include || [])];
    const pathDesc = `using patterns: \`${allPatterns.join('`, `')}\` (within target directory: \`${this.targetDir}\`)`;
    let effectiveExcludes =
      params.useDefaultExcludes !== false ? [...DEFAULT_EXCLUDES] : [];
    if (params.exclude && params.exclude.length > 0) {
      effectiveExcludes = [...effectiveExcludes, ...params.exclude];
    }
    const excludeDesc = `Excluding: ${effectiveExcludes.length > 0 ? `patterns like \`${effectiveExcludes.slice(0, 2).join('`, `')}${effectiveExcludes.length > 2 ? '...`' : '`'}` : 'none explicitly (beyond default non-text file avoidance).'}`;
    return `Will attempt to read and concatenate files ${pathDesc}. ${excludeDesc}. File encoding: ${DEFAULT_ENCODING}. Separator: "${DEFAULT_OUTPUT_SEPARATOR_FORMAT.replace('{filePath}', 'path/to/file.ext')}".`;
  }

  /**
   * Reports whether `candidatePath` is the target directory itself or lies
   * below it.
   *
   * The decision is made on path segments via `path.relative` rather than a
   * raw string prefix, so a sibling such as `<targetDir>-other/file` is not
   * mistaken for a child, and separator style (`/` vs `\`) does not matter.
   */
  private isWithinTargetDir(candidatePath: string): boolean {
    const relativeToBase = path.relative(
      this.targetDir,
      path.resolve(candidatePath),
    );
    if (relativeToBase === '') {
      return true;
    }
    const escapesBase =
      relativeToBase === '..' ||
      relativeToBase.startsWith(`..${path.sep}`) ||
      // On Windows a path on another drive comes back as an absolute path.
      path.isAbsolute(relativeToBase);
    return !escapesBase;
  }

  /**
   * Searches for files matching the patterns, reads each one and returns the
   * concatenated text.
   *
   * The `recursive` parameter is not consulted; recursion depends on `**`
   * in the patterns.
   *
   * @param params Search patterns and exclusion options.
   * @returns `llmContent` holds the concatenated file contents (or an
   *   error / "nothing found" message); `returnDisplay` holds a Markdown
   *   summary listing up to 10 processed and up to 5 skipped files.
   */
  async execute(params: ReadManyFilesParams): Promise<ToolResult> {
    const validationError = this.validateParams(params);
    if (validationError) {
      return {
        llmContent: `Error: Invalid parameters for ${this.displayName}. Reason: ${validationError}`,
        returnDisplay: `## Parameter Error\n\n${validationError}`,
      };
    }
    const {
      paths: inputPatterns,
      include = [],
      exclude = [],
      useDefaultExcludes = true,
    } = params;
    const toolBaseDir = this.targetDir;
    const filesToConsider = new Set<string>();
    const skippedFiles: Array<{ path: string; reason: string }> = [];
    const processedFilesRelativePaths: string[] = [];
    let concatenatedContent = '';
    const effectiveExcludes = useDefaultExcludes
      ? [...DEFAULT_EXCLUDES, ...exclude]
      : [...exclude];
    const searchPatterns = [...inputPatterns, ...include];
    if (searchPatterns.length === 0) {
      return {
        llmContent: 'No search paths or include patterns provided.',
        returnDisplay: `## Information\n\nNo search paths or include patterns were specified. Nothing to read or concatenate.`,
      };
    }
    try {
      // Using fast-glob (fg) for file searching based on patterns.
      // `cwd` makes patterns relative to toolBaseDir.
      // `ignore` handles exclusions.
      // `onlyFiles` ensures only files are returned.
      // `dot` allows matching dotfiles (which can still be excluded by patterns).
      // `absolute` returns absolute paths for consistent handling.
      const entries = await fg(searchPatterns, {
        cwd: toolBaseDir,
        ignore: effectiveExcludes,
        onlyFiles: true,
        dot: true,
        absolute: true,
        caseSensitiveMatch: false,
      });
      for (const absoluteFilePath of entries) {
        // Security check: `cwd` only anchors relative patterns; a pattern
        // containing `..` segments may still resolve outside the target
        // directory, so every match is verified explicitly.
        if (!this.isWithinTargetDir(absoluteFilePath)) {
          skippedFiles.push({
            path: absoluteFilePath,
            reason: `Security: Glob library returned path outside target directory. Base: ${toolBaseDir}, Path: ${absoluteFilePath}`,
          });
          continue;
        }
        filesToConsider.add(absoluteFilePath);
      }
    } catch (error) {
      return {
        llmContent: `Error during file search: ${getErrorMessage(error)}`,
        returnDisplay: `## File Search Error\n\nAn error occurred while searching for files:\n\`\`\`\n${getErrorMessage(error)}\n\`\`\``,
      };
    }
    // Sort so the output order is deterministic.
    const sortedFiles = Array.from(filesToConsider).sort();
    for (const filePath of sortedFiles) {
      // Always report paths relative to the base, with forward slashes.
      const relativePathForDisplay = path
        .relative(toolBaseDir, filePath)
        .replace(/\\/g, '/');
      try {
        const contentBuffer = await fs.readFile(filePath);
        // Basic binary detection: check for null bytes in the first 1KB
        const sample = contentBuffer.subarray(
          0,
          Math.min(contentBuffer.length, 1024),
        );
        if (sample.includes(0)) {
          skippedFiles.push({
            path: relativePathForDisplay,
            reason: 'Skipped (appears to be binary)',
          });
          continue;
        }
        const fileContent = contentBuffer.toString(DEFAULT_ENCODING);
        const separator = DEFAULT_OUTPUT_SEPARATOR_FORMAT.replace(
          '{filePath}',
          relativePathForDisplay,
        );
        concatenatedContent += `${separator}\n\n${fileContent}\n\n`;
        processedFilesRelativePaths.push(relativePathForDisplay);
      } catch (error) {
        // A single unreadable file must not abort the whole batch.
        skippedFiles.push({
          path: relativePathForDisplay,
          reason: `Read error: ${getErrorMessage(error)}`,
        });
      }
    }
    let displayMessage = `### Read Many Files Result (Target Dir: \`${this.targetDir}\`)\n\n`;
    if (processedFilesRelativePaths.length > 0) {
      displayMessage += `Successfully read and concatenated content from **${processedFilesRelativePaths.length} file(s)**.\n`;
      displayMessage += `\n**Processed Files (up to 10 shown):**\n`;
      for (const processedPath of processedFilesRelativePaths.slice(0, 10)) {
        displayMessage += `- \`${processedPath}\`\n`;
      }
      if (processedFilesRelativePaths.length > 10) {
        displayMessage += `- ...and ${processedFilesRelativePaths.length - 10} more.\n`;
      }
    } else {
      displayMessage += `No files were read and concatenated based on the criteria.\n`;
    }
    if (skippedFiles.length > 0) {
      displayMessage += `\n**Skipped ${skippedFiles.length} item(s) (up to 5 shown):**\n`;
      for (const skipped of skippedFiles.slice(0, 5)) {
        displayMessage += `- \`${skipped.path}\` (Reason: ${skipped.reason})\n`;
      }
      if (skippedFiles.length > 5) {
        displayMessage += `- ...and ${skippedFiles.length - 5} more.\n`;
      }
    }
    if (
      concatenatedContent.length === 0 &&
      processedFilesRelativePaths.length === 0
    ) {
      concatenatedContent =
        'No files matching the criteria were found or all were skipped.';
    }
    return {
      llmContent: concatenatedContent,
      returnDisplay: displayMessage,
    };
  }
}