Add concatenation tool (#130)

* Adding a tool inspired by files-to-prompt that will recursively read through all the files in a directory (guarded by targetDir) and concatenate those files for the model. Ignores common build artifacts and non-text files. * Migrated glob logic to fast-glob. Buffed the tool description to give more guidance to the model. Incorporated review feedback. * Lint and error checking.
2025-04-23 17:25:47 -07:00 · 2025-04-23 17:25:47 -07:00 · cf92ffab34
parent d771dcbdb9
commit cf92ffab34
2 changed files with 388 additions and 0 deletions
--- a/packages/server/src/config/config.ts
+++ b/packages/server/src/config/config.ts
@ -17,6 +17,7 @@ import { EditTool } from '../tools/edit.js';
 import { TerminalTool } from '../tools/terminal.js';
 import { WriteFileTool } from '../tools/write-file.js';
 import { WebFetchTool } from '../tools/web-fetch.js';
+import { ReadManyFilesTool } from '../tools/read-many-files.js';

 const DEFAULT_PASSTHROUGH_COMMANDS = ['ls', 'git', 'npm'];

@ -130,6 +131,7 @@ function createToolRegistry(config: Config): ToolRegistry {
    new TerminalTool(targetDir, config),
    new WriteFileTool(targetDir),
    new WebFetchTool(), // Note: WebFetchTool takes no arguments
+    new ReadManyFilesTool(targetDir),
  ];
  for (const tool of tools) {
    registry.registerTool(tool);
--- a/packages/server/src/tools/read-many-files.ts
+++ b/packages/server/src/tools/read-many-files.ts
@ -0,0 +1,386 @@
+/**
+ * @license
+ * Copyright 2025 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { BaseTool, ToolResult } from './tools.js';
+import { SchemaValidator } from '../utils/schemaValidator.js';
+import { getErrorMessage } from '../utils/errors.js';
+import * as fs from 'fs/promises';
+import * as path from 'path';
+import fg from 'fast-glob';
+
+/**
+ * Parameters accepted by the ReadManyFilesTool.
+ */
+export interface ReadManyFilesParams {
+  /**
+   * Required. File paths, directory paths, or glob patterns to search.
+   * They are handed to the glob search with the tool's target directory as
+   * the working directory, so they are interpreted relative to it.
+   */
+  paths: string[];
+
+  /**
+   * Optional. Additional glob patterns for files to include.
+   * These are appended to `paths` to form the final list of search patterns.
+   * Example: ["*.ts", "src/** /*.md"]
+   * (The space inside the second example is deliberate: writing the two stars
+   * directly before the slash would terminate this block comment.)
+   */
+  include?: string[];
+
+  /**
+   * Optional. Glob patterns for files/directories to exclude.
+   * Passed to the glob search as ignore patterns, after the default excludes
+   * when those are enabled.
+   * Example: ["*.log", "dist/**"]
+   */
+  exclude?: string[];
+
+  /**
+   * Optional. Intended to toggle recursive directory search.
+   * NOTE: the tool's `execute` method in this file does not read this field;
+   * recursion is determined solely by whether a pattern contains `**`.
+   */
+  recursive?: boolean;
+
+  /**
+   * Optional. Whether to apply the default exclusion patterns.
+   * Treated as true when omitted.
+   */
+  useDefaultExcludes?: boolean;
+}
+
+/**
+ * Default exclusion patterns for commonly ignored directories and binary file types.
+ * These are compatible with glob ignore patterns.
+ * The order is significant for display only: the tool's description shows the
+ * first two effective exclude patterns as a sample.
+ * TODO(adh): Consider making this configurable or extendable through a command line argument.
+ * TODO(adh): Look into sharing this list with the glob tool.
+ */
+const DEFAULT_EXCLUDES: string[] = [
+  // Dependency, VCS, editor, and build-output directories.
+  '**/node_modules/**',
+  '**/.git/**',
+  '**/.vscode/**',
+  '**/.idea/**',
+  '**/dist/**',
+  '**/build/**',
+  '**/coverage/**',
+  '**/__pycache__/**',
+  // Compiled code and native binaries.
+  '**/*.pyc',
+  '**/*.pyo',
+  '**/*.bin',
+  '**/*.exe',
+  '**/*.dll',
+  '**/*.so',
+  '**/*.dylib',
+  '**/*.class',
+  // Archives and compressed files.
+  '**/*.jar',
+  '**/*.war',
+  '**/*.zip',
+  '**/*.tar',
+  '**/*.gz',
+  '**/*.bz2',
+  '**/*.rar',
+  '**/*.7z',
+  // Images.
+  '**/*.png',
+  '**/*.jpg',
+  '**/*.jpeg',
+  '**/*.gif',
+  '**/*.bmp',
+  '**/*.tiff',
+  '**/*.ico',
+  // Office and other binary document formats.
+  '**/*.pdf',
+  '**/*.doc',
+  '**/*.docx',
+  '**/*.xls',
+  '**/*.xlsx',
+  '**/*.ppt',
+  '**/*.pptx',
+  '**/*.odt',
+  '**/*.ods',
+  '**/*.odp',
+  // OS metadata and environment files (the latter may hold secrets).
+  '**/*.DS_Store',
+  '**/.env',
+];
+
+// Encoding used to decode every file's bytes into text.
+const DEFAULT_ENCODING: BufferEncoding = 'utf-8';
+// Template for the header line emitted before each file's content;
+// the `{filePath}` placeholder is substituted with the file's relative path.
+const DEFAULT_OUTPUT_SEPARATOR_FORMAT: string = '--- {filePath} ---';
+
+/**
+ * Tool that finds text files under a fixed target directory via glob patterns
+ * and returns their contents concatenated into a single string, each file
+ * preceded by a `--- {filePath} ---` separator line.
+ * It reads from the local file system, so it must run in an environment with
+ * file-system access (e.g., a Node.js backend).
+ */
+export class ReadManyFilesTool extends BaseTool<
+  ReadManyFilesParams,
+  ToolResult
+> {
+  static readonly Name: string = 'readManyFiles';
+  /** Absolute, resolved root directory that every returned file must live under. */
+  readonly targetDir: string;
+
+  /**
+   * Creates an instance of ReadManyFilesTool.
+   * @param targetDir The root directory within which this tool is allowed to operate.
+   * It is resolved to an absolute path, and all patterns in `params` are
+   * evaluated relative to it.
+   */
+  constructor(targetDir: string) {
+    const parameterSchema: Record<string, unknown> = {
+      type: 'object',
+      properties: {
+        paths: {
+          type: 'array',
+          items: { type: 'string' },
+          description:
+            "Required. An array of glob patterns or paths relative to the tool's target directory. Examples: ['src/**/*.ts'], ['README.md', 'docs/']",
+        },
+        include: {
+          type: 'array',
+          items: { type: 'string' },
+          description:
+            'Optional. Additional glob patterns to include. These are merged with `paths`. Example: ["*.test.ts"] to specifically add test files if they were broadly excluded.',
+          default: [],
+        },
+        exclude: {
+          type: 'array',
+          items: { type: 'string' },
+          description:
+            'Optional. Glob patterns for files/directories to exclude. Added to default excludes if useDefaultExcludes is true. Example: ["**/*.log", "temp/"]',
+          default: [],
+        },
+        recursive: {
+          type: 'boolean',
+          description:
+            'Optional. Whether to search recursively (primarily controlled by `**` in glob patterns). Defaults to true.',
+          default: true,
+        },
+        useDefaultExcludes: {
+          type: 'boolean',
+          description:
+            'Optional. Whether to apply a list of default exclusion patterns (e.g., node_modules, .git, binary files). Defaults to true.',
+          default: true,
+        },
+      },
+      required: ['paths'],
+    };
+
+    super(
+      ReadManyFilesTool.Name,
+      'Read Many Files',
+      `Reads content from multiple text files specified by paths or glob patterns within a configured target directory and concatenates them into a single string.
+This tool is useful when you need to understand or analyze a collection of files, such as:
+- Getting an overview of a codebase or parts of it (e.g., all TypeScript files in the 'src' directory).
+- Finding where specific functionality is implemented if the user asks broad questions about code.
+- Reviewing documentation files (e.g., all Markdown files in the 'docs' directory).
+- Gathering context from multiple configuration files.
+- When the user asks to "read all files in X directory" or "show me the content of all Y files".
+
+Use this tool when the user's query implies needing the content of several files simultaneously for context, analysis, or summarization.
+It uses default UTF-8 encoding and a '--- {filePath} ---' separator between file contents.
+Ensure paths are relative to the target directory. Glob patterns like 'src/**/*.js' are supported.
+Avoid using for single files if a more specific single-file reading tool is available, unless the user specifically requests to process a list containing just one file via this tool.
+This tool should NOT be used for binary files; it attempts to skip them.
+Default excludes apply to common non-text files and large dependency directories unless 'useDefaultExcludes' is false.`,
+      parameterSchema,
+    );
+    this.targetDir = path.resolve(targetDir);
+  }
+
+  /**
+   * Validates the parameters against the tool schema and then re-checks the
+   * element types of `paths`, `include`, and `exclude` by hand.
+   * @param params The parameters to validate.
+   * @returns A human-readable error message, or `null` when the parameters are valid.
+   */
+  validateParams(params: ReadManyFilesParams): string | null {
+    if (
+      this.schema.parameters &&
+      !SchemaValidator.validate(
+        this.schema.parameters as Record<string, unknown>,
+        params,
+      )
+    ) {
+      // Give a more specific message for the most likely mistake.
+      if (
+        !params.paths ||
+        !Array.isArray(params.paths) ||
+        params.paths.length === 0
+      ) {
+        return 'The "paths" parameter is required and must be a non-empty array of strings/glob patterns.';
+      }
+      return 'Parameters failed schema validation. Ensure "paths" is a non-empty array and other parameters match their expected types.';
+    }
+    for (const p of params.paths) {
+      if (typeof p !== 'string' || p.trim() === '') {
+        return 'Each item in "paths" must be a non-empty string/glob pattern.';
+      }
+    }
+    if (
+      params.include &&
+      (!Array.isArray(params.include) ||
+        !params.include.every((item) => typeof item === 'string'))
+    ) {
+      return 'If provided, "include" must be an array of strings/glob patterns.';
+    }
+    if (
+      params.exclude &&
+      (!Array.isArray(params.exclude) ||
+        !params.exclude.every((item) => typeof item === 'string'))
+    ) {
+      return 'If provided, "exclude" must be an array of strings/glob patterns.';
+    }
+    return null;
+  }
+
+  /**
+   * Builds a one-line summary of what a call with `params` would do: the
+   * search patterns, a sample (first two) of the effective exclude patterns,
+   * the encoding, and an example separator line.
+   * @param params The parameters the tool would be executed with.
+   * @returns The summary text.
+   */
+  getDescription(params: ReadManyFilesParams): string {
+    const allPatterns = [...params.paths, ...(params.include || [])];
+    const pathDesc = `using patterns: \`${allPatterns.join('`, `')}\` (within target directory: \`${this.targetDir}\`)`;
+
+    let effectiveExcludes =
+      params.useDefaultExcludes !== false ? [...DEFAULT_EXCLUDES] : [];
+    if (params.exclude && params.exclude.length > 0) {
+      effectiveExcludes = [...effectiveExcludes, ...params.exclude];
+    }
+    const excludeDesc = `Excluding: ${effectiveExcludes.length > 0 ? `patterns like \`${effectiveExcludes.slice(0, 2).join('`, `')}${effectiveExcludes.length > 2 ? '...`' : '`'}` : 'none explicitly (beyond default non-text file avoidance).'}`;
+
+    return `Will attempt to read and concatenate files ${pathDesc}. ${excludeDesc}. File encoding: ${DEFAULT_ENCODING}. Separator: "${DEFAULT_OUTPUT_SEPARATOR_FORMAT.replace('{filePath}', 'path/to/file.ext')}".`;
+  }
+
+  /**
+   * Reports whether `candidatePath` lies strictly inside `this.targetDir`.
+   *
+   * The comparison is made on path segments via `path.relative` rather than
+   * on raw string prefixes, so a sibling such as `<targetDir>-other/file` is
+   * rejected, and differences in separator style between the candidate and
+   * `targetDir` do not matter once the candidate has been resolved.
+   * @param candidatePath Absolute path to test.
+   * @returns `true` only when the path is below the target directory.
+   */
+  private isWithinTargetDir(candidatePath: string): boolean {
+    const relative = path.relative(
+      this.targetDir,
+      path.resolve(candidatePath),
+    );
+    return (
+      relative !== '' &&
+      relative !== '..' &&
+      !relative.startsWith(`..${path.sep}`) &&
+      // On Windows `path.relative` yields an absolute path when the two
+      // paths are on different drives.
+      !path.isAbsolute(relative)
+    );
+  }
+
+  /**
+   * Searches for files matching `params`, reads each one, skips those that
+   * look binary or cannot be read, and concatenates the rest.
+   *
+   * Validation failures, glob-search failures, and per-file read failures are
+   * all reported through the returned result instead of being thrown.
+   * @param params Search patterns and exclusion settings.
+   * @returns `llmContent` holding the concatenated file contents (or an
+   * explanatory message), and `returnDisplay` holding a Markdown summary of
+   * processed and skipped files.
+   */
+  async execute(params: ReadManyFilesParams): Promise<ToolResult> {
+    const validationError = this.validateParams(params);
+    if (validationError) {
+      return {
+        llmContent: `Error: Invalid parameters for ${this.displayName}. Reason: ${validationError}`,
+        returnDisplay: `## Parameter Error\n\n${validationError}`,
+      };
+    }
+
+    const {
+      paths: inputPatterns,
+      include = [],
+      exclude = [],
+      useDefaultExcludes = true,
+    } = params;
+
+    const toolBaseDir = this.targetDir;
+
+    const filesToConsider = new Set<string>();
+    const skippedFiles: { path: string; reason: string }[] = [];
+    const processedFilesRelativePaths: string[] = [];
+    let concatenatedContent = '';
+
+    const effectiveExcludes = useDefaultExcludes
+      ? [...DEFAULT_EXCLUDES, ...exclude]
+      : [...exclude];
+
+    const searchPatterns = [...inputPatterns, ...include];
+    if (searchPatterns.length === 0) {
+      return {
+        llmContent: 'No search paths or include patterns provided.',
+        returnDisplay: `## Information\n\nNo search paths or include patterns were specified. Nothing to read or concatenate.`,
+      };
+    }
+
+    try {
+      // Using fast-glob (fg) for file searching based on patterns.
+      // The `cwd` option makes patterns relative to toolBaseDir.
+      // `ignore` handles exclusions.
+      // `onlyFiles` ensures only files are returned.
+      // `dot` allows matching dotfiles (which can still be excluded by patterns).
+      // `absolute` returns absolute paths for consistent handling.
+      const entries = await fg(searchPatterns, {
+        cwd: toolBaseDir,
+        ignore: effectiveExcludes,
+        onlyFiles: true,
+        dot: true,
+        absolute: true,
+        caseSensitiveMatch: false,
+      });
+
+      for (const entry of entries) {
+        // Normalize to the platform's native form so the path is comparable
+        // with toolBaseDir, which was produced by `path.resolve`.
+        const absoluteFilePath = path.resolve(entry);
+        // Security check: a pattern containing `..` can make the glob search
+        // leave `cwd`, so every hit is verified to be inside the target
+        // directory before it is read.
+        if (!this.isWithinTargetDir(absoluteFilePath)) {
+          skippedFiles.push({
+            path: absoluteFilePath,
+            reason: `Security: Glob library returned path outside target directory. Base: ${toolBaseDir}, Path: ${absoluteFilePath}`,
+          });
+          continue;
+        }
+        filesToConsider.add(absoluteFilePath);
+      }
+    } catch (error) {
+      return {
+        llmContent: `Error during file search: ${getErrorMessage(error)}`,
+        returnDisplay: `## File Search Error\n\nAn error occurred while searching for files:\n\`\`\`\n${getErrorMessage(error)}\n\`\`\``,
+      };
+    }
+
+    // Sort so the output order is deterministic regardless of glob order.
+    const sortedFiles = Array.from(filesToConsider).sort();
+
+    for (const filePath of sortedFiles) {
+      // Always show forward slashes, whatever the platform separator is.
+      const relativePathForDisplay = path
+        .relative(toolBaseDir, filePath)
+        .replace(/\\/g, '/');
+      try {
+        const contentBuffer = await fs.readFile(filePath);
+        // Basic binary detection: check for null bytes in the first 1KB
+        const sample = contentBuffer.subarray(
+          0,
+          Math.min(contentBuffer.length, 1024),
+        );
+        if (sample.includes(0)) {
+          skippedFiles.push({
+            path: relativePathForDisplay,
+            reason: 'Skipped (appears to be binary)',
+          });
+          continue;
+        }
+        const fileContent = contentBuffer.toString(DEFAULT_ENCODING);
+        // A replacer function is used so that `$` sequences in a file name
+        // (e.g. `$&`) are inserted literally instead of being interpreted as
+        // special replacement patterns.
+        const separator = DEFAULT_OUTPUT_SEPARATOR_FORMAT.replace(
+          '{filePath}',
+          () => relativePathForDisplay,
+        );
+        concatenatedContent += `${separator}\n\n${fileContent}\n\n`;
+        processedFilesRelativePaths.push(relativePathForDisplay);
+      } catch (error) {
+        // A single unreadable file must not abort the whole run.
+        skippedFiles.push({
+          path: relativePathForDisplay,
+          reason: `Read error: ${getErrorMessage(error)}`,
+        });
+      }
+    }
+
+    let displayMessage = `### Read Many Files Result (Target Dir: \`${this.targetDir}\`)\n\n`;
+    if (processedFilesRelativePaths.length > 0) {
+      displayMessage += `Successfully read and concatenated content from **${processedFilesRelativePaths.length} file(s)**.\n`;
+      displayMessage += `\n**Processed Files (up to 10 shown):**\n`;
+      for (const p of processedFilesRelativePaths.slice(0, 10)) {
+        displayMessage += `- \`${p}\`\n`;
+      }
+      if (processedFilesRelativePaths.length > 10) {
+        displayMessage += `- ...and ${processedFilesRelativePaths.length - 10} more.\n`;
+      }
+    } else {
+      displayMessage += `No files were read and concatenated based on the criteria.\n`;
+    }
+
+    if (skippedFiles.length > 0) {
+      displayMessage += `\n**Skipped ${skippedFiles.length} item(s) (up to 5 shown):**\n`;
+      for (const f of skippedFiles.slice(0, 5)) {
+        displayMessage += `- \`${f.path}\` (Reason: ${f.reason})\n`;
+      }
+      if (skippedFiles.length > 5) {
+        displayMessage += `- ...and ${skippedFiles.length - 5} more.\n`;
+      }
+    }
+    if (
+      concatenatedContent.length === 0 &&
+      processedFilesRelativePaths.length === 0
+    ) {
+      concatenatedContent =
+        'No files matching the criteria were found or all were skipped.';
+    }
+
+    return {
+      llmContent: concatenatedContent,
+      returnDisplay: displayMessage,
+    };
+  }
+}