perf(flattener): improve memory efficiency by streaming xml output

- Replace in-memory XML generation with streaming approach
- Add comprehensive common ignore patterns list
- Update statistics calculation to use file size instead of content length
2025-07-22 12:12:47 -05:00 · 2025-07-22 12:12:47 -05:00 · 855960318c
parent 3d27ebdf18
commit 855960318c
2 changed files with 217 additions and 91 deletions
--- a/bmad-core/working-in-the-brownfield.md
+++ b/bmad-core/working-in-the-brownfield.md
@@ -210,10 +210,12 @@ The PO ensures:
   docs/brownfield-architecture.md
 2. Shard your docs:
   In your IDE
   ```bash
   @po
   shard docs/brownfield-prd.md
   ```
   ```bash
   @po
   shard docs/brownfield-architecture.md
--- a/tools/flattener/main.js
+++ b/tools/flattener/main.js
@@ -16,13 +16,119 @@ async function discoverFiles(rootDir) {
    const gitignorePath = path.join(rootDir, '.gitignore');
    const gitignorePatterns = await parseGitignore(gitignorePath);
-    const combinedIgnores = [
+    // Common gitignore patterns that should always be ignored
-    ...gitignorePatterns,
+    const commonIgnorePatterns = [
      // Version control
      '.git/**',
      '.svn/**',
      '.hg/**',
      '.bzr/**',
      // Dependencies
      'node_modules/**',
      'bower_components/**',
      'vendor/**',
      'packages/**',
      // Build outputs
      'build/**',
      'dist/**',
      'out/**',
      'target/**',
      'bin/**',
      'obj/**',
      'release/**',
      'debug/**',
      // Environment and config
      '.env',
      '.env.*',
      '*.env',
      '.config',
      // Logs
      'logs/**',
      '*.log',
      'npm-debug.log*',
      'yarn-debug.log*',
      'yarn-error.log*',
      'lerna-debug.log*',
      // Coverage and testing
      'coverage/**',
      '.nyc_output/**',
      '.coverage/**',
      'test-results/**',
      'junit.xml',
      // Cache directories
      '.cache/**',
      '.tmp/**',
      '.temp/**',
      'tmp/**',
      'temp/**',
      '.sass-cache/**',
      '.eslintcache',
      '.stylelintcache',
      // OS generated files
      '.DS_Store',
      '.DS_Store?',
      '._*',
      '.Spotlight-V100',
      '.Trashes',
      'ehthumbs.db',
      'Thumbs.db',
      'desktop.ini',
      // IDE and editor files
      '.vscode/**',
      '.idea/**',
      '*.swp',
      '*.swo',
      '*~',
      '.project',
      '.classpath',
      '.settings/**',
      '*.sublime-project',
      '*.sublime-workspace',
      // Package manager files
      'package-lock.json',
      'yarn.lock',
      'pnpm-lock.yaml',
      'composer.lock',
      'Pipfile.lock',
      // Runtime and compiled files
      '*.pyc',
      '*.pyo',
      '*.pyd',
      '__pycache__/**',
      '*.class',
      '*.jar',
      '*.war',
      '*.ear',
      '*.o',
      '*.so',
      '*.dll',
      '*.exe',
      // Documentation build
      '_site/**',
      '.jekyll-cache/**',
      '.jekyll-metadata',
      // Flattener specific outputs
      'flattened-codebase.xml',
      'repomix-output.xml'
    ];
    const combinedIgnores = [
      ...gitignorePatterns,
      ...commonIgnorePatterns
    ];
    // Use glob to recursively find all files, excluding common ignore patterns
    const files = await glob('**/*', {
      cwd: rootDir,
@@ -180,49 +286,67 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
 }
 /**
- * Generate XML output with aggregated file contents
+ * Generate XML output with aggregated file contents using streaming
 * @param {Object} aggregatedContent - The aggregated content object
- * @param {string} projectRoot - The project root directory
+ * @param {string} outputPath - The output file path
- * @returns {string} XML content
+ * @returns {Promise<void>} Promise that resolves when writing is complete
 */
-function generateXMLOutput(aggregatedContent) {
+async function generateXMLOutput(aggregatedContent, outputPath) {
  const { textFiles } = aggregatedContent;
-  let xml = `<?xml version="1.0" encoding="UTF-8"?>
+  // Create write stream for efficient memory usage
-`;
+  const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
  xml += `<files>
 `;
-  // Add text files with content (only text files as per story requirements)
+  return new Promise((resolve, reject) => {
-  for (const file of textFiles) {
+    writeStream.on('error', reject);
-    xml += `  <file path="${escapeXml(file.path)}">`;
+    writeStream.on('finish', resolve);
    // Write XML header
    writeStream.write('<?xml version="1.0" encoding="UTF-8"?>\n');
    writeStream.write('<files>\n');
    // Process files one by one to minimize memory usage
    let fileIndex = 0;
    const writeNextFile = () => {
      if (fileIndex >= textFiles.length) {
        // All files processed, close XML and stream
        writeStream.write('</files>\n');
        writeStream.end();
        return;
      }
      const file = textFiles[fileIndex];
      fileIndex++;
      // Write file opening tag
      writeStream.write(`  <file path="${escapeXml(file.path)}">`);
      // Use CDATA for code content, handling CDATA end sequences properly
      if (file.content?.trim()) {
        const indentedContent = indentFileContent(file.content);
        if (file.content.includes(']]>')) {
          // If content contains ]]>, split it and wrap each part in CDATA
-        xml += splitAndWrapCDATA(indentedContent);
+          writeStream.write(splitAndWrapCDATA(indentedContent));
        } else {
-        xml += `<![CDATA[
+          writeStream.write(`<![CDATA[\n${indentedContent}\n    ]]>`);
 ${indentedContent}
    ]]>`;
        }
      } else if (file.content) {
        // Handle empty or whitespace-only content
        const indentedContent = indentFileContent(file.content);
-      xml += `<![CDATA[
+        writeStream.write(`<![CDATA[\n${indentedContent}\n    ]]>`);
 ${indentedContent}
    ]]>`;
      }
-    xml += `</file>
+      // Write file closing tag
-`;
+      writeStream.write('</file>\n');
  }
-  xml += `</files>
+      // Continue with next file on next tick to avoid stack overflow
-`;
+      setImmediate(writeNextFile);
-  return xml;
+    };
    // Start processing files
    writeNextFile();
  });
 }
 /**
@@ -276,10 +400,10 @@ ${escapedContent}
 /**
 * Calculate statistics for the processed files
 * @param {Object} aggregatedContent - The aggregated content object
- * @param {string} xmlContent - The generated XML content
+ * @param {number} xmlFileSize - The size of the generated XML file in bytes
 * @returns {Object} Statistics object
 */
-function calculateStatistics(aggregatedContent, xmlContent) {
+function calculateStatistics(aggregatedContent, xmlFileSize) {
  const { textFiles, binaryFiles, errors } = aggregatedContent;
  // Calculate total file size in bytes
@@ -291,7 +415,7 @@ function calculateStatistics(aggregatedContent, xmlContent) {
  const totalLines = textFiles.reduce((sum, file) => sum + file.lines, 0);
  // Estimate token count (rough approximation: 1 token ≈ 4 characters)
-  const estimatedTokens = Math.ceil(xmlContent.length / 4);
+  const estimatedTokens = Math.ceil(xmlFileSize / 4);
  // Format file size
  const formatSize = (bytes) => {
@@ -306,7 +430,7 @@ function calculateStatistics(aggregatedContent, xmlContent) {
    binaryFiles: binaryFiles.length,
    errorFiles: errors.length,
    totalSize: formatSize(totalSize),
-    xmlSize: formatSize(xmlContent.length),
+    xmlSize: formatSize(xmlFileSize),
    totalLines,
    estimatedTokens: estimatedTokens.toLocaleString()
  };
@@ -402,14 +526,14 @@ program
        console.log(`Binary files: ${aggregatedContent.binaryFiles.length}`);
      }
-      // Generate XML output
+      // Generate XML output using streaming
      const xmlSpinner = ora('🔧 Generating XML output...').start();
-      const xmlOutput = generateXMLOutput(aggregatedContent);
+      await generateXMLOutput(aggregatedContent, options.output);
      await fs.writeFile(options.output, xmlOutput);
      xmlSpinner.succeed('📝 XML generation completed');
      // Calculate and display statistics
-      const stats = calculateStatistics(aggregatedContent, xmlOutput);
+      const outputStats = await fs.stat(options.output);
      const stats = calculateStatistics(aggregatedContent, outputStats.size);
      // Display completion summary
      console.log('\n📊 Completion Summary:');