perf(flattener): improve memory efficiency by streaming xml output

- Replace in-memory XML generation with streaming approach - Add comprehensive common ignore patterns list - Update statistics calculation to use file size instead of content length
2025-07-22 12:12:47 -05:00 · 2025-07-22 12:12:47 -05:00 · 855960318c
parent 3d27ebdf18
commit 855960318c
2 changed files with 217 additions and 91 deletions
--- a/bmad-core/working-in-the-brownfield.md
+++ b/bmad-core/working-in-the-brownfield.md
@ -210,10 +210,12 @@ The PO ensures:
   docs/brownfield-architecture.md
 2. Shard your docs:
   In your IDE
+
   ```bash
   @po
   shard docs/brownfield-prd.md
   ```
+
   ```bash
   @po
   shard docs/brownfield-architecture.md
--- a/tools/flattener/main.js
+++ b/tools/flattener/main.js
@ -16,12 +16,118 @@ async function discoverFiles(rootDir) {
    const gitignorePath = path.join(rootDir, '.gitignore');
    const gitignorePatterns = await parseGitignore(gitignorePath);

+    // Common gitignore patterns that should always be ignored
+    const commonIgnorePatterns = [
+      // Version control
+      '.git/**',
+      '.svn/**',
+      '.hg/**',
+      '.bzr/**',
+
+      // Dependencies
+      'node_modules/**',
+      'bower_components/**',
+      'vendor/**',
+      'packages/**',
+
+      // Build outputs
+      'build/**',
+      'dist/**',
+      'out/**',
+      'target/**',
+      'bin/**',
+      'obj/**',
+      'release/**',
+      'debug/**',
+
+      // Environment and config
+      '.env',
+      '.env.*',
+      '*.env',
+      '.config',
+
+      // Logs
+      'logs/**',
+      '*.log',
+      'npm-debug.log*',
+      'yarn-debug.log*',
+      'yarn-error.log*',
+      'lerna-debug.log*',
+
+      // Coverage and testing
+      'coverage/**',
+      '.nyc_output/**',
+      '.coverage/**',
+      'test-results/**',
+      'junit.xml',
+
+      // Cache directories
+      '.cache/**',
+      '.tmp/**',
+      '.temp/**',
+      'tmp/**',
+      'temp/**',
+      '.sass-cache/**',
+      '.eslintcache',
+      '.stylelintcache',
+
+      // OS generated files
+      '.DS_Store',
+      '.DS_Store?',
+      '._*',
+      '.Spotlight-V100',
+      '.Trashes',
+      'ehthumbs.db',
+      'Thumbs.db',
+      'desktop.ini',
+
+      // IDE and editor files
+      '.vscode/**',
+      '.idea/**',
+      '*.swp',
+      '*.swo',
+      '*~',
+      '.project',
+      '.classpath',
+      '.settings/**',
+      '*.sublime-project',
+      '*.sublime-workspace',
+
+      // Package manager files
+      'package-lock.json',
+      'yarn.lock',
+      'pnpm-lock.yaml',
+      'composer.lock',
+      'Pipfile.lock',
+
+      // Runtime and compiled files
+      '*.pyc',
+      '*.pyo',
+      '*.pyd',
+      '__pycache__/**',
+      '*.class',
+      '*.jar',
+      '*.war',
+      '*.ear',
+      '*.o',
+      '*.so',
+      '*.dll',
+      '*.exe',
+
+      // Documentation build
+      '_site/**',
+      '.jekyll-cache/**',
+      '.jekyll-metadata',
+
+      // Flattener specific outputs
+      'flattened-codebase.xml',
+      'repomix-output.xml'
+    ];
+
    const combinedIgnores = [
-    ...gitignorePatterns,
-    '.git/**',
-    'flattened-codebase.xml',
-    'repomix-output.xml'
-  ];
+      ...gitignorePatterns,
+      ...commonIgnorePatterns
+    ];

    // Use glob to recursively find all files, excluding common ignore patterns
    const files = await glob('**/*', {
@ -31,7 +137,7 @@ async function discoverFiles(rootDir) {
      follow: false, // Don't follow symbolic links
      ignore: combinedIgnores
    });
-    
+
    return files.map(file => path.resolve(rootDir, file));
  } catch (error) {
    console.error('Error discovering files:', error.message);
@ -49,7 +155,7 @@ async function parseGitignore(gitignorePath) {
    if (!await fs.pathExists(gitignorePath)) {
      return [];
    }
-    
+
    const content = await fs.readFile(gitignorePath, 'utf8');
    return content
      .split('\n')
@ -85,18 +191,18 @@ async function isBinaryFile(filePath) {
      '.ttf', '.otf', '.woff', '.woff2',
      '.bin', '.dat', '.db', '.sqlite'
    ];
-    
+
    const ext = path.extname(filePath).toLowerCase();
    if (binaryExtensions.includes(ext)) {
      return true;
    }
-    
+
    // For files without clear extensions, try to read a small sample
    const stats = await fs.stat(filePath);
    if (stats.size === 0) {
      return false; // Empty files are considered text
    }
-    
+
    // Read first 1024 bytes to check for null bytes
    const sampleSize = Math.min(1024, stats.size);
    const buffer = await fs.readFile(filePath, { encoding: null, flag: 'r' });
@ -124,18 +230,18 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
    totalFiles: files.length,
    processedFiles: 0
  };
-  
+
  for (const filePath of files) {
    try {
      const relativePath = path.relative(rootDir, filePath);
-      
+
      // Update progress indicator
      if (spinner) {
        spinner.text = `Processing file ${results.processedFiles + 1}/${results.totalFiles}: ${relativePath}`;
      }
-      
+
      const isBinary = await isBinaryFile(filePath);
-      
+
      if (isBinary) {
        results.binaryFiles.push({
          path: relativePath,
@ -153,7 +259,7 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
          lines: content.split('\n').length
        });
      }
-      
+
      results.processedFiles++;
    } catch (error) {
      const relativePath = path.relative(rootDir, filePath);
@ -162,67 +268,85 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
        absolutePath: filePath,
        error: error.message
      };
-      
+
      results.errors.push(errorInfo);
-      
+
      // Log warning without interfering with spinner
      if (spinner) {
        spinner.warn(`Warning: Could not read file ${relativePath}: ${error.message}`);
      } else {
        console.warn(`Warning: Could not read file ${relativePath}: ${error.message}`);
      }
-      
+
      results.processedFiles++;
    }
  }
-  
+
  return results;
 }

 /**
- * Generate XML output with aggregated file contents
+ * Generate XML output with aggregated file contents using streaming
 * @param {Object} aggregatedContent - The aggregated content object
- * @param {string} projectRoot - The project root directory
- * @returns {string} XML content
+ * @param {string} outputPath - The output file path
+ * @returns {Promise<void>} Promise that resolves when writing is complete
 */
-function generateXMLOutput(aggregatedContent) {
+async function generateXMLOutput(aggregatedContent, outputPath) {
  const { textFiles } = aggregatedContent;
-  
-  let xml = `<?xml version="1.0" encoding="UTF-8"?>
-`;
-  xml += `<files>
-`;
-  
-  // Add text files with content (only text files as per story requirements)
-  for (const file of textFiles) {
-    xml += `  <file path="${escapeXml(file.path)}">`;
-    
-    // Use CDATA for code content, handling CDATA end sequences properly
-    if (file.content?.trim()) {
-      const indentedContent = indentFileContent(file.content);
-      if (file.content.includes(']]>')) {
-        // If content contains ]]>, split it and wrap each part in CDATA
-        xml += splitAndWrapCDATA(indentedContent);
-      } else {
-        xml += `<![CDATA[
-${indentedContent}
-    ]]>`;
+
+  // Create write stream for efficient memory usage
+  const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
+
+  return new Promise((resolve, reject) => {
+    writeStream.on('error', reject);
+    writeStream.on('finish', resolve);
+
+    // Write XML header
+    writeStream.write('<?xml version="1.0" encoding="UTF-8"?>\n');
+    writeStream.write('<files>\n');
+
+    // Process files one by one to minimize memory usage
+    let fileIndex = 0;
+
+    const writeNextFile = () => {
+      if (fileIndex >= textFiles.length) {
+        // All files processed, close XML and stream
+        writeStream.write('</files>\n');
+        writeStream.end();
+        return;
      }
-    } else if (file.content) {
-      // Handle empty or whitespace-only content
-      const indentedContent = indentFileContent(file.content);
-      xml += `<![CDATA[
-${indentedContent}
-    ]]>`;
-    }
-    
-    xml += `</file>
-`;
-  }
-  
-  xml += `</files>
-`;
-  return xml;
+
+      const file = textFiles[fileIndex];
+      fileIndex++;
+
+      // Write file opening tag
+      writeStream.write(`  <file path="${escapeXml(file.path)}">`);
+
+      // Use CDATA for code content, handling CDATA end sequences properly
+      if (file.content?.trim()) {
+        const indentedContent = indentFileContent(file.content);
+        if (file.content.includes(']]>')) {
+          // If content contains ]]>, split it and wrap each part in CDATA
+          writeStream.write(splitAndWrapCDATA(indentedContent));
+        } else {
+          writeStream.write(`<![CDATA[\n${indentedContent}\n    ]]>`);
+        }
+      } else if (file.content) {
+        // Handle empty or whitespace-only content
+        const indentedContent = indentFileContent(file.content);
+        writeStream.write(`<![CDATA[\n${indentedContent}\n    ]]>`);
+      }
+
+      // Write file closing tag
+      writeStream.write('</file>\n');
+
+      // Continue with next file on next tick to avoid stack overflow
+      setImmediate(writeNextFile);
+    };
+
+    // Start processing files
+    writeNextFile();
+  });
 }

 /**
@ -251,7 +375,7 @@ function indentFileContent(content) {
  if (typeof content !== 'string') {
    return String(content);
  }
-  
+
  // Split content into lines and add 4 spaces of indentation to each line
  return content.split('\n').map(line => `    ${line}`).join('\n');
 }
@ -265,7 +389,7 @@ function splitAndWrapCDATA(content) {
  if (typeof content !== 'string') {
    return String(content);
  }
-  
+
  // Replace ]]> with ]]]]><![CDATA[> to escape it within CDATA
  const escapedContent = content.replace(/]]>/g, ']]]]><![CDATA[>');
  return `<![CDATA[
@ -276,37 +400,37 @@ ${escapedContent}
 /**
 * Calculate statistics for the processed files
 * @param {Object} aggregatedContent - The aggregated content object
- * @param {string} xmlContent - The generated XML content
+ * @param {number} xmlFileSize - The size of the generated XML file in bytes
 * @returns {Object} Statistics object
 */
-function calculateStatistics(aggregatedContent, xmlContent) {
+function calculateStatistics(aggregatedContent, xmlFileSize) {
  const { textFiles, binaryFiles, errors } = aggregatedContent;
-  
+
  // Calculate total file size in bytes
  const totalTextSize = textFiles.reduce((sum, file) => sum + file.size, 0);
  const totalBinarySize = binaryFiles.reduce((sum, file) => sum + file.size, 0);
  const totalSize = totalTextSize + totalBinarySize;
-  
+
  // Calculate total lines of code
  const totalLines = textFiles.reduce((sum, file) => sum + file.lines, 0);
-  
+
  // Estimate token count (rough approximation: 1 token ≈ 4 characters)
-  const estimatedTokens = Math.ceil(xmlContent.length / 4);
-  
+  const estimatedTokens = Math.ceil(xmlFileSize / 4);
+
  // Format file size
  const formatSize = (bytes) => {
    if (bytes < 1024) return `${bytes} B`;
    if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
    return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
  };
-  
+
  return {
    totalFiles: textFiles.length + binaryFiles.length,
    textFiles: textFiles.length,
    binaryFiles: binaryFiles.length,
    errorFiles: errors.length,
    totalSize: formatSize(totalSize),
-    xmlSize: formatSize(xmlContent.length),
+    xmlSize: formatSize(xmlFileSize),
    totalLines,
    estimatedTokens: estimatedTokens.toLocaleString()
  };
@ -321,24 +445,24 @@ function calculateStatistics(aggregatedContent, xmlContent) {
 async function filterFiles(files, rootDir) {
  const gitignorePath = path.join(rootDir, '.gitignore');
  const ignorePatterns = await parseGitignore(gitignorePath);
-  
+
  if (ignorePatterns.length === 0) {
    return files;
  }
-  
+
  // Convert absolute paths to relative for pattern matching
  const relativeFiles = files.map(file => path.relative(rootDir, file));
-  
+
  // Separate positive and negative patterns
  const positivePatterns = ignorePatterns.filter(p => !p.startsWith('!'));
  const negativePatterns = ignorePatterns.filter(p => p.startsWith('!')).map(p => p.slice(1));
-  
+
  // Filter out files that match ignore patterns
  const filteredRelative = [];
-  
+
  for (const file of relativeFiles) {
    let shouldIgnore = false;
-    
+
    // First check positive patterns (ignore these files)
    for (const pattern of positivePatterns) {
      if (minimatch(file, pattern)) {
@ -346,7 +470,7 @@ async function filterFiles(files, rootDir) {
        break;
      }
    }
-    
+
    // Then check negative patterns (don't ignore these files even if they match positive patterns)
    if (shouldIgnore) {
      for (const pattern of negativePatterns) {
@ -356,12 +480,12 @@ async function filterFiles(files, rootDir) {
        }
      }
    }
-    
+
    if (!shouldIgnore) {
      filteredRelative.push(file);
    }
  }
-  
+
  // Convert back to absolute paths
  return filteredRelative.map(file => path.resolve(rootDir, file));
 }
@ -375,23 +499,23 @@ program
  .option('-o, --output <path>', 'Output file path', 'flattened-codebase.xml')
  .action(async (options) => {
    console.log(`Flattening codebase to: ${options.output}`);
-    
+
    try {
      // Import ora dynamically
      const { default: ora } = await import('ora');
-      
+
      // Start file discovery with spinner
      const discoverySpinner = ora('🔍 Discovering files...').start();
      const files = await discoverFiles(process.cwd());
      const filteredFiles = await filterFiles(files, process.cwd());
      discoverySpinner.succeed(`📁 Found ${filteredFiles.length} files to include`);
-      
+
      // Process files with progress tracking
      console.log('Reading file contents');
      const processingSpinner = ora('📄 Processing files...').start();
      const aggregatedContent = await aggregateFileContents(filteredFiles, process.cwd(), processingSpinner);
      processingSpinner.succeed(`✅ Processed ${aggregatedContent.processedFiles}/${filteredFiles.length} files`);
-      
+
      // Log processing results for test validation
      console.log(`Processed ${aggregatedContent.processedFiles}/${filteredFiles.length} files`);
      if (aggregatedContent.errors.length > 0) {
@ -401,16 +525,16 @@ program
      if (aggregatedContent.binaryFiles.length > 0) {
        console.log(`Binary files: ${aggregatedContent.binaryFiles.length}`);
      }
-      
-      // Generate XML output
+
+      // Generate XML output using streaming
      const xmlSpinner = ora('🔧 Generating XML output...').start();
-      const xmlOutput = generateXMLOutput(aggregatedContent);
-      await fs.writeFile(options.output, xmlOutput);
+      await generateXMLOutput(aggregatedContent, options.output);
      xmlSpinner.succeed('📝 XML generation completed');
-      
+
      // Calculate and display statistics
-      const stats = calculateStatistics(aggregatedContent, xmlOutput);
-      
+      const outputStats = await fs.stat(options.output);
+      const stats = calculateStatistics(aggregatedContent, outputStats.size);
+
      // Display completion summary
      console.log('\n📊 Completion Summary:');
      console.log(`✅ Successfully processed ${filteredFiles.length} files into ${options.output}`);
@ -420,7 +544,7 @@ program
      console.log(`📝 Total lines of code: ${stats.totalLines.toLocaleString()}`);
      console.log(`🔢 Estimated tokens: ${stats.estimatedTokens}`);
      console.log(`📊 File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`);
-      
+
    } catch (error) {
      console.error('❌ Critical error:', error.message);
      console.error('An unexpected error occurred.');
@ -432,4 +556,4 @@ if (require.main === module) {
  program.parse();
 }

-module.exports = program;
+module.exports = program;