perf(flattener): improve memory efficiency by streaming xml output

- Replace in-memory XML generation with streaming approach
- Add comprehensive common ignore patterns list
- Update statistics calculation to use file size instead of content length
manjaroblack 2025-07-22 12:12:47 -05:00
parent 3d27ebdf18
commit 855960318c
GPG Key ID: 02FD4111DA5560B4
2 changed files with 217 additions and 91 deletions


@@ -210,10 +210,12 @@ The PO ensures:
docs/brownfield-architecture.md
2. Shard your docs:
In your IDE
```bash
@po
shard docs/brownfield-prd.md
```
```bash
@po
shard docs/brownfield-architecture.md


@@ -16,12 +16,118 @@ async function discoverFiles(rootDir) {
     const gitignorePath = path.join(rootDir, '.gitignore');
     const gitignorePatterns = await parseGitignore(gitignorePath);
+    // Common gitignore patterns that should always be ignored
+    const commonIgnorePatterns = [
+      // Version control
+      '.git/**',
+      '.svn/**',
+      '.hg/**',
+      '.bzr/**',
+      // Dependencies
+      'node_modules/**',
+      'bower_components/**',
+      'vendor/**',
+      'packages/**',
+      // Build outputs
+      'build/**',
+      'dist/**',
+      'out/**',
+      'target/**',
+      'bin/**',
+      'obj/**',
+      'release/**',
+      'debug/**',
+      // Environment and config
+      '.env',
+      '.env.*',
+      '*.env',
+      '.config',
+      // Logs
+      'logs/**',
+      '*.log',
+      'npm-debug.log*',
+      'yarn-debug.log*',
+      'yarn-error.log*',
+      'lerna-debug.log*',
+      // Coverage and testing
+      'coverage/**',
+      '.nyc_output/**',
+      '.coverage/**',
+      'test-results/**',
+      'junit.xml',
+      // Cache directories
+      '.cache/**',
+      '.tmp/**',
+      '.temp/**',
+      'tmp/**',
+      'temp/**',
+      '.sass-cache/**',
+      '.eslintcache',
+      '.stylelintcache',
+      // OS generated files
+      '.DS_Store',
+      '.DS_Store?',
+      '._*',
+      '.Spotlight-V100',
+      '.Trashes',
+      'ehthumbs.db',
+      'Thumbs.db',
+      'desktop.ini',
+      // IDE and editor files
+      '.vscode/**',
+      '.idea/**',
+      '*.swp',
+      '*.swo',
+      '*~',
+      '.project',
+      '.classpath',
+      '.settings/**',
+      '*.sublime-project',
+      '*.sublime-workspace',
+      // Package manager files
+      'package-lock.json',
+      'yarn.lock',
+      'pnpm-lock.yaml',
+      'composer.lock',
+      'Pipfile.lock',
+      // Runtime and compiled files
+      '*.pyc',
+      '*.pyo',
+      '*.pyd',
+      '__pycache__/**',
+      '*.class',
+      '*.jar',
+      '*.war',
+      '*.ear',
+      '*.o',
+      '*.so',
+      '*.dll',
+      '*.exe',
+      // Documentation build
+      '_site/**',
+      '.jekyll-cache/**',
+      '.jekyll-metadata',
+      // Flattener specific outputs
+      'flattened-codebase.xml',
+      'repomix-output.xml'
+    ];
     const combinedIgnores = [
       ...gitignorePatterns,
-      '.git/**',
-      'flattened-codebase.xml',
-      'repomix-output.xml'
-    ];
+      ...commonIgnorePatterns
+    ];
     // Use glob to recursively find all files, excluding common ignore patterns
     const files = await glob('**/*', {
@@ -31,7 +137,7 @@ async function discoverFiles(rootDir) {
       follow: false, // Don't follow symbolic links
       ignore: combinedIgnores
     });
     return files.map(file => path.resolve(rootDir, file));
   } catch (error) {
     console.error('Error discovering files:', error.message);
@@ -49,7 +155,7 @@ async function parseGitignore(gitignorePath) {
   if (!await fs.pathExists(gitignorePath)) {
     return [];
   }
   const content = await fs.readFile(gitignorePath, 'utf8');
   return content
     .split('\n')
@@ -85,18 +191,18 @@ async function isBinaryFile(filePath) {
     '.ttf', '.otf', '.woff', '.woff2',
     '.bin', '.dat', '.db', '.sqlite'
   ];
   const ext = path.extname(filePath).toLowerCase();
   if (binaryExtensions.includes(ext)) {
     return true;
   }
   // For files without clear extensions, try to read a small sample
   const stats = await fs.stat(filePath);
   if (stats.size === 0) {
     return false; // Empty files are considered text
   }
   // Read first 1024 bytes to check for null bytes
   const sampleSize = Math.min(1024, stats.size);
   const buffer = await fs.readFile(filePath, { encoding: null, flag: 'r' });
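The hunk ends just before the scan itself. For context, a minimal sketch of how a null-byte check typically completes such a function; the actual lines fall outside this diff, so treat this as an assumption about the surrounding code:

```js
// Hypothetical continuation of isBinaryFile (not part of this diff):
// scan the sampled bytes for a null byte, which rarely occurs in text.
const sample = buffer.slice(0, sampleSize);
for (let i = 0; i < sample.length; i++) {
  if (sample[i] === 0) {
    return true; // null byte found, treat as binary
  }
}
return false; // no null bytes in the sample, treat as text
```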
@@ -124,18 +230,18 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
     totalFiles: files.length,
     processedFiles: 0
   };
   for (const filePath of files) {
     try {
       const relativePath = path.relative(rootDir, filePath);
       // Update progress indicator
       if (spinner) {
         spinner.text = `Processing file ${results.processedFiles + 1}/${results.totalFiles}: ${relativePath}`;
       }
       const isBinary = await isBinaryFile(filePath);
       if (isBinary) {
         results.binaryFiles.push({
           path: relativePath,
@@ -153,7 +259,7 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
           lines: content.split('\n').length
         });
       }
       results.processedFiles++;
     } catch (error) {
       const relativePath = path.relative(rootDir, filePath);
@@ -162,67 +268,85 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
         absolutePath: filePath,
         error: error.message
       };
       results.errors.push(errorInfo);
       // Log warning without interfering with spinner
       if (spinner) {
         spinner.warn(`Warning: Could not read file ${relativePath}: ${error.message}`);
       } else {
         console.warn(`Warning: Could not read file ${relativePath}: ${error.message}`);
       }
       results.processedFiles++;
     }
   }
   return results;
 }
 /**
- * Generate XML output with aggregated file contents
+ * Generate XML output with aggregated file contents using streaming
  * @param {Object} aggregatedContent - The aggregated content object
- * @param {string} projectRoot - The project root directory
- * @returns {string} XML content
+ * @param {string} outputPath - The output file path
+ * @returns {Promise<void>} Promise that resolves when writing is complete
  */
-function generateXMLOutput(aggregatedContent) {
+async function generateXMLOutput(aggregatedContent, outputPath) {
   const { textFiles } = aggregatedContent;
-  let xml = `<?xml version="1.0" encoding="UTF-8"?>
-`;
-  xml += `<files>
-`;
-  // Add text files with content (only text files as per story requirements)
-  for (const file of textFiles) {
-    xml += `  <file path="${escapeXml(file.path)}">`;
-    // Use CDATA for code content, handling CDATA end sequences properly
-    if (file.content?.trim()) {
-      const indentedContent = indentFileContent(file.content);
-      if (file.content.includes(']]>')) {
-        // If content contains ]]>, split it and wrap each part in CDATA
-        xml += splitAndWrapCDATA(indentedContent);
-      } else {
-        xml += `<![CDATA[
-${indentedContent}
-  ]]>`;
-      }
-    } else if (file.content) {
-      // Handle empty or whitespace-only content
-      const indentedContent = indentFileContent(file.content);
-      xml += `<![CDATA[
-${indentedContent}
-  ]]>`;
-    }
-    xml += `</file>
-`;
-  }
-  xml += `</files>
-`;
-  return xml;
+  // Create write stream for efficient memory usage
+  const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
+
+  return new Promise((resolve, reject) => {
+    writeStream.on('error', reject);
+    writeStream.on('finish', resolve);
+
+    // Write XML header
+    writeStream.write('<?xml version="1.0" encoding="UTF-8"?>\n');
+    writeStream.write('<files>\n');
+
+    // Process files one by one to minimize memory usage
+    let fileIndex = 0;
+
+    const writeNextFile = () => {
+      if (fileIndex >= textFiles.length) {
+        // All files processed, close XML and stream
+        writeStream.write('</files>\n');
+        writeStream.end();
+        return;
+      }
+
+      const file = textFiles[fileIndex];
+      fileIndex++;
+
+      // Write file opening tag
+      writeStream.write(`  <file path="${escapeXml(file.path)}">`);
+
+      // Use CDATA for code content, handling CDATA end sequences properly
+      if (file.content?.trim()) {
+        const indentedContent = indentFileContent(file.content);
+        if (file.content.includes(']]>')) {
+          // If content contains ]]>, split it and wrap each part in CDATA
+          writeStream.write(splitAndWrapCDATA(indentedContent));
+        } else {
+          writeStream.write(`<![CDATA[\n${indentedContent}\n  ]]>`);
+        }
+      } else if (file.content) {
+        // Handle empty or whitespace-only content
+        const indentedContent = indentFileContent(file.content);
+        writeStream.write(`<![CDATA[\n${indentedContent}\n  ]]>`);
+      }
+
+      // Write file closing tag
+      writeStream.write('</file>\n');
+
+      // Continue with next file on next tick to avoid stack overflow
+      setImmediate(writeNextFile);
+    };
+
+    // Start processing files
+    writeNextFile();
+  });
 }
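A minimal usage sketch of the new streaming signature, with hypothetical input data (illustrative only, not part of the commit):

```js
// Assumes an async context and the same fs/escapeXml helpers this file uses.
const aggregated = {
  textFiles: [
    { path: 'src/index.js', content: 'console.log("hi");\n', size: 19, lines: 2 }
  ],
  binaryFiles: [],
  errors: []
};

// Resolves once the write stream emits 'finish'; the XML is written
// file by file and never held in memory as one large string.
await generateXMLOutput(aggregated, 'flattened-codebase.xml');
```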
@@ -251,7 +375,7 @@ function indentFileContent(content) {
   if (typeof content !== 'string') {
     return String(content);
   }
   // Split content into lines and add 4 spaces of indentation to each line
   return content.split('\n').map(line => `    ${line}`).join('\n');
 }
@@ -265,7 +389,7 @@ function splitAndWrapCDATA(content) {
   if (typeof content !== 'string') {
     return String(content);
   }
   // Replace ]]> with ]]]]><![CDATA[> to escape it within CDATA
   const escapedContent = content.replace(/]]>/g, ']]]]><![CDATA[>');
   return `<![CDATA[
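The hunk cuts off inside the template literal; per the `${escapedContent}` hint in the next hunk header, the function wraps the escaped text in a CDATA section. The replacement round-trips because adjacent CDATA sections concatenate. A small worked example with an illustrative input:

```js
// Input containing the CDATA terminator:
const content = 'if (a ]]> b) { }';

const escaped = content.replace(/]]>/g, ']]]]><![CDATA[>');
// -> 'if (a ]]]]><![CDATA[> b) { }'

// Wrapped as `<![CDATA[` + escaped + `]]>`, an XML parser sees two
// adjacent sections, 'if (a ]]' and '> b) { }', which concatenate
// back to the original ']]>' on read.
```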
@@ -276,37 +400,37 @@ ${escapedContent}
 /**
  * Calculate statistics for the processed files
  * @param {Object} aggregatedContent - The aggregated content object
- * @param {string} xmlContent - The generated XML content
+ * @param {number} xmlFileSize - The size of the generated XML file in bytes
  * @returns {Object} Statistics object
  */
-function calculateStatistics(aggregatedContent, xmlContent) {
+function calculateStatistics(aggregatedContent, xmlFileSize) {
   const { textFiles, binaryFiles, errors } = aggregatedContent;
   // Calculate total file size in bytes
   const totalTextSize = textFiles.reduce((sum, file) => sum + file.size, 0);
   const totalBinarySize = binaryFiles.reduce((sum, file) => sum + file.size, 0);
   const totalSize = totalTextSize + totalBinarySize;
   // Calculate total lines of code
   const totalLines = textFiles.reduce((sum, file) => sum + file.lines, 0);
   // Estimate token count (rough approximation: 1 token ≈ 4 characters)
-  const estimatedTokens = Math.ceil(xmlContent.length / 4);
+  const estimatedTokens = Math.ceil(xmlFileSize / 4);
   // Format file size
   const formatSize = (bytes) => {
     if (bytes < 1024) return `${bytes} B`;
     if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
     return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
   };
   return {
     totalFiles: textFiles.length + binaryFiles.length,
     textFiles: textFiles.length,
     binaryFiles: binaryFiles.length,
     errorFiles: errors.length,
     totalSize: formatSize(totalSize),
-    xmlSize: formatSize(xmlContent.length),
+    xmlSize: formatSize(xmlFileSize),
     totalLines,
     estimatedTokens: estimatedTokens.toLocaleString()
   };
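The heuristic is unchanged in spirit: previously it divided the in-memory string length (UTF-16 code units) by 4, now it divides the on-disk byte count by 4. For a hypothetical 2 MB output:

```js
const xmlFileSize = 2 * 1024 * 1024;                // 2,097,152 bytes
const estimatedTokens = Math.ceil(xmlFileSize / 4); // 524,288
console.log(estimatedTokens.toLocaleString());      // "524,288" in an en-US locale
```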
@@ -321,24 +445,24 @@ function calculateStatistics(aggregatedContent, xmlContent) {
 async function filterFiles(files, rootDir) {
   const gitignorePath = path.join(rootDir, '.gitignore');
   const ignorePatterns = await parseGitignore(gitignorePath);
   if (ignorePatterns.length === 0) {
     return files;
   }
   // Convert absolute paths to relative for pattern matching
   const relativeFiles = files.map(file => path.relative(rootDir, file));
   // Separate positive and negative patterns
   const positivePatterns = ignorePatterns.filter(p => !p.startsWith('!'));
   const negativePatterns = ignorePatterns.filter(p => p.startsWith('!')).map(p => p.slice(1));
   // Filter out files that match ignore patterns
   const filteredRelative = [];
   for (const file of relativeFiles) {
     let shouldIgnore = false;
     // First check positive patterns (ignore these files)
     for (const pattern of positivePatterns) {
       if (minimatch(file, pattern)) {
@@ -346,7 +470,7 @@ async function filterFiles(files, rootDir) {
         break;
       }
     }
     // Then check negative patterns (don't ignore these files even if they match positive patterns)
     if (shouldIgnore) {
       for (const pattern of negativePatterns) {
@@ -356,12 +480,12 @@ async function filterFiles(files, rootDir) {
           }
         }
       }
     }
     if (!shouldIgnore) {
       filteredRelative.push(file);
     }
   }
   // Convert back to absolute paths
   return filteredRelative.map(file => path.resolve(rootDir, file));
 }
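A condensed sketch of the two-pass check above, using hypothetical patterns (a `!keep.log` line in .gitignore arrives here split into the two lists):

```js
// Named export in recent minimatch versions; older versions export
// the function directly.
const { minimatch } = require('minimatch');

const positivePatterns = ['*.log'];    // ignore these
const negativePatterns = ['keep.log']; // from '!keep.log': re-include

for (const file of ['app.log', 'keep.log', 'index.js']) {
  let shouldIgnore = positivePatterns.some(p => minimatch(file, p));
  if (shouldIgnore && negativePatterns.some(p => minimatch(file, p))) {
    shouldIgnore = false; // a negation wins over a positive match
  }
  console.log(`${file}: ${shouldIgnore ? 'ignored' : 'kept'}`);
}
// app.log: ignored, keep.log: kept, index.js: kept
```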
@@ -375,23 +499,23 @@ program
   .option('-o, --output <path>', 'Output file path', 'flattened-codebase.xml')
   .action(async (options) => {
     console.log(`Flattening codebase to: ${options.output}`);
     try {
       // Import ora dynamically
       const { default: ora } = await import('ora');
       // Start file discovery with spinner
       const discoverySpinner = ora('🔍 Discovering files...').start();
       const files = await discoverFiles(process.cwd());
       const filteredFiles = await filterFiles(files, process.cwd());
       discoverySpinner.succeed(`📁 Found ${filteredFiles.length} files to include`);
       // Process files with progress tracking
       console.log('Reading file contents');
       const processingSpinner = ora('📄 Processing files...').start();
       const aggregatedContent = await aggregateFileContents(filteredFiles, process.cwd(), processingSpinner);
       processingSpinner.succeed(`✅ Processed ${aggregatedContent.processedFiles}/${filteredFiles.length} files`);
       // Log processing results for test validation
       console.log(`Processed ${aggregatedContent.processedFiles}/${filteredFiles.length} files`);
       if (aggregatedContent.errors.length > 0) {
@@ -401,16 +525,16 @@ program
       if (aggregatedContent.binaryFiles.length > 0) {
         console.log(`Binary files: ${aggregatedContent.binaryFiles.length}`);
       }
-      // Generate XML output
+      // Generate XML output using streaming
       const xmlSpinner = ora('🔧 Generating XML output...').start();
-      const xmlOutput = generateXMLOutput(aggregatedContent);
-      await fs.writeFile(options.output, xmlOutput);
+      await generateXMLOutput(aggregatedContent, options.output);
       xmlSpinner.succeed('📝 XML generation completed');
       // Calculate and display statistics
-      const stats = calculateStatistics(aggregatedContent, xmlOutput);
+      const outputStats = await fs.stat(options.output);
+      const stats = calculateStatistics(aggregatedContent, outputStats.size);
       // Display completion summary
       console.log('\n📊 Completion Summary:');
       console.log(`✅ Successfully processed ${filteredFiles.length} files into ${options.output}`);
@@ -420,7 +544,7 @@ program
       console.log(`📝 Total lines of code: ${stats.totalLines.toLocaleString()}`);
       console.log(`🔢 Estimated tokens: ${stats.estimatedTokens}`);
       console.log(`📊 File breakdown: ${stats.textFiles} text, ${stats.binaryFiles} binary, ${stats.errorFiles} errors`);
     } catch (error) {
       console.error('❌ Critical error:', error.message);
       console.error('An unexpected error occurred.');
@@ -432,4 +556,4 @@ if (require.main === module) {
   program.parse();
 }
 module.exports = program;