perf(flattener): improve memory efficiency by streaming XML output

- Replace in-memory XML string generation with a streaming write to the output file
- Add a comprehensive list of common ignore patterns
- Update the statistics calculation to use the output file's size on disk (in bytes) instead of the in-memory content length
This commit is contained in:
manjaroblack 2025-07-22 12:12:47 -05:00
parent 3d27ebdf18
commit 855960318c
No known key found for this signature in database
GPG Key ID: 02FD4111DA5560B4
2 changed files with 217 additions and 91 deletions

View File

@@ -210,10 +210,12 @@ The PO ensures:
docs/brownfield-architecture.md docs/brownfield-architecture.md
2. Shard your docs: 2. Shard your docs:
In your IDE In your IDE
```bash ```bash
@po @po
shard docs/brownfield-prd.md shard docs/brownfield-prd.md
``` ```
```bash ```bash
@po @po
shard docs/brownfield-architecture.md shard docs/brownfield-architecture.md

View File

@@ -16,13 +16,119 @@ async function discoverFiles(rootDir) {
const gitignorePath = path.join(rootDir, '.gitignore'); const gitignorePath = path.join(rootDir, '.gitignore');
const gitignorePatterns = await parseGitignore(gitignorePath); const gitignorePatterns = await parseGitignore(gitignorePath);
const combinedIgnores = [ // Common gitignore patterns that should always be ignored
...gitignorePatterns, const commonIgnorePatterns = [
// Version control
'.git/**', '.git/**',
'.svn/**',
'.hg/**',
'.bzr/**',
// Dependencies
'node_modules/**',
'bower_components/**',
'vendor/**',
'packages/**',
// Build outputs
'build/**',
'dist/**',
'out/**',
'target/**',
'bin/**',
'obj/**',
'release/**',
'debug/**',
// Environment and config
'.env',
'.env.*',
'*.env',
'.config',
// Logs
'logs/**',
'*.log',
'npm-debug.log*',
'yarn-debug.log*',
'yarn-error.log*',
'lerna-debug.log*',
// Coverage and testing
'coverage/**',
'.nyc_output/**',
'.coverage/**',
'test-results/**',
'junit.xml',
// Cache directories
'.cache/**',
'.tmp/**',
'.temp/**',
'tmp/**',
'temp/**',
'.sass-cache/**',
'.eslintcache',
'.stylelintcache',
// OS generated files
'.DS_Store',
'.DS_Store?',
'._*',
'.Spotlight-V100',
'.Trashes',
'ehthumbs.db',
'Thumbs.db',
'desktop.ini',
// IDE and editor files
'.vscode/**',
'.idea/**',
'*.swp',
'*.swo',
'*~',
'.project',
'.classpath',
'.settings/**',
'*.sublime-project',
'*.sublime-workspace',
// Package manager files
'package-lock.json',
'yarn.lock',
'pnpm-lock.yaml',
'composer.lock',
'Pipfile.lock',
// Runtime and compiled files
'*.pyc',
'*.pyo',
'*.pyd',
'__pycache__/**',
'*.class',
'*.jar',
'*.war',
'*.ear',
'*.o',
'*.so',
'*.dll',
'*.exe',
// Documentation build
'_site/**',
'.jekyll-cache/**',
'.jekyll-metadata',
// Flattener specific outputs
'flattened-codebase.xml', 'flattened-codebase.xml',
'repomix-output.xml' 'repomix-output.xml'
]; ];
const combinedIgnores = [
...gitignorePatterns,
...commonIgnorePatterns
];
// Use glob to recursively find all files, excluding common ignore patterns // Use glob to recursively find all files, excluding common ignore patterns
const files = await glob('**/*', { const files = await glob('**/*', {
cwd: rootDir, cwd: rootDir,
@@ -180,49 +286,67 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
} }
/** /**
* Generate XML output with aggregated file contents * Generate XML output with aggregated file contents using streaming
* @param {Object} aggregatedContent - The aggregated content object * @param {Object} aggregatedContent - The aggregated content object
* @param {string} projectRoot - The project root directory * @param {string} outputPath - The output file path
* @returns {string} XML content * @returns {Promise<void>} Promise that resolves when writing is complete
*/ */
function generateXMLOutput(aggregatedContent) { async function generateXMLOutput(aggregatedContent, outputPath) {
const { textFiles } = aggregatedContent; const { textFiles } = aggregatedContent;
let xml = `<?xml version="1.0" encoding="UTF-8"?> // Create write stream for efficient memory usage
`; const writeStream = fs.createWriteStream(outputPath, { encoding: 'utf8' });
xml += `<files>
`;
// Add text files with content (only text files as per story requirements) return new Promise((resolve, reject) => {
for (const file of textFiles) { writeStream.on('error', reject);
xml += ` <file path="${escapeXml(file.path)}">`; writeStream.on('finish', resolve);
// Write XML header
writeStream.write('<?xml version="1.0" encoding="UTF-8"?>\n');
writeStream.write('<files>\n');
// Process files one by one to minimize memory usage
let fileIndex = 0;
const writeNextFile = () => {
if (fileIndex >= textFiles.length) {
// All files processed, close XML and stream
writeStream.write('</files>\n');
writeStream.end();
return;
}
const file = textFiles[fileIndex];
fileIndex++;
// Write file opening tag
writeStream.write(` <file path="${escapeXml(file.path)}">`);
// Use CDATA for code content, handling CDATA end sequences properly // Use CDATA for code content, handling CDATA end sequences properly
if (file.content?.trim()) { if (file.content?.trim()) {
const indentedContent = indentFileContent(file.content); const indentedContent = indentFileContent(file.content);
if (file.content.includes(']]>')) { if (file.content.includes(']]>')) {
// If content contains ]]>, split it and wrap each part in CDATA // If content contains ]]>, split it and wrap each part in CDATA
xml += splitAndWrapCDATA(indentedContent); writeStream.write(splitAndWrapCDATA(indentedContent));
} else { } else {
xml += `<![CDATA[ writeStream.write(`<![CDATA[\n${indentedContent}\n ]]>`);
${indentedContent}
]]>`;
} }
} else if (file.content) { } else if (file.content) {
// Handle empty or whitespace-only content // Handle empty or whitespace-only content
const indentedContent = indentFileContent(file.content); const indentedContent = indentFileContent(file.content);
xml += `<![CDATA[ writeStream.write(`<![CDATA[\n${indentedContent}\n ]]>`);
${indentedContent}
]]>`;
} }
xml += `</file> // Write file closing tag
`; writeStream.write('</file>\n');
}
xml += `</files> // Continue with next file on next tick to avoid stack overflow
`; setImmediate(writeNextFile);
return xml; };
// Start processing files
writeNextFile();
});
} }
/** /**
@@ -276,10 +400,10 @@ ${escapedContent}
/** /**
* Calculate statistics for the processed files * Calculate statistics for the processed files
* @param {Object} aggregatedContent - The aggregated content object * @param {Object} aggregatedContent - The aggregated content object
* @param {string} xmlContent - The generated XML content * @param {number} xmlFileSize - The size of the generated XML file in bytes
* @returns {Object} Statistics object * @returns {Object} Statistics object
*/ */
function calculateStatistics(aggregatedContent, xmlContent) { function calculateStatistics(aggregatedContent, xmlFileSize) {
const { textFiles, binaryFiles, errors } = aggregatedContent; const { textFiles, binaryFiles, errors } = aggregatedContent;
// Calculate total file size in bytes // Calculate total file size in bytes
@@ -291,7 +415,7 @@ function calculateStatistics(aggregatedContent, xmlContent) {
const totalLines = textFiles.reduce((sum, file) => sum + file.lines, 0); const totalLines = textFiles.reduce((sum, file) => sum + file.lines, 0);
// Estimate token count (rough approximation: 1 token ≈ 4 characters) // Estimate token count (rough approximation: 1 token ≈ 4 characters)
const estimatedTokens = Math.ceil(xmlContent.length / 4); const estimatedTokens = Math.ceil(xmlFileSize / 4);
// Format file size // Format file size
const formatSize = (bytes) => { const formatSize = (bytes) => {
@@ -306,7 +430,7 @@ function calculateStatistics(aggregatedContent, xmlContent) {
binaryFiles: binaryFiles.length, binaryFiles: binaryFiles.length,
errorFiles: errors.length, errorFiles: errors.length,
totalSize: formatSize(totalSize), totalSize: formatSize(totalSize),
xmlSize: formatSize(xmlContent.length), xmlSize: formatSize(xmlFileSize),
totalLines, totalLines,
estimatedTokens: estimatedTokens.toLocaleString() estimatedTokens: estimatedTokens.toLocaleString()
}; };
@@ -402,14 +526,14 @@ program
console.log(`Binary files: ${aggregatedContent.binaryFiles.length}`); console.log(`Binary files: ${aggregatedContent.binaryFiles.length}`);
} }
// Generate XML output // Generate XML output using streaming
const xmlSpinner = ora('🔧 Generating XML output...').start(); const xmlSpinner = ora('🔧 Generating XML output...').start();
const xmlOutput = generateXMLOutput(aggregatedContent); await generateXMLOutput(aggregatedContent, options.output);
await fs.writeFile(options.output, xmlOutput);
xmlSpinner.succeed('📝 XML generation completed'); xmlSpinner.succeed('📝 XML generation completed');
// Calculate and display statistics // Calculate and display statistics
const stats = calculateStatistics(aggregatedContent, xmlOutput); const outputStats = await fs.stat(options.output);
const stats = calculateStatistics(aggregatedContent, outputStats.size);
// Display completion summary // Display completion summary
console.log('\n📊 Completion Summary:'); console.log('\n📊 Completion Summary:');