refactor(flattener): improve xml generation and file discovery

- Simplify file discovery by using gitignore patterns - Enhance XML generation with proper CDATA handling and indentation - Remove unused dependencies and clean up code
2025-07-19 18:08:20 -05:00 · 2025-07-19 18:08:20 -05:00 · 6a5a12599e
parent d1823fb083
commit 6a5a12599e
2 changed files with 66 additions and 31 deletions
--- a/.gitignore
+++ b/.gitignore
@ -33,8 +33,4 @@ docs/architecture/
 docs/prd/
 docs/stories/
 docs/project-architecture.md
-tests/
-custom-output.xml
-flattened-codebase.xml
 biome.json
-__tests__/
--- a/tools/flattener/main.js
+++ b/tools/flattener/main.js
@ -2,12 +2,9 @@

 const { Command } = require('commander');
 const fs = require('fs-extra');
-const path = require('path');
+const path = require('node:path');
 const { glob } = require('glob');
 const { minimatch } = require('minimatch');
-const { promisify } = require('util');
-const { exec } = require('child_process');
-const execAsync = promisify(exec);

 /**
 * Recursively discover all files in a directory
@ -16,27 +13,23 @@ const execAsync = promisify(exec);
 */
 async function discoverFiles(rootDir) {
  try {
+    const gitignorePath = path.join(rootDir, '.gitignore');
+    const gitignorePatterns = await parseGitignore(gitignorePath);
+
+    const combinedIgnores = [
+    ...gitignorePatterns,
+    '.git/**',
+    'flattened-codebase.xml',
+    'repomix-output.xml'
+  ];
+
    // Use glob to recursively find all files, excluding common ignore patterns
    const files = await glob('**/*', {
      cwd: rootDir,
      nodir: true, // Only files, not directories
      dot: true,   // Include hidden files
      follow: false, // Don't follow symbolic links
-      ignore: [
-        // Standard ignore patterns
-        'node_modules/**',
-        '.git/**',
-        'build/**',
-        'dist/**',
-        '.next/**',
-        'coverage/**',
-        '.nyc_output/**',
-        'tmp/**',
-        'temp/**',
-        '.gitignore',
-        '.gitattributes',
-        '.gitmodules'
-      ]
+      ignore: combinedIgnores
    });
    
    return files.map(file => path.resolve(rootDir, file));
@ -192,7 +185,7 @@ async function aggregateFileContents(files, rootDir, spinner = null) {
 * @param {string} projectRoot - The project root directory
 * @returns {string} XML content
 */
-function generateXMLOutput(aggregatedContent, projectRoot) {
+function generateXMLOutput(aggregatedContent) {
  const { textFiles } = aggregatedContent;
  
  let xml = `<?xml version="1.0" encoding="UTF-8"?>
@ -204,21 +197,36 @@ function generateXMLOutput(aggregatedContent, projectRoot) {
  for (const file of textFiles) {
    xml += `  <file path="${escapeXml(file.path)}">`;
    
-    // Use CDATA for code content to preserve formatting and handle special characters
-    if (file.content.trim()) {
-      xml += `<![CDATA[${file.content}]]>`;
+    // Use CDATA for code content, handling CDATA end sequences properly
+    if (file.content?.trim()) {
+      const indentedContent = indentFileContent(file.content);
+      if (file.content.includes(']]>')) {
+        // If content contains ]]>, split it and wrap each part in CDATA
+        xml += splitAndWrapCDATA(indentedContent);
+      } else {
+        xml += `<![CDATA[
+${indentedContent}
+    ]]>`;
+      }
+    } else if (file.content) {
+      // Handle empty or whitespace-only content
+      const indentedContent = indentFileContent(file.content);
+      xml += `<![CDATA[
+${indentedContent}
+    ]]>`;
    }
    
    xml += `</file>
 `;
  }
  
-  xml += `</files>`;
+  xml += `</files>
+`;
  return xml;
 }

 /**
- * Escape XML special characters
+ * Escape XML special characters for attributes
 * @param {string} str - String to escape
 * @returns {string} Escaped string
 */
@ -234,6 +242,37 @@ function escapeXml(str) {
    .replace(/'/g, '&apos;');
 }

+/**
+ * Indent file content with 4 spaces for each line
+ * @param {string} content - Content to indent
+ * @returns {string} Indented content
+ */
+function indentFileContent(content) {
+  if (typeof content !== 'string') {
+    return String(content);
+  }
+  
+  // Split content into lines and add 4 spaces of indentation to each line
+  return content.split('\n').map(line => `    ${line}`).join('\n');
+}
+
+/**
+ * Split content containing ]]> and wrap each part in CDATA
+ * @param {string} content - Content to process
+ * @returns {string} Content with properly wrapped CDATA sections
+ */
+function splitAndWrapCDATA(content) {
+  if (typeof content !== 'string') {
+    return String(content);
+  }
+  
+  // Replace ]]> with ]]]]><![CDATA[> to escape it within CDATA
+  const escapedContent = content.replace(/]]>/g, ']]]]><![CDATA[>');
+  return `<![CDATA[
+${escapedContent}
+    ]]>`;
+}
+
 /**
 * Calculate statistics for the processed files
 * @param {Object} aggregatedContent - The aggregated content object
@ -365,7 +404,7 @@ program
      
      // Generate XML output
      const xmlSpinner = ora('🔧 Generating XML output...').start();
-      const xmlOutput = generateXMLOutput(aggregatedContent, process.cwd());
+      const xmlOutput = generateXMLOutput(aggregatedContent);
      await fs.writeFile(options.output, xmlOutput);
      xmlSpinner.succeed('📝 XML generation completed');