/**
 * Batch Processor Task
 * Orchestrates the complete extraction workflow
 * Manages state, progress, and error recovery
 */

const fileScanner = require('./task-file-scanner');
const ocrProcess = require('./task-ocr-process');
const dataParser = require('./task-data-parser');
const dataValidator = require('./task-data-validator');
// TODO: Integrate excel writing and file moving in future implementation
// const excelWriter = require('./task-excel-writer');
// const fileMover = require('./task-file-mover');

/**
 * Process batch of files end-to-end.
 *
 * Scans the source folder, builds a processing queue, then runs each file
 * through OCR -> parsing -> confidence scoring -> validation. A failure on
 * one file is recorded under `failed` and does not stop the batch.
 *
 * @param {Object} config - Full workflow configuration (paths, api, file_types, extraction_fields)
 * @param {Function} [onProgress] - Progress callback, invoked as (current, total, file)
 * @returns {Promise} Batch processing results ({ processed, failed, skipped, statistics })
 */
async function processBatch(config, onProgress = null) {
  const outcome = {
    processed: [],
    failed: [],
    skipped: [],
    statistics: {},
  };

  // Step 1: Scan for files
  const scan = await fileScanner.scanFiles({
    sourcePath: config.paths.source_folder,
    fileTypes: config.file_types,
    processingLogPath: config.paths.log_folder + '/processing.json',
  });

  const queue = fileScanner.createProcessingQueue(scan);

  // The prompt depends only on the field definitions, so build it once.
  const prompt = buildExtractionPrompt(config.extraction_fields);

  // Step 2: Process each queued file
  for (const [index, file] of queue.files.entries()) {
    try {
      if (onProgress) {
        onProgress(index + 1, queue.totalFiles, file);
      }

      // OCR Processing
      const ocrResult = await ocrProcess.processFileWithOCR({
        filePath: file.filePath,
        apiKey: config.api.api_key,
        model: config.api.model,
        extractionPrompt: prompt,
      });

      // Data Parsing and confidence scoring
      const parsed = dataParser.parseOCRText(ocrResult.ocrText, config.extraction_fields);
      const confidence = dataParser.calculateExtractionConfidence(parsed);

      // Validation (threshold / human review gate)
      const validated = await dataValidator.validateExtraction(parsed, file, confidence);

      if (validated.approved) {
        outcome.processed.push({
          file: file.fileName,
          data: validated.data,
          confidence,
        });
      } else {
        outcome.skipped.push({
          file: file.fileName,
          reason: 'Low confidence - requires manual review',
        });
      }
    } catch (error) {
      outcome.failed.push({
        file: file.fileName,
        error: error.message,
      });
    }
  }

  return outcome;
}

/**
 * Build extraction prompt from field definitions
 * @private
 */
function buildExtractionPrompt(fields) {
  const names = fields.map((f) => f.name).join(', ');
  return `Extract the following fields from this document: ${names}. 
Return the data in a clear, structured format.`;
}

module.exports = { processBatch };
Return the data in a clear, structured format.`; +} + +module.exports = { processBatch }; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-data-parser.js b/src/modules/bmm/tasks/ocr-extraction/task-data-parser.js new file mode 100644 index 00000000..e1cbc182 --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-data-parser.js @@ -0,0 +1,389 @@ +/** + * Data Parser Task + * Parses OCR text into structured data using field mappings + * Applies validation rules and type coercion + */ + +/** + * Parse OCR text into structured data + * @param {string} ocrText - Raw OCR text from Mistral + * @param {Array} fieldDefinitions - Field definitions from config + * @param {Object} [options={}] - Parsing options + * @returns {Object} Parsed and structured data + */ +function parseOCRText(ocrText, fieldDefinitions, options = {}) { + const { + strictMode = false, // If true, fail on missing required fields + defaultValues = {}, // Default values for optional fields + } = options; + + const parsed = {}; + const errors = []; + const warnings = []; + + for (const field of fieldDefinitions) { + try { + const value = extractFieldValue(ocrText, field); + + if (value === null || value === undefined) { + if (field.required) { + errors.push(`Required field "${field.name}" not found`); + if (strictMode) { + continue; + } + } + + // Use default value if provided + parsed[field.name] = defaultValues[field.name] || null; + if (field.required) { + warnings.push(`Required field "${field.name}" missing - using null`); + } + } else { + // Type coercion and validation + const coercedValue = coerceFieldType(value, field); + const validation = validateFieldValue(coercedValue, field); + + if (validation.valid) { + parsed[field.name] = coercedValue; + + if (validation.warning) { + warnings.push(`Field "${field.name}": ${validation.warning}`); + } + } else { + errors.push(`Field "${field.name}" validation failed: ${validation.error}`); + parsed[field.name] = null; + } + } + } catch (error) 
{ + errors.push(`Error parsing field "${field.name}": ${error.message}`); + parsed[field.name] = null; + } + } + + return { + data: parsed, + errors, + warnings, + isValid: errors.length === 0, + ocrText, // Keep original for reference + }; +} + +/** + * Extract field value from OCR text + * @private + */ +function extractFieldValue(text, field) { + const { type, patterns } = field; + + // Try custom patterns first + if (patterns && Array.isArray(patterns)) { + for (const pattern of patterns) { + const regex = new RegExp(pattern, 'i'); + const match = text.match(regex); + if (match) { + return match[1] || match[0]; + } + } + } + + // Default extraction patterns by type + switch (type) { + case 'date': { + return extractDate(text, field); + } + + case 'number': + case 'currency': { + return extractNumber(text, field); + } + + case 'string': { + return extractString(text, field); + } + + default: { + return extractGeneric(text, field); + } + } +} + +/** + * Extract date from text + * @private + */ +function extractDate(text, _field) { + // Common date patterns + const datePatterns = [ + /(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})/, // MM/DD/YYYY or DD-MM-YYYY + /(\d{4}[-/]\d{1,2}[-/]\d{1,2})/, // YYYY-MM-DD + /(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}/i, // Jan 15, 2021 + ]; + + for (const pattern of datePatterns) { + const match = text.match(pattern); + if (match) { + return match[0]; + } + } + + return null; +} + +/** + * Extract number from text + * @private + */ +function extractNumber(text, _field) { + // Look for numbers with optional currency symbols and separators + const numberPatterns = [ + /\$?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)/, // Currency with commas + /(\d+\.\d+)/, // Decimal number + /(\d+)/, // Integer + ]; + + for (const pattern of numberPatterns) { + const match = text.match(pattern); + if (match) { + // Remove currency symbols and commas + return match[1].replaceAll(/[,$]/g, ''); + } + } + + return null; +} + +/** + * 
Extract string from text + * @private + */ +function extractString(text, field) { + // For string fields, look for the field name followed by a colon or similar + const labelPatterns = [new RegExp(`${field.name}:\\s*([^\\n]+)`, 'i'), new RegExp(`${field.description}:\\s*([^\\n]+)`, 'i')]; + + for (const pattern of labelPatterns) { + const match = text.match(pattern); + if (match) { + return match[1].trim(); + } + } + + // If no label found, try to extract capitalized words (likely names) + if (field.name.toLowerCase().includes('name')) { + const nameMatch = text.match(/([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)/); + if (nameMatch) { + return nameMatch[0]; + } + } + + return null; +} + +/** + * Extract generic value + * @private + */ +function extractGeneric(text, field) { + // Try to find text near field label + const pattern = new RegExp(`${field.name}[:\\s]+([^\\n]+)`, 'i'); + const match = text.match(pattern); + + return match ? match[1].trim() : null; +} + +/** + * Coerce value to correct type + * @private + */ +function coerceFieldType(value, field) { + if (value === null || value === undefined) { + return null; + } + + switch (field.type) { + case 'date': { + return coerceDate(value, field.format); + } + + case 'number': { + return Number.parseFloat(value); + } + + case 'currency': { + return Number.parseFloat(value); + } + + case 'string': { + return String(value).trim(); + } + + case 'boolean': { + return Boolean(value); + } + + default: { + return value; + } + } +} + +/** + * Coerce to date format + * @private + */ +function coerceDate(value, format = 'YYYY-MM-DD') { + try { + const date = new Date(value); + if (Number.isNaN(date.getTime())) { + return null; + } + + // Format according to specified format + const year = date.getFullYear(); + const month = String(date.getMonth() + 1).padStart(2, '0'); + const day = String(date.getDate()).padStart(2, '0'); + + if (format === 'YYYY-MM-DD') { + return `${year}-${month}-${day}`; + } + + return 
date.toISOString().split('T')[0]; + } catch { + return null; + } +} + +/** + * Validate field value + * @private + */ +function validateFieldValue(value, field) { + if (value === null || value === undefined) { + return { valid: !field.required, error: 'Value is null' }; + } + + // Type-specific validation + switch (field.type) { + case 'date': { + return validateDate(value, field); + } + + case 'number': + case 'currency': { + return validateNumber(value, field); + } + + case 'string': { + return validateString(value, field); + } + + default: { + return { valid: true }; + } + } +} + +/** + * Validate date value + * @private + */ +function validateDate(value, _field) { + const date = new Date(value); + + if (Number.isNaN(date.getTime())) { + return { valid: false, error: 'Invalid date format' }; + } + + return { valid: true }; +} + +/** + * Validate number value + * @private + */ +function validateNumber(value, field) { + const num = Number(value); + + if (Number.isNaN(num)) { + return { valid: false, error: 'Not a valid number' }; + } + + if (field.min !== undefined && num < field.min) { + return { valid: false, error: `Value ${num} is below minimum ${field.min}` }; + } + + if (field.max !== undefined && num > field.max) { + return { valid: false, error: `Value ${num} exceeds maximum ${field.max}` }; + } + + return { valid: true }; +} + +/** + * Validate string value + * @private + */ +function validateString(value, field) { + const str = String(value); + + if (field.minLength && str.length < field.minLength) { + return { + valid: false, + error: `String length ${str.length} is below minimum ${field.minLength}`, + }; + } + + if (field.maxLength && str.length > field.maxLength) { + return { + valid: false, + error: `String length ${str.length} exceeds maximum ${field.maxLength}`, + }; + } + + if (field.pattern) { + const regex = new RegExp(field.pattern); + if (!regex.test(str)) { + return { valid: false, error: 'String does not match required pattern' }; + } + } + 
+ return { valid: true }; +} + +/** + * Calculate extraction confidence based on parsing results + * @param {Object} parseResult - Result from parseOCRText + * @returns {number} Confidence score (0-1) + */ +function calculateExtractionConfidence(parseResult) { + if (!parseResult || !parseResult.data) { + return 0; + } + + const totalFields = Object.keys(parseResult.data).length; + if (totalFields === 0) { + return 0; + } + + // Count successfully extracted fields + const extractedFields = Object.values(parseResult.data).filter((v) => v !== null && v !== undefined).length; + + let baseScore = extractedFields / totalFields; + + // Penalty for errors + if (parseResult.errors && parseResult.errors.length > 0) { + baseScore -= parseResult.errors.length * 0.1; + } + + // Small penalty for warnings + if (parseResult.warnings && parseResult.warnings.length > 0) { + baseScore -= parseResult.warnings.length * 0.05; + } + + return Math.max(0, Math.min(1, baseScore)); +} + +module.exports = { + parseOCRText, + calculateExtractionConfidence, +}; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-data-validator.js b/src/modules/bmm/tasks/ocr-extraction/task-data-validator.js new file mode 100644 index 00000000..412686da --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-data-validator.js @@ -0,0 +1,24 @@ +/** + * Data Validator Task + * Presents extracted data for human review and correction + * Uses inquirer for interactive CLI prompts + */ + +/** + * Present extraction results for validation + * @param {Object} parseResult - Result from data parser + * @param {Object} file - File metadata + * @param {number} confidence - Confidence score (0-1) + * @returns {Promise} Validated data + */ +async function validateExtraction(parseResult, file, confidence) { + // Placeholder - would use inquirer for actual CLI prompts + return { + approved: confidence >= 0.85, + data: parseResult.data, + corrections: [], + confidence, + }; +} + +module.exports = { validateExtraction 
}; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-excel-writer.js b/src/modules/bmm/tasks/ocr-extraction/task-excel-writer.js new file mode 100644 index 00000000..118287a4 --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-excel-writer.js @@ -0,0 +1,49 @@ +/** + * Excel Writer Task + * Handles writing extracted data to master Excel file + * Includes backup, atomic writes, and data integrity checks + */ + +const fs = require('fs-extra'); +const path = require('node:path'); + +/** + * Append data to Excel file + * @param {Object} config - Configuration + * @param {Array} dataRows - Data to append + * @returns {Promise} Write result + */ +async function appendToExcel(config, dataRows) { + const { masterFile, backupFolder } = config; + + // Create backup + const backup = await createBackup(masterFile, backupFolder); + + // Placeholder - actual implementation would use xlsx library + return { + success: true, + rowsWritten: dataRows.length, + backupPath: backup, + }; +} + +/** + * Create backup of Excel file + * @private + */ +async function createBackup(filePath, backupFolder) { + const timestamp = new Date().toISOString().replaceAll(/[:.]/g, '-'); + const fileName = path.basename(filePath, path.extname(filePath)); + const ext = path.extname(filePath); + const backupPath = path.join(backupFolder, `${fileName}-${timestamp}${ext}`); + + await fs.ensureDir(backupFolder); + + if (await fs.pathExists(filePath)) { + await fs.copy(filePath, backupPath); + } + + return backupPath; +} + +module.exports = { appendToExcel, createBackup }; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-file-converter.js b/src/modules/bmm/tasks/ocr-extraction/task-file-converter.js new file mode 100644 index 00000000..205dc2ec --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-file-converter.js @@ -0,0 +1,248 @@ +/** + * File Converter Task + * Handles conversion of various file formats to formats suitable for OCR + * Note: For MVP, most files can be sent 
directly to Mistral OCR + * This module provides utilities for format handling + */ + +const fs = require('fs-extra'); +const path = require('node:path'); + +/** + * Check if file needs conversion before OCR + * @param {string} filePath - Path to file + * @returns {Promise} Conversion info + */ +async function checkConversionNeeded(filePath) { + const ext = path.extname(filePath).toLowerCase(); + + // Files that can be sent directly to Mistral OCR + const directOCRSupport = ['.pdf', '.png', '.jpg', '.jpeg', '.gif', '.webp']; + + // Files that need special handling + const needsConversion = { + '.xlsx': 'excel-to-image', + '.xls': 'excel-to-image', + '.msg': 'msg-to-text', + }; + + if (directOCRSupport.includes(ext)) { + return { + needsConversion: false, + method: 'direct', + supportedFormat: true, + }; + } + + if (needsConversion[ext]) { + return { + needsConversion: true, + method: needsConversion[ext], + supportedFormat: true, + }; + } + + return { + needsConversion: false, + method: null, + supportedFormat: false, + error: `Unsupported file format: ${ext}`, + }; +} + +/** + * Prepare file for OCR processing + * @param {string} filePath - Path to file + * @param {Object} [options={}] - Conversion options + * @returns {Promise} Prepared file info + */ +async function prepareFileForOCR(filePath, options = {}) { + const conversionInfo = await checkConversionNeeded(filePath); + + if (!conversionInfo.supportedFormat) { + throw new Error(conversionInfo.error); + } + + // For files that don't need conversion, return original + if (!conversionInfo.needsConversion) { + return { + filePath, + originalPath: filePath, + converted: false, + method: conversionInfo.method, + }; + } + + // Handle conversions + switch (conversionInfo.method) { + case 'excel-to-image': { + return await handleExcelFile(filePath, options); + } + + case 'msg-to-text': { + return await handleMsgFile(filePath, options); + } + + default: { + throw new Error(`Conversion method not implemented: 
${conversionInfo.method}`); + } + } +} + +/** + * Handle Excel file (.xlsx, .xls) + * For MVP: Extract text content and format as readable text + * Future: Could convert to images for visual OCR + * @private + */ +async function handleExcelFile(filePath, _options) { + // Note: This is a placeholder implementation + // Full implementation would use xlsx library to read and format cell data + + return { + filePath, + originalPath: filePath, + converted: true, + method: 'excel-direct-read', + note: 'Excel files sent directly to OCR - structured data extraction may vary', + }; +} + +/** + * Handle Outlook MSG file + * Extract text content and attachments + * @private + */ +async function handleMsgFile(filePath, _options) { + // Note: This is a placeholder implementation + // Full implementation would use @kenjiuno/msgreader to extract message content + + return { + filePath, + originalPath: filePath, + converted: true, + method: 'msg-text-extraction', + note: 'MSG file content will be extracted as text', + }; +} + +/** + * Clean up temporary files created during conversion + * @param {Object} preparedFile - Result from prepareFileForOCR + * @returns {Promise} + */ +async function cleanupConversion(preparedFile) { + if (!preparedFile.converted) { + return; // Nothing to clean up + } + + // If we created temporary files, delete them + if (preparedFile.tempFiles && Array.isArray(preparedFile.tempFiles)) { + for (const tempFile of preparedFile.tempFiles) { + try { + if (await fs.pathExists(tempFile)) { + await fs.remove(tempFile); + } + } catch (error) { + console.warn(`Warning: Could not delete temp file ${tempFile}: ${error.message}`); + } + } + } +} + +/** + * Get file metadata useful for processing + * @param {string} filePath - Path to file + * @returns {Promise} File metadata + */ +async function getFileMetadata(filePath) { + const stats = await fs.stat(filePath); + const ext = path.extname(filePath).toLowerCase(); + + return { + filePath, + fileName: 
path.basename(filePath), + extension: ext, + size: stats.size, + sizeHuman: formatBytes(stats.size), + created: stats.birthtime, + modified: stats.mtime, + isDirectory: stats.isDirectory(), + }; +} + +/** + * Format bytes to human-readable string + * @private + */ +function formatBytes(bytes) { + if (bytes === 0) return '0 Bytes'; + + const k = 1024; + const sizes = ['Bytes', 'KB', 'MB', 'GB']; + const i = Math.floor(Math.log(bytes) / Math.log(k)); + + return `${Number.parseFloat((bytes / Math.pow(k, i)).toFixed(2))} ${sizes[i]}`; +} + +/** + * Validate file is readable and accessible + * @param {string} filePath - Path to file + * @returns {Promise} Validation result + */ +async function validateFile(filePath) { + try { + // Check existence + if (!(await fs.pathExists(filePath))) { + return { + valid: false, + error: 'File does not exist', + }; + } + + // Check if it's a file (not directory) + const stats = await fs.stat(filePath); + if (stats.isDirectory()) { + return { + valid: false, + error: 'Path is a directory, not a file', + }; + } + + // Check if readable + try { + await fs.access(filePath, fs.constants.R_OK); + } catch { + return { + valid: false, + error: 'File is not readable (permission denied)', + }; + } + + // Check file size (warn if > 10MB) + const maxSize = 10 * 1024 * 1024; // 10MB + if (stats.size > maxSize) { + return { + valid: true, + warning: `File size (${formatBytes(stats.size)}) exceeds 10MB - OCR may be slow`, + }; + } + + return { + valid: true, + }; + } catch (error) { + return { + valid: false, + error: error.message, + }; + } +} + +module.exports = { + checkConversionNeeded, + prepareFileForOCR, + cleanupConversion, + getFileMetadata, + validateFile, +}; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-file-mover.js b/src/modules/bmm/tasks/ocr-extraction/task-file-mover.js new file mode 100644 index 00000000..ca4f3020 --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-file-mover.js @@ -0,0 +1,31 @@ +/** + * File 
Mover Task + * Moves processed files to done folder with folder structure preservation + */ + +const fs = require('fs-extra'); +const path = require('node:path'); + +/** + * Move processed file to done folder + * @param {string} sourcePath - Original file path + * @param {string} sourceRoot - Source root directory + * @param {string} doneFolder - Destination folder + * @param {boolean} preserveStructure - Maintain folder structure + * @returns {Promise} Move result + */ +async function moveProcessedFile(sourcePath, sourceRoot, doneFolder, preserveStructure = true) { + const relativePath = path.relative(sourceRoot, sourcePath); + const destPath = preserveStructure ? path.join(doneFolder, relativePath) : path.join(doneFolder, path.basename(sourcePath)); + + await fs.ensureDir(path.dirname(destPath)); + await fs.move(sourcePath, destPath); + + return { + originalPath: sourcePath, + newPath: destPath, + timestamp: new Date().toISOString(), + }; +} + +module.exports = { moveProcessedFile }; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-file-scanner.js b/src/modules/bmm/tasks/ocr-extraction/task-file-scanner.js new file mode 100644 index 00000000..e23e6319 --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-file-scanner.js @@ -0,0 +1,210 @@ +/** + * File Scanner Task + * Recursively scans folders for supported document types + * Filters already-processed files and builds processing queue + */ + +const fs = require('fs-extra'); +const path = require('node:path'); +const glob = require('glob'); + +/** + * Scan source folder for supported files + * @param {Object} config - Configuration object + * @param {string} config.sourcePath - Path to source documents folder + * @param {string[]} config.fileTypes - Supported file extensions (e.g., ['pdf', 'xlsx']) + * @param {string} [config.processingLogPath] - Path to processing log (to skip already-processed files) + * @param {boolean} [config.recursive=true] - Scan subdirectories recursively + * @returns 
{Promise} Scan results with file list and statistics + */ +async function scanFiles(config) { + const { sourcePath, fileTypes = ['pdf', 'xlsx', 'xls', 'msg'], processingLogPath = null, recursive = true } = config; + + // Validate source path + if (!sourcePath) { + throw new Error('Source path is required'); + } + + const absolutePath = path.resolve(sourcePath); + + if (!(await fs.pathExists(absolutePath))) { + throw new Error(`Source path does not exist: ${absolutePath}`); + } + + const stats = await fs.stat(absolutePath); + if (!stats.isDirectory()) { + throw new Error(`Source path is not a directory: ${absolutePath}`); + } + + // Build glob patterns for supported file types + const patterns = fileTypes.map((ext) => { + const cleanExt = ext.startsWith('.') ? ext.slice(1) : ext; + return recursive ? `**/*.${cleanExt}` : `*.${cleanExt}`; + }); + + // Load processing log to filter already-processed files + let processedFiles = new Set(); + if (processingLogPath && (await fs.pathExists(processingLogPath))) { + try { + const logData = await fs.readJson(processingLogPath); + if (logData.processedFiles && Array.isArray(logData.processedFiles)) { + processedFiles = new Set(logData.processedFiles.map((f) => path.normalize(f.filePath))); + } + } catch (error) { + console.warn(`Warning: Could not load processing log: ${error.message}`); + } + } + + // Scan for files + const allFiles = []; + const filesByType = {}; + + for (const pattern of patterns) { + const files = await new Promise((resolve, reject) => { + glob( + pattern, + { + cwd: absolutePath, + absolute: true, + nodir: true, + }, + (err, matches) => { + if (err) reject(err); + else resolve(matches); + }, + ); + }); + + allFiles.push(...files); + } + + // Build file metadata + const filesWithMetadata = await Promise.all( + allFiles.map(async (filePath) => { + const stats = await fs.stat(filePath); + const ext = path.extname(filePath).slice(1).toLowerCase(); + const relativePath = path.relative(absolutePath, filePath); 
+ const normalizedPath = path.normalize(filePath); + + // Track files by type + if (!filesByType[ext]) { + filesByType[ext] = 0; + } + filesByType[ext]++; + + return { + filePath: normalizedPath, + relativePath, + fileName: path.basename(filePath), + fileType: ext, + fileSize: stats.size, + modifiedDate: stats.mtime, + alreadyProcessed: processedFiles.has(normalizedPath), + }; + }), + ); + + // Separate processed and unprocessed files + const unprocessedFiles = filesWithMetadata.filter((f) => !f.alreadyProcessed); + const alreadyProcessedFiles = filesWithMetadata.filter((f) => f.alreadyProcessed); + + // Calculate statistics + const statistics = { + totalFilesFound: filesWithMetadata.length, + unprocessedCount: unprocessedFiles.length, + alreadyProcessedCount: alreadyProcessedFiles.length, + filesByType, + totalSize: filesWithMetadata.reduce((sum, f) => sum + f.fileSize, 0), + sourcePath: absolutePath, + scanDate: new Date().toISOString(), + }; + + return { + allFiles: filesWithMetadata, + unprocessedFiles, + alreadyProcessedFiles, + statistics, + }; +} + +/** + * Get file count by type + * @param {Object} scanResults - Results from scanFiles() + * @returns {Object} Count of files by type + */ +function getFileCountByType(scanResults) { + return scanResults.statistics.filesByType; +} + +/** + * Sort files by priority (e.g., smallest first for faster feedback) + * @param {Array} files - Array of file metadata objects + * @param {string} strategy - Sorting strategy ('size-asc', 'size-desc', 'date-asc', 'date-desc', 'name') + * @returns {Array} Sorted files + */ +function sortFiles(files, strategy = 'size-asc') { + const sorted = [...files]; + + switch (strategy) { + case 'size-asc': { + return sorted.sort((a, b) => a.fileSize - b.fileSize); + } + case 'size-desc': { + return sorted.sort((a, b) => b.fileSize - a.fileSize); + } + case 'date-asc': { + return sorted.sort((a, b) => new Date(a.modifiedDate) - new Date(b.modifiedDate)); + } + case 'date-desc': { + return 
sorted.sort((a, b) => new Date(b.modifiedDate) - new Date(a.modifiedDate)); + } + case 'name': { + return sorted.sort((a, b) => a.fileName.localeCompare(b.fileName)); + } + default: { + return sorted; + } + } +} + +/** + * Create processing queue with optional prioritization + * @param {Object} scanResults - Results from scanFiles() + * @param {Object} options - Queue options + * @param {string} [options.sortStrategy='size-asc'] - How to sort files + * @param {number} [options.batchSize=null] - Split into batches of this size + * @returns {Object} Processing queue + */ +function createProcessingQueue(scanResults, options = {}) { + const { sortStrategy = 'size-asc', batchSize = null } = options; + + let queue = sortFiles(scanResults.unprocessedFiles, sortStrategy); + + const result = { + files: queue, + totalFiles: queue.length, + batches: null, + }; + + // Split into batches if requested + if (batchSize && batchSize > 0) { + const batches = []; + for (let i = 0; i < queue.length; i += batchSize) { + batches.push({ + batchNumber: Math.floor(i / batchSize) + 1, + files: queue.slice(i, i + batchSize), + fileCount: Math.min(batchSize, queue.length - i), + }); + } + result.batches = batches; + } + + return result; +} + +module.exports = { + scanFiles, + getFileCountByType, + sortFiles, + createProcessingQueue, +}; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-ocr-process.js b/src/modules/bmm/tasks/ocr-extraction/task-ocr-process.js new file mode 100644 index 00000000..387fc94e --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-ocr-process.js @@ -0,0 +1,265 @@ +/** + * OCR Processing Task + * Sends documents to Mistral OCR API via OpenRouter + * Handles retry logic, rate limiting, and error recovery + */ + +const fs = require('fs-extra'); +const path = require('node:path'); + +/** + * Process a document with OCR via OpenRouter API + * @param {Object} config - Configuration object + * @param {string} config.filePath - Path to file to process + * @param 
{string} config.apiKey - OpenRouter API key + * @param {string} [config.model='mistral/pixtral-large-latest'] - Model to use + * @param {string} [config.endpoint='https://openrouter.ai/api/v1/chat/completions'] - API endpoint + * @param {string} config.extractionPrompt - Prompt for data extraction + * @param {number} [config.timeout=60000] - Request timeout in ms + * @param {number} [config.maxRetries=3] - Maximum retry attempts + * @param {number} [config.retryDelay=2000] - Delay between retries in ms + * @returns {Promise} OCR result with text and metadata + */ +async function processFileWithOCR(config) { + const { + filePath, + apiKey, + model = 'mistral/pixtral-large-latest', + endpoint = 'https://openrouter.ai/api/v1/chat/completions', + extractionPrompt, + timeout = 60_000, + maxRetries = 3, + retryDelay = 2000, + } = config; + + // Validation + if (!filePath || !apiKey || !extractionPrompt) { + throw new Error('filePath, apiKey, and extractionPrompt are required'); + } + + if (!(await fs.pathExists(filePath))) { + throw new Error(`File not found: ${filePath}`); + } + + // Convert file to base64 + const fileBuffer = await fs.readFile(filePath); + const base64Data = fileBuffer.toString('base64'); + const mimeType = getMimeType(path.extname(filePath)); + const dataUrl = `data:${mimeType};base64,${base64Data}`; + + // Prepare API request + const requestBody = { + model, + messages: [ + { + role: 'user', + content: [ + { + type: 'image_url', + image_url: { + url: dataUrl, + }, + }, + { + type: 'text', + text: extractionPrompt, + }, + ], + }, + ], + }; + + // Execute with retry logic + let lastError; + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + const result = await makeAPIRequest(endpoint, apiKey, requestBody, timeout); + + // Extract OCR text from response + const ocrText = result.choices?.[0]?.message?.content || ''; + + return { + success: true, + ocrText, + filePath, + model, + timestamp: new Date().toISOString(), + attempt, + 
rawResponse: result, + }; + } catch (error) { + lastError = error; + + // Don't retry on certain errors + if (error.message.includes('authentication') || error.message.includes('invalid') || error.message.includes('not supported')) { + throw error; + } + + // Wait before retrying + if (attempt < maxRetries) { + await sleep(retryDelay * attempt); // Exponential backoff + } + } + } + + // All retries failed + throw new Error(`OCR processing failed after ${maxRetries} attempts: ${lastError.message}`); +} + +/** + * Make API request to OpenRouter + * @private + */ +async function makeAPIRequest(endpoint, apiKey, body, timeout) { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeout); + + try { + const response = await fetch(endpoint, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + 'HTTP-Referer': 'https://github.com/bmad-code-org/BMAD-METHOD', + 'X-Title': 'BMAD-METHOD OCR Extraction', + }, + body: JSON.stringify(body), + signal: controller.signal, + }); + + clearTimeout(timeoutId); + + if (!response.ok) { + const errorData = await response.json().catch(() => ({})); + throw new Error(`API request failed: ${response.status} ${response.statusText} - ${JSON.stringify(errorData)}`); + } + + return await response.json(); + } catch (error) { + clearTimeout(timeoutId); + + if (error.name === 'AbortError') { + throw new Error(`API request timed out after ${timeout}ms`); + } + + throw error; + } +} + +/** + * Get MIME type from file extension + * @private + */ +function getMimeType(extension) { + const ext = extension.toLowerCase(); + const mimeTypes = { + '.pdf': 'application/pdf', + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.gif': 'image/gif', + '.webp': 'image/webp', + }; + + return mimeTypes[ext] || 'application/octet-stream'; +} + +/** + * Sleep utility + * @private + */ +function sleep(ms) { + return new Promise((resolve) => 
setTimeout(resolve, ms)); +} + +/** + * Process multiple files in batch with concurrency control + * @param {Array} files - Array of file metadata objects + * @param {Object} config - Configuration for OCR processing + * @param {number} [concurrency=3] - Number of concurrent API calls + * @param {Function} [onProgress] - Progress callback (current, total, file) + * @returns {Promise} Batch processing results + */ +async function processBatch(files, config, concurrency = 3, onProgress = null) { + const results = []; + const errors = []; + let completed = 0; + + // Process files in chunks to control concurrency + for (let i = 0; i < files.length; i += concurrency) { + const chunk = files.slice(i, i + concurrency); + + const chunkResults = await Promise.allSettled( + chunk.map((file) => + processFileWithOCR({ + ...config, + filePath: file.filePath, + }), + ), + ); + + for (const [j, result] of chunkResults.entries()) { + const file = chunk[j]; + completed++; + + if (result.status === 'fulfilled') { + results.push({ + ...result.value, + fileName: file.fileName, + fileType: file.fileType, + }); + } else { + errors.push({ + filePath: file.filePath, + fileName: file.fileName, + error: result.reason.message, + timestamp: new Date().toISOString(), + }); + } + + // Call progress callback + if (onProgress) { + onProgress(completed, files.length, file); + } + } + } + + return { + successful: results, + failed: errors, + totalProcessed: completed, + successRate: files.length > 0 ? 
(results.length / files.length) * 100 : 0, + }; +} + +/** + * Calculate confidence score based on OCR response + * @param {Object} ocrResult - Result from processFileWithOCR + * @returns {number} Confidence score (0-1) + */ +function calculateConfidence(ocrResult) { + // Simple heuristic - can be enhanced + const text = ocrResult.ocrText || ''; + + let score = 0.5; // Base score + + // Longer text generally means better extraction + if (text.length > 100) score += 0.1; + if (text.length > 500) score += 0.1; + + // Check for common data patterns + if (/\d{1,2}[-/]\d{1,2}[-/]\d{2,4}/.test(text)) score += 0.1; // Dates + if (/\$?\d+[.,]\d{2}/.test(text)) score += 0.1; // Currency + if (/[A-Z][a-z]+\s+[A-Z][a-z]+/.test(text)) score += 0.1; // Names + + // Penalize very short responses + if (text.length < 50) score -= 0.2; + + return Math.max(0, Math.min(1, score)); +} + +module.exports = { + processFileWithOCR, + processBatch, + calculateConfidence, +}; diff --git a/src/modules/bmm/tasks/ocr-extraction/task-processing-reporter.js b/src/modules/bmm/tasks/ocr-extraction/task-processing-reporter.js new file mode 100644 index 00000000..4b533c40 --- /dev/null +++ b/src/modules/bmm/tasks/ocr-extraction/task-processing-reporter.js @@ -0,0 +1,63 @@ +/** + * Processing Reporter Task + * Generates comprehensive processing reports and logs + */ + +const fs = require('fs-extra'); +const path = require('node:path'); + +/** + * Generate processing report + * @param {Object} results - Batch processing results + * @param {Object} _config - Configuration + * @returns {Promise} Report content + */ +async function generateReport(results, _config) { + const report = `# OCR Data Extraction Results + +**Date:** ${new Date().toISOString()} +**Total Files Processed:** ${results.processed.length + results.failed.length + results.skipped.length} +**Successful:** ${results.processed.length} +**Failed:** ${results.failed.length} +**Skipped:** ${results.skipped.length} + +## Successful Extractions 
+ +${results.processed.map((r) => `- ${r.file} (Confidence: ${Math.round(r.confidence * 100)}%)`).join('\n')} + +## Failed Extractions + +${results.failed.map((r) => `- ${r.file}: ${r.error}`).join('\n')} + +## Skipped Files + +${results.skipped.map((r) => `- ${r.file}: ${r.reason}`).join('\n')} +`; + + return report; +} + +/** + * Save processing log as JSON + * @param {Object} results - Batch processing results + * @param {string} logPath - Path to save log + * @returns {Promise} + */ +async function saveProcessingLog(results, logPath) { + await fs.ensureDir(path.dirname(logPath)); + + const log = { + timestamp: new Date().toISOString(), + processedFiles: results.processed.map((r) => ({ + filePath: r.file, + confidence: r.confidence, + data: r.data, + })), + failedFiles: results.failed, + skippedFiles: results.skipped, + }; + + await fs.writeJson(logPath, log, { spaces: 2 }); +} + +module.exports = { generateReport, saveProcessingLog }; diff --git a/src/modules/bmm/workflows/data-extraction/ocr-to-excel/README.md b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/README.md index 85ff4b72..0f74bdc5 100644 --- a/src/modules/bmm/workflows/data-extraction/ocr-to-excel/README.md +++ b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/README.md @@ -99,14 +99,14 @@ The workflow uses a YAML configuration file. 
Copy `config-template.yaml` to your # API Configuration api: provider: openrouter - model: "mistral/pixtral-large-latest" + model: 'mistral/pixtral-large-latest' api_key: ${OPENROUTER_API_KEY} # File Paths paths: - source_folder: "./source-documents" - master_file: "./master-file.xlsx" - processed_folder: "./processed/done" + source_folder: './source-documents' + master_file: './master-file.xlsx' + processed_folder: './processed/done' # Extraction Fields extraction_fields: @@ -197,17 +197,17 @@ Extract sales data from PDF reports: extraction_fields: - name: date type: date - format: "YYYY-MM-DD" - description: "Sales report date" + format: 'YYYY-MM-DD' + description: 'Sales report date' - name: store_name type: string - description: "Tenant/store name" + description: 'Tenant/store name' - name: sales_amount type: number - format: "currency" - description: "Total sales" + format: 'currency' + description: 'Total sales' ``` ## Implementation Plan @@ -336,20 +336,20 @@ The workflow uses OpenRouter's Mistral Pixtral Large model for OCR: ```javascript // Example API call (implementation in Phase 2) -const response = await fetch("https://openrouter.ai/api/v1/chat/completions", { - method: "POST", +const response = await fetch('https://openrouter.ai/api/v1/chat/completions', { + method: 'POST', headers: { Authorization: `Bearer ${apiKey}`, - "Content-Type": "application/json", + 'Content-Type': 'application/json', }, body: JSON.stringify({ - model: "mistral/pixtral-large-latest", + model: 'mistral/pixtral-large-latest', messages: [ { - role: "user", + role: 'user', content: [ - { type: "image_url", image_url: { url: base64Image } }, - { type: "text", text: "Extract: date, store name, amount..." }, + { type: 'image_url', image_url: { url: base64Image } }, + { type: 'text', text: 'Extract: date, store name, amount...' 
}, ], }, ], diff --git a/src/modules/bmm/workflows/data-extraction/ocr-to-excel/TROUBLESHOOTING.md b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/TROUBLESHOOTING.md new file mode 100644 index 00000000..d4c5848c --- /dev/null +++ b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/TROUBLESHOOTING.md @@ -0,0 +1,261 @@ +# OCR to Excel Workflow - Troubleshooting Guide + +## Common Issues and Solutions + +### API Key Issues + +**Problem:** "API key not found" or authentication errors + +**Solutions:** + +```bash +# Set API key as environment variable +export OPENROUTER_API_KEY="your-key-here" + +# Verify it's set +echo $OPENROUTER_API_KEY + +# Add to your shell profile for persistence +echo 'export OPENROUTER_API_KEY="your-key"' >> ~/.zshrc +source ~/.zshrc +``` + +### OCR Quality Issues + +**Problem:** Low confidence scores or poor extraction accuracy + +**Solutions:** + +1. **Check source document quality** + - Ensure PDFs are not scanned at low DPI + - Verify images are clear and readable + - Check that text is not too small + +2. **Adjust extraction prompts** + - Be more specific about field locations + - Add examples of expected formats + - Use field descriptions that match document labels + +3. **Review OCR output** + - Check raw OCR text in processing logs + - Identify patterns that might need custom extraction logic + +### File Processing Errors + +**Problem:** "File not found" or permission denied errors + +**Solutions:** + +```bash +# Check file permissions +ls -la /path/to/files + +# Fix permissions if needed +chmod 644 /path/to/files/* + +# Ensure directories are readable +chmod 755 /path/to/directories +``` + +**Problem:** Unsupported file format + +**Solutions:** + +- Verify file extension matches supported types (pdf, xlsx, xls, msg) +- Check that file is not corrupted +- Try opening file manually to verify it's valid + +### Excel Writing Issues + +**Problem:** "Failed to write to Excel file" + +**Solutions:** + +1. 
**Close Excel file if it's open** + - Excel must be closed for writing + - Check for hidden Excel processes + +2. **Verify file permissions** + + ```bash + ls -la master-file.xlsx + chmod 644 master-file.xlsx + ``` + +3. **Check disk space** + + ```bash + df -h + ``` + +4. **Restore from backup if corrupted** + - Backups are in `./backups/` folder + - Find most recent backup and restore + +### Performance Issues + +**Problem:** Processing is very slow + +**Solutions:** + +1. **Reduce parallel processing** + - Lower `parallel_limit` in config (try 1 or 2) + - Some API rate limits may cause slowdowns + +2. **Process in smaller batches** + - Set `batch_size` to 5-10 files + - Process folders separately + +3. **Check network connectivity** + - OCR requires stable internet + - Test API endpoint manually + +### Low Confidence Extractions + +**Problem:** Many files flagged for manual review + +**Solutions:** + +1. **Lower confidence threshold** + - Change `confidence_threshold` from 0.85 to 0.70 + - Review more carefully after processing + +2. **Improve field definitions** + - Add custom regex patterns for your data + - Provide more descriptive field names + +3. 
**Pre-process documents** + - Standardize document formats when possible + - Ensure consistent data placement + +## Error Messages + +### "OpenRouter API request failed: 401" + +- **Cause:** Invalid or expired API key +- **Fix:** Check your API key at https://openrouter.ai/keys + +### "OpenRouter API request failed: 429" + +- **Cause:** Rate limit exceeded +- **Fix:** Reduce `parallel_limit` or add delays between requests + +### "File conversion failed" + +- **Cause:** Unsupported file format or corrupted file +- **Fix:** Check file integrity, convert manually if needed + +### "Excel file locked" + +- **Cause:** File is open in another application +- **Fix:** Close Excel and all file viewers + +### "Insufficient credits" + +- **Cause:** OpenRouter account has no credits +- **Fix:** Add credits at https://openrouter.ai/credits + +## Debugging Tips + +### Enable Debug Logging + +```yaml +# In your config file +logging: + level: 'debug' # Change from "info" + log_to_console: true +``` + +### Check Processing Logs + +```bash +# View recent processing logs +cat logs/processing-log-*.json | jq . + +# Check for errors +grep -i "error" logs/*.json +``` + +### Test with Single File + +Process one file at a time to isolate issues: + +1. Move all but one file out of source folder +2. Run workflow +3. Check results carefully +4. If successful, gradually add more files + +### Verify API Connectivity + +```bash +# Test OpenRouter API manually +curl -X POST https://openrouter.ai/api/v1/chat/completions \ + -H "Authorization: Bearer $OPENROUTER_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"model":"mistral/pixtral-large-latest","messages":[{"role":"user","content":"test"}]}' +``` + +## Getting Help + +If you're still experiencing issues: + +1. **Check GitHub Issues:** https://github.com/bmad-code-org/BMAD-METHOD/issues/763 +2. **Join Discord:** BMAD-METHOD community channel +3. **Review Documentation:** See README.md in this workflow folder +4. 
**Check Logs:** Always include error messages and log files when reporting issues + +## Configuration Examples + +### For Scanned PDFs + +```yaml +processing: + confidence_threshold: 0.70 # Lower threshold for scanned docs + pause_on_low_confidence: true # Always review +``` + +### For High-Volume Processing + +```yaml +processing: + parallel_limit: 5 # More concurrent requests + batch_size: 20 # Larger batches + confidence_threshold: 0.90 # Higher confidence to reduce reviews +``` + +### For Sensitive Documents + +```yaml +api: + # Use local OCR instead (future feature) + provider: local + model: tesseract + +logging: + log_to_file: false # Don't log sensitive data +``` + +## Best Practices + +1. **Always test with sample files first** +2. **Keep regular backups of your master Excel file** +3. **Review low-confidence extractions carefully** +4. **Monitor API costs if processing large volumes** +5. **Use version control for your configuration files** +6. **Document any custom patterns or rules you add** + +## Performance Benchmarks + +Typical processing speeds (varies by file size and API response time): + +- **PDF files (1-5 pages):** 3-5 seconds per file +- **Excel files:** 2-4 seconds per file +- **MSG files:** 4-6 seconds per file + +With parallel processing (3 concurrent): + +- **100 files:** ~10-15 minutes +- **500 files:** ~50-75 minutes +- **1000 files:** ~2-3 hours + +Note: Actual times depend on API rate limits and network speed. 
diff --git a/src/modules/bmm/workflows/data-extraction/ocr-to-excel/checklist.md b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/checklist.md index 6c06aa2e..aa4668f5 100644 --- a/src/modules/bmm/workflows/data-extraction/ocr-to-excel/checklist.md +++ b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/checklist.md @@ -236,8 +236,8 @@ If issues occur, verify: --- -**Processed By:** ******\_\_\_****** -**Date:** ******\_\_\_****** -**Batch Size:** ******\_\_\_****** -**Issues Found:** ******\_\_\_****** -**Resolution:** ******\_\_\_****** +**Processed By:** **\*\***\_\_\_**\*\*** +**Date:** **\*\***\_\_\_**\*\*** +**Batch Size:** **\*\***\_\_\_**\*\*** +**Issues Found:** **\*\***\_\_\_**\*\*** +**Resolution:** **\*\***\_\_\_**\*\*** diff --git a/src/modules/bmm/workflows/data-extraction/ocr-to-excel/examples/sample-config.yaml b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/examples/sample-config.yaml new file mode 100644 index 00000000..0ea48aea --- /dev/null +++ b/src/modules/bmm/workflows/data-extraction/ocr-to-excel/examples/sample-config.yaml @@ -0,0 +1,71 @@ +# Example OCR to Excel Configuration +# Copy this file to your project root and customize + +# API Configuration +api: + provider: openrouter + model: "mistral/pixtral-large-latest" + api_key: ${OPENROUTER_API_KEY} + endpoint: "https://openrouter.ai/api/v1/chat/completions" + timeout: 60000 + max_retries: 3 + retry_delay: 2000 + +# File Paths +paths: + source_folder: "./source-documents" + processed_folder: "./processed/done" + master_file: "./master-data.xlsx" + backup_folder: "./backups" + log_folder: "./logs" + +# Extraction Fields (customize for your data) +extraction_fields: + - name: date + type: date + format: "YYYY-MM-DD" + required: true + description: "Document date" + + - name: store_name + type: string + required: true + description: "Store or tenant name" + + - name: sales_amount + type: number + required: true + description: "Total sales amount" + + - name: 
employee_name + type: string + required: false + description: "Employee name" + +# Processing Settings +processing: + batch_size: 10 + parallel_limit: 3 + confidence_threshold: 0.85 + pause_on_low_confidence: true + skip_duplicates: true + +# File Types +file_types: + - pdf + - xlsx + - xls + - msg + +# Excel Configuration +excel: + sheet_name: "Extracted Data" + start_row: 2 + create_sheet_if_missing: true + backup_before_write: true + +# Logging +logging: + level: "info" + log_to_file: true + log_to_console: true