diff --git a/src/psm/agents/mlops/mlops.agent.yaml b/src/psm/agents/mlops/mlops.agent.yaml deleted file mode 100644 index 7e5613e4d..000000000 --- a/src/psm/agents/mlops/mlops.agent.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# MLOps & Performance Engineer Agent Definition - -agent: - metadata: - id: "_bmad/psm/agents/mlops.md" - name: Linh - title: MLOps & Performance Engineer - icon: 🤖 - module: psm - hasSidecar: false - - persona: - role: MLOps Specialist + Performance Engineer - identity: MLOps specialist bridging ML research and production. Expert in model serving, pipeline optimization, and chaos engineering. - communication_style: Data-driven, experimental. Thinks in pipelines and metrics. Ship fast, measure everything. - principles: Reproducibility first; monitor model drift; chaos engineering validates assumptions; cost-aware optimization. - - menu: - - trigger: MD or fuzzy match on mlops-deploy - workflow: "skill:bmad-psm-mlops-deployment" - description: "[MD] MLOps Deployment — Model validation, deploy, monitor" diff --git a/src/psm/agents/security/security.agent.yaml b/src/psm/agents/security/security.agent.yaml deleted file mode 100644 index 974106013..000000000 --- a/src/psm/agents/security/security.agent.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Security & Infrastructure Engineer Agent Definition - -agent: - metadata: - id: "_bmad/psm/agents/security.md" - name: Hà - title: Security & Infrastructure Engineer - icon: 🛡️ - module: psm - hasSidecar: false - - persona: - role: Security Specialist + Infrastructure Expert - identity: Security specialist with expertise in defense-in-depth, compliance frameworks, and infrastructure hardening. Thorough and detail-oriented. - communication_style: Thorough, detail-oriented. Asks 'what if' scenarios. Thinks about edge cases and threat models. - principles: Zero trust architecture; defense in depth; security by default; least privilege.
- - menu: - - trigger: SA or fuzzy match on security-audit - workflow: "skill:bmad-psm-security-audit" - description: "[SA] Security Audit — Scope, audit, report" diff --git a/src/psm/agents/sre/sre-sidecar/production-standards.md b/src/psm/agents/sre/sre-sidecar/production-standards.md deleted file mode 100644 index aab20e291..000000000 --- a/src/psm/agents/sre/sre-sidecar/production-standards.md +++ /dev/null @@ -1,21 +0,0 @@ -# Production Standards for PSM - -SRE operational standards, incident response protocols, and production quality benchmarks. - -## User Specified CRITICAL Rules - Supersedes General Rules - -None - -## General CRITICAL RULES - -### Rule 1: SLO-First Approach -ALL production decisions MUST reference defined SLOs. No optimization without measurement baseline. - -### Rule 2: Blameless Postmortems -NEVER assign individual blame in incident analysis. Focus on systemic improvements. - -### Rule 3: Change Management -ALL production changes MUST have rollback plan, monitoring review, and stakeholder communication. - -### Rule 4: Severity Classification -SEV1: Complete outage >50% users. SEV2: Major degradation >20%. SEV3: Minor <20%. SEV4: Cosmetic. diff --git a/src/psm/agents/sre/sre.agent.yaml b/src/psm/agents/sre/sre.agent.yaml deleted file mode 100644 index dd2724ff0..000000000 --- a/src/psm/agents/sre/sre.agent.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Site Reliability Engineer Agent Definition - -agent: - metadata: - id: "_bmad/psm/agents/sre.md" - name: Minh - title: Site Reliability Engineer - icon: 🔧 - module: psm - hasSidecar: true - - persona: - role: Senior SRE + Production Operations Expert - identity: Senior SRE with deep expertise in reliability, observability, and operational excellence. Obsessed with SLOs, automation, and incident response. - communication_style: Metric-driven, systematic. Translates business goals to technical SLOs. Always asks 'what is the SLO?' first.
- principles: SLO-first approach; automate everything; measure before optimizing; blameless postmortems. - - menu: - - trigger: IR or fuzzy match on incident - workflow: "skill:bmad-psm-incident-response" - description: "[IR] Incident Response β€” Triage, diagnose, fix, postmortem" - - trigger: PR or fuzzy match on readiness - workflow: "skill:bmad-psm-production-readiness" - description: "[PR] Production Readiness Review β€” 9-dimension assessment" - - trigger: NS or fuzzy match on new-service - workflow: "skill:bmad-psm-setup-new-service" - description: "[NS] Setup New Service β€” Architecture to deployment" - - trigger: QD or fuzzy match on diagnose - workflow: "skill:bmad-psm-quick-diagnose" - description: "[QD] Quick Diagnose β€” Fast production troubleshooting" diff --git a/src/psm/config.yaml b/src/psm/config.yaml deleted file mode 100644 index 42a07adbe..000000000 --- a/src/psm/config.yaml +++ /dev/null @@ -1,13 +0,0 @@ -code: psm -name: "PSM: Production Systems & MLOps" -header: "BMad Production Systems Module" -subheader: "Production engineering workflows for incident response, production readiness, security, and MLOps." -description: "AI-driven production engineering framework with SRE, Security, and MLOps agents." -default_selected: false - -knowledge_base_path: - prompt: - - "Where is your production knowledge base? (folder with SKILL.md files)" - - "Leave default if you don't have one yet." - default: "docs/production-knowledge" - result: "{project-root}/{value}" diff --git a/src/psm/module-help.csv b/src/psm/module-help.csv deleted file mode 100644 index 567b4cafb..000000000 --- a/src/psm/module-help.csv +++ /dev/null @@ -1,7 +0,0 @@ -module,phase,name,code,sequence,workflow-file,command,required,agent,options,description,output-location,outputs, -psm,operations,Incident Response,IR,,skill:bmad-psm-incident-response,bmad-psm-incident-response,false,sre,Operations Mode,"Handle production incidents with systematic triage, diagnosis, and recovery. 
Use when the user says 'production is down' or 'incident response' or 'we have an outage'.",output_folder,"incident response report", -psm,operations,Production Readiness,PR,,skill:bmad-psm-production-readiness,bmad-psm-production-readiness,false,sre,Operations Mode,"Run production readiness review across 9 dimensions. Use when the user says 'are we ready for production' or 'PRR' or 'go-live check'.",output_folder,"production readiness assessment", -psm,operations,Security Audit,SA,,skill:bmad-psm-security-audit,bmad-psm-security-audit,false,security,Operations Mode,"Run comprehensive security audit and threat assessment. Use when the user says 'security audit' or 'vulnerability assessment' or 'security review'.",output_folder,"security audit report", -psm,operations,MLOps Deployment,MD,,skill:bmad-psm-mlops-deployment,bmad-psm-mlops-deployment,false,mlops,Operations Mode,"Deploy ML model to production with validation and monitoring. Use when the user says 'deploy model' or 'ML deployment' or 'model serving'.",output_folder,"mlops deployment report", -psm,operations,Setup New Service,NS,,skill:bmad-psm-setup-new-service,bmad-psm-setup-new-service,false,sre,Operations Mode,"Set up new production service from architecture through deployment. Use when the user says 'new service' or 'setup service' or 'new microservice'.",output_folder,"service setup plan", -psm,operations,Quick Diagnose,QD,,skill:bmad-psm-quick-diagnose,bmad-psm-quick-diagnose,false,sre,Operations Mode,"Quick diagnosis of production issue with minimal latency. 
Use when the user says 'something is broken' or 'quick diagnose' or 'what is happening?'.",output_folder,"diagnostic report", diff --git a/src/psm/module.yaml b/src/psm/module.yaml deleted file mode 100644 index 42a07adbe..000000000 --- a/src/psm/module.yaml +++ /dev/null @@ -1,13 +0,0 @@ -code: psm -name: "PSM: Production Systems & MLOps" -header: "BMad Production Systems Module" -subheader: "Production engineering workflows for incident response, production readiness, security, and MLOps." -description: "AI-driven production engineering framework with SRE, Security, and MLOps agents." -default_selected: false - -knowledge_base_path: - prompt: - - "Where is your production knowledge base? (folder with SKILL.md files)" - - "Leave default if you don't have one yet." - default: "docs/production-knowledge" - result: "{project-root}/{value}" diff --git a/src/psm/teams/default-party.csv b/src/psm/teams/default-party.csv deleted file mode 100644 index bd87313e7..000000000 --- a/src/psm/teams/default-party.csv +++ /dev/null @@ -1,4 +0,0 @@ -name,displayName,title,icon,role,identity,communicationStyle,principles,module,path -"sre","Minh","Site Reliability Engineer","🔧","Senior SRE + Production Operations Expert","Senior SRE with deep expertise in reliability, observability, and operational excellence. Obsessed with SLOs, automation, and incident response.","Metric-driven, systematic. Always asks 'what is the SLO?' first.","SLO-first; automate everything; measure before optimizing; blameless postmortems.","psm","bmad/psm/agents/sre.md" -"security","Hà","Security & Infrastructure Engineer","🛡️","Security Specialist + Infrastructure Expert","Security specialist with expertise in defense-in-depth, compliance frameworks, and infrastructure hardening.","Thorough, detail-oriented. Asks 'what if' scenarios.
Thinks about edge cases and threat models.","Zero trust; defense in depth; security by default; least privilege.","psm","bmad/psm/agents/security.md" -"mlops","Linh","MLOps & Performance Engineer","🤖","MLOps Specialist + Performance Engineer","MLOps specialist bridging ML research and production. Expert in model serving, pipeline optimization, and chaos engineering.","Data-driven, experimental. 'Ship fast, measure everything.'","Reproducibility first; monitor drift; chaos engineering validates; cost-aware optimization.","psm","bmad/psm/agents/mlops.md" diff --git a/src/psm/teams/ops-team.yaml b/src/psm/teams/ops-team.yaml deleted file mode 100644 index a7fa7dbd3..000000000 --- a/src/psm/teams/ops-team.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# Powered by BMAD-CORE™ -bundle: - name: Production Operations Team - icon: ⚙️ - description: Production engineering team for incident response, security, and MLOps -agents: "*" -party: "./default-party.csv" diff --git a/src/psm/workflows/bmad-psm-incident-response/SKILL.md b/src/psm/workflows/bmad-psm-incident-response/SKILL.md deleted file mode 100644 index 6d2fb39ef..000000000 --- a/src/psm/workflows/bmad-psm-incident-response/SKILL.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -name: bmad-psm-incident-response -description: 'Handle production incidents with systematic triage, diagnosis, and recovery. Use when the user says "production is down" or "incident response" or "we have an outage"' ---- - -Follow the instructions in [workflow.md](workflow.md).
diff --git a/src/psm/workflows/bmad-psm-incident-response/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-incident-response/bmad-skill-manifest.yaml deleted file mode 100644 index d0f08abdb..000000000 --- a/src/psm/workflows/bmad-psm-incident-response/bmad-skill-manifest.yaml +++ /dev/null @@ -1 +0,0 @@ -type: skill diff --git a/src/psm/workflows/bmad-psm-incident-response/incident-postmortem.template.md b/src/psm/workflows/bmad-psm-incident-response/incident-postmortem.template.md deleted file mode 100644 index 13904eb17..000000000 --- a/src/psm/workflows/bmad-psm-incident-response/incident-postmortem.template.md +++ /dev/null @@ -1,269 +0,0 @@ ---- -template_name: incident-postmortem -template_version: "1.0.0" -created_date: 2026-03-17 -description: Standard postmortem template for incident analysis and learning ---- - -# Incident Postmortem: {{INCIDENT_TITLE}} - -**Date**: {{INCIDENT_DATE}} -**Duration**: {{START_TIME}} β€” {{END_TIME}} ({{DURATION_MINUTES}} minutes) -**Severity**: {{SEV1|SEV2|SEV3}} ({{IMPACT_DESCRIPTION}}) -**Lead**: {{INCIDENT_COMMANDER_NAME}} -**Facilitator**: {{POSTMORTEM_FACILITATOR_NAME}} - ---- - -## Summary - -[1-2 paragraph executive summary of what happened, impact, and resolution] - -**Timeline at a glance**: -- T-0:00 β€” Normal operation -- T-{{TIME1}} β€” {{EVENT1}} -- T-{{TIME2}} β€” {{EVENT2}} -- T-{{RESOLUTION_TIME}} β€” Incident resolved - -**Impact**: {{METRIC1}} affected {{X}} users, {{METRIC2}}, {{METRIC3}} - ---- - -## Detailed Timeline - -| Time | Event | Notes | -|------|-------|-------| -| {{T}} | {{What happened}} | {{Who detected it}} | -| {{T+X}} | {{Next event}} | {{Action taken}} | -| {{T+Y}} | {{Root cause identified}} | {{By whom}} | -| {{T+Z}} | {{Fix applied}} | {{Verification steps}} | -| {{T+Final}} | {{Incident resolved}} | {{Verification}} | - ---- - -## Root Cause Analysis - -### Primary Cause - -**{{ROOT_CAUSE_TITLE}}** - -{{Detailed explanation of the root cause}} - -**How it happened**: -1. 
{{Precondition 1}} (why the system was vulnerable) -2. {{Trigger event}} (what caused the failure) -3. {{Failure cascade}} (why it got worse) -4. {{Detection lag}} (why it took X minutes to detect) - -**Evidence**: -- {{Log entry or metric showing the issue}} -- {{Related system behavior}} -- {{Impact indicator}} - -### Contributing Factors - -- {{Factor 1}} β€” {{Brief explanation}} -- {{Factor 2}} β€” {{Brief explanation}} -- {{Factor 3}} β€” {{Brief explanation}} - -### Why Didn't We Catch This? - -- {{Missing monitoring}} β€” {{What metric would have alerted}} -- {{Testing gap}} β€” {{What test would have failed}} -- {{Documentation gap}} β€” {{What runbook would have helped}} -- {{Knowledge gap}} β€” {{What training would have helped}} - ---- - -## Impact Assessment - -### User Impact - -- **Duration**: {{START_TIME}} β€” {{END_TIME}} ({{DURATION}} minutes) -- **Scale**: {{X}}% of {{METRIC}} (e.g., 5% of payment requests) -- **Users Affected**: {{APPROX_COUNT}} users -- **Revenue Impact**: {{$X}} (if applicable) -- **Customer Escalations**: {{NUMBER}} tickets opened - -**User-facing symptoms**: -- {{Symptom 1}} (e.g., "Checkout returns 500 error") -- {{Symptom 2}} (e.g., "Page loads slowly") -- {{Symptom 3}} - -### Operational Impact - -- **System Recovery**: {{SERVICE/METRIC}} took {{TIME}} to recover -- **Cascading Effects**: {{SERVICE_X}} also affected due to {{reason}} -- **On-call Load**: {{NUMBER}} pages, {{NUMBER}} escalations -- **Data Loss**: {{None | {{Description}}}} - ---- - -## Resolution & Recovery - -### Immediate Actions Taken - -1. **{{Time T+X}}** β€” {{Action 1}} - - Rationale: {{Why this helped}} - - Result: {{What changed}} - -2. **{{Time T+Y}}** β€” {{Action 2}} - - Rationale: {{Why this helped}} - - Result: {{What changed}} - -3. 
**{{Time T+Z}}** β€” {{Root Fix Applied}} - - Details: {{Technical description}} - - Verification: {{How we confirmed it worked}} - -### Rollback/Rollforward Decision - -**Decision**: {{Rollback to version X | Rollforward with fix | Hybrid approach}} - -**Rationale**: {{Explain why this was the right choice}} - -**Verification**: {{How we confirmed the fix worked}} - ---- - -## Lessons Learned - -### What Went Well - -- {{Thing we did right}} β€” This prevented {{worse outcome}} -- {{Thing we did right}} β€” Team coordination was excellent -- {{Thing we did right}} β€” Monitoring caught {{something}} - -### What We Can Improve - -| Issue | Category | Severity | Recommendation | Owner | -|-------|----------|----------|-----------------|-------| -| {{We didn't detect it for X minutes}} | Observability | HIGH | Add alert for {{metric}} when > {{threshold}} | DevOps | -| {{Runbook was outdated}} | Runbooks | MEDIUM | Update {{runbook}} with new architecture | SRE | -| {{New service not in alerting system}} | Process | MEDIUM | Add new services to alert config automatically | Platform | -| {{Team didn't know about new feature}} | Knowledge | LOW | Document new features in wiki | Tech Lead | - ---- - -## Action Items - -### Critical (Must Complete Before Similar Incident) - -- [ ] **{{Action 1}}** β€” {{Description}} - - Owner: {{NAME}} - - Deadline: {{DATE}} (within 1 week) - - Acceptance: {{How we verify it's done}} - -- [ ] **{{Action 2}}** β€” {{Description}} - - Owner: {{NAME}} - - Deadline: {{DATE}} (within 1 week) - - Acceptance: {{How we verify it's done}} - -### High Priority (Target Next 2 Weeks) - -- [ ] {{Action}} β€” Owner: {{NAME}}, Deadline: {{DATE}} -- [ ] {{Action}} β€” Owner: {{NAME}}, Deadline: {{DATE}} -- [ ] {{Action}} β€” Owner: {{NAME}}, Deadline: {{DATE}} - -### Medium Priority (Target This Sprint) - -- [ ] {{Action}} β€” Owner: {{NAME}} -- [ ] {{Action}} β€” Owner: {{NAME}} - -### Backlog (Good to Have) - -- [ ] {{Action}} β€” {{Description}} -- 
[ ] {{Action}} β€” {{Description}} - ---- - -## Prevention Measures - -### Short-term (1-2 Weeks) - -1. **{{Mitigation 1}}** β€” Prevents {{this exact incident}} from happening again - - How: {{Technical approach}} - - Effort: {{Estimate}} - - Timeline: {{When}} - -2. **{{Mitigation 2}}** β€” Catches similar issues earlier - - How: {{Technical approach}} - - Effort: {{Estimate}} - - Timeline: {{When}} - -### Long-term (Next Quarter) - -1. **{{Large architectural change}}** β€” Eliminates root cause class - - Rationale: {{Why this is better}} - - Effort: {{Estimate}} - - Timeline: {{When}} - ---- - -## Incident Stats - -``` -MTTD (Mean Time To Detect): {{MINUTES}} minutes - - Automatic detection: {{If applicable, how}} - - Manual detection: {{Who found it}} - -MTTR (Mean Time To Resolve): {{MINUTES}} minutes - - Investigation time: {{MINUTES}} - - Fix implementation time: {{MINUTES}} - - Verification time: {{MINUTES}} - -Severity: {{SEV1|SEV2|SEV3}} ({{Criteria}}) -``` - ---- - -## Distribution & Follow-up - -- [x] Postmortem shared with: {{TEAM_LIST}} -- [x] Customer communication sent: {{YES|NO|TEMPLATE_USED}} -- [x] Action items tracked in: {{JIRA/BACKLOG}} -- [x] Follow-up review scheduled: {{DATE}} - -**Follow-up Review**: {{DATE}} with {{ATTENDEES}} -- Confirm all critical action items completed -- Verify prevention measures working -- Check for recurring patterns - ---- - -## Appendix: Supporting Evidence - -### Logs - -``` -[Relevant log entries showing the incident] - -{{TIMESTAMP}} ERROR: {{MESSAGE}} -{{TIMESTAMP}} ERROR: {{MESSAGE}} -``` - -### Metrics - -[Include screenshots or links to metric dashboards showing the incident] - -- Error rate spike: [Chart or metric] -- Latency spike: [Chart or metric] -- Traffic pattern: [Chart or metric] - -### Configuration Changes - -```yaml -# Changes made before incident -- {{Change 1}} ({{TIMESTAMP}}) -- {{Change 2}} ({{TIMESTAMP}}) -``` - ---- - -**Document Completed By**: {{NAME}} -**Date**: {{DATE}} -**Review 
Status**: Draft | Final | Approved - -**Approvals**: -- [ ] Incident Commander: {{NAME}} {{DATE}} -- [ ] Service Owner: {{NAME}} {{DATE}} -- [ ] VP Engineering (if SEV1): {{NAME}} {{DATE}} diff --git a/src/psm/workflows/bmad-psm-incident-response/workflow.md b/src/psm/workflows/bmad-psm-incident-response/workflow.md deleted file mode 100644 index 2c7fa4541..000000000 --- a/src/psm/workflows/bmad-psm-incident-response/workflow.md +++ /dev/null @@ -1,163 +0,0 @@ ---- -workflow_id: W-INCIDENT-001 -workflow_name: Production Incident Response -version: 6.2.0 -lead_agent: "SRE Minh" -supporting_agents: ["Architect Khang", "Mary Analyst"] -phase: "3-Run: Emergency Response & Recovery" -created_date: 2026-03-17 -last_modified: 2026-03-17 -config_file: "_config/config.yaml" -estimated_duration: "15 minutes to 2 hours (depending on severity)" -outputFile: '{output_folder}/psm-artifacts/incident-{{project_name}}-{{date}}.md' ---- - -# Production Incident Response Workflow — BMAD Pattern - -## Metadata & Context - -**Goal**: Triage, diagnose, and resolve production incidents through systematic diagnosis and apply fixes with verification. This is the most critical workflow - minimize MTTR (Mean Time To Recovery) while maintaining system stability. - -**Lead Team**: -- SRE Minh (Incident Command, Recovery Orchestration) -- Architect Khang (Root Cause Analysis, System-wide Impact) -- Mary Analyst (Impact Assessment, Post-Incident Review) - -**Success Criteria**: -- ✓ Incident severity classified within 5 minutes -- ✓ Root cause identified within first triage pass -- ✓ Fix applied and verified -- ✓ System metrics returned to baseline -- ✓ Incident postmortem documented with action items -- ✓ Prevention measures identified - -## Workflow Overview - -Workflow này đi qua 4 bước atomic, mỗi bước focus vào một phase khác nhau: - -1.
**Step-02-Diagnose** → Systematic diagnosis using observability data (logs, metrics, traces) -3. **Step-03-Fix** → Apply fix, verify resolution, validate recovery -4. **Step-04-Postmortem** → Document incident, identify action items, prevent recurrence - -## Configuration Loading - -Tự động load từ `_config/config.yaml`: - -```yaml -project_context: - organization: "[loaded from config]" - environment: "production" - incident_channel: "slack:#incidents" - -workflow_defaults: - communication_language: "Vietnamese-English" - severity_levels: ["SEV1", "SEV2", "SEV3", "SEV4"] - escalation_contacts: "[loaded from config]" - on_call_engineer: "[loaded from config]" -``` - -## Workflow Architecture - Micro-File Design - -BMAD pattern: Mỗi step là một file riêng, load just-in-time. Workflow chain: - -``` -workflow.md (entry point) - ↓ -step-01-triage.md (classify severity, initial assessment) - ↓ -step-02-diagnose.md (root cause analysis) - ↓ -step-03-fix.md (apply fix, verify) - ↓ -step-04-postmortem.md (document, prevent) - ↓ -incident-response-summary.md (final output) -``` - -**Key Benefits**: -- Single-step focus — engineer concentrates on one phase -- Knowledge isolation — load only relevant SKILL docs per step -- State tracking — save progress after each step -- Easy resumption — if interrupted, restart from exact step - -## Skill References - -Workflow này load knowledge từ: - -- **5.07 Reliability & Resilience** → Circuit breaker patterns, fallback strategies, timeout management -- **5.08 Observability & Monitoring** → Structured logging, metrics queries, distributed tracing -- **5.09 Error Handling & Recovery** → Error classification, graceful degradation patterns -- **5.10 Production Readiness** → Incident prevention checklist, alerting setup -- **5.14 Documentation & Runbooks** → Postmortem templates, incident reports - -## Execution Model - -### Entry Point Logic - -``` -1.
Check if incident session exists - → If NEW incident: Start from step-01-triage.md - → If ONGOING: Load incident-session.yaml → continue from last completed step - → If RESOLVED: Load postmortem template - -2. For each step: - a) Load step-{N}-{name}.md - b) Load referenced SKILL files (auto-parse "Load:" directives) - c) Execute MENU [A][C] options - d) Save step output to step-{N}-output.md + incident-context.yaml - e) Move to next step or conclude - -3. Final: Generate incident report + postmortem in outputs folder -``` - -### State Tracking - -Incident session frontmatter tracks progress: - -```yaml -incident_context: - incident_id: "INC-2026-03-17-001" - severity: "SEV1" | "SEV2" | "SEV3" | "SEV4" - status: "triage" → "diagnosing" → "recovering" → "resolved" → "postmortem" - affected_services: ["service-1", "service-2"] - started_at: "2026-03-17T14:30:00Z" - timeline: - detected_at: "2026-03-17T14:30:00Z" - triage_completed_at: "2026-03-17T14:35:00Z" - root_cause_identified_at: "2026-03-17T14:50:00Z" - fix_applied_at: "2026-03-17T15:10:00Z" - resolved_at: "2026-03-17T15:15:00Z" - current_step: "step-02-diagnose" - last_updated: "2026-03-17T14:50:00Z" - incident_commander: "SRE Minh" -``` - -## Mandatory Workflow Rules - -1. **Speed first** — Triage must complete in < 5 minutes -2. **Root cause identification** — Must identify root cause before fix attempt -3. **Verify before declaring resolved** — Check metrics + user reports -4. **Document everything** — Every action logged for postmortem -5. **Escalation protocol** — SEV1 → Page on-call architect immediately -6. **Communication** — Update stakeholders every 5-10 minutes -7.
**No flying blind** — All fixes must reference observability data - -## Severity Scale - -- **SEV1** — Service completely down, revenue impact, > 1% users affected → Page all on-call -- **SEV2** — Major degradation, significant users affected, partial functionality down -- **SEV3** — Moderate impact, some users affected, workaround possible -- **SEV4** — Minor issue, limited users, can defer to business hours - -## Navigation - -Hãy chọn cách bắt đầu: - -- **[NEW-INC]** — Report new incident → Load step-01-triage -- **[RESUME-INC]** — Continue existing incident (detect progress from incident-session.yaml) -- **[ESCALATE]** — Escalate to on-call architect - ---- - -**Hãy báo cáo tình trạng incident hoặc chọn [NEW-INC] để bắt đầu triage** diff --git a/src/psm/workflows/bmad-psm-mlops-deployment/SKILL.md b/src/psm/workflows/bmad-psm-mlops-deployment/SKILL.md deleted file mode 100644 index e24eade5c..000000000 --- a/src/psm/workflows/bmad-psm-mlops-deployment/SKILL.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -name: bmad-psm-mlops-deployment -description: 'Deploy ML model to production with validation and monitoring. Use when the user says "deploy model" or "ML deployment" or "model serving"' ---- - -Follow the instructions in [workflow.md](workflow.md).
diff --git a/src/psm/workflows/bmad-psm-mlops-deployment/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-mlops-deployment/bmad-skill-manifest.yaml deleted file mode 100644 index d0f08abdb..000000000 --- a/src/psm/workflows/bmad-psm-mlops-deployment/bmad-skill-manifest.yaml +++ /dev/null @@ -1 +0,0 @@ -type: skill diff --git a/src/psm/workflows/bmad-psm-mlops-deployment/workflow.md b/src/psm/workflows/bmad-psm-mlops-deployment/workflow.md deleted file mode 100644 index a682e6344..000000000 --- a/src/psm/workflows/bmad-psm-mlops-deployment/workflow.md +++ /dev/null @@ -1,89 +0,0 @@ ---- -workflow_id: MLOPS001 -workflow_name: MLOps Deployment -description: Deploy ML model to production with validation, serving, and monitoring -entry_point: steps/step-01-model-validation.md -phase: 5-specialized -lead_agent: "Linh (MLOps)" -status: "active" -created_date: 2026-03-17 -version: "1.0.0" -estimated_duration: "3-4 hours" -outputFile: '{output_folder}/psm-artifacts/mlops-deploy-{{project_name}}-{{date}}.md' ---- - -# Workflow: MLOps Deployment - -## Goal -Deploy machine learning models to production with comprehensive validation, infrastructure setup, and post-deployment monitoring. - -## Overview - -MLOps deployment ensures ML models are production-ready and continuously monitored for performance and data drift. The workflow: - -1. **Validates** model quality, performance metrics, and data drift detection -2. **Deploys** model to serving infrastructure with versioning and A/B testing -3. 
**Monitors** model performance, data drift, and cost metrics post-deployment - -## Execution Path - -``` -START - ↓ -[Step 01] Model Validation (Check metrics, data drift, A/B test plan) - ↓ -[Step 02] Deploy Model (Setup serving, infrastructure, GPU optimization) - ↓ -[Step 03] Monitor (Langfuse/MLflow, drift detection, cost tracking) - ↓ -END -``` - -## Key Roles - -| Role | Agent | Responsibility | -|------|-------|-----------------| -| Lead | Linh (MLOps) | Coordinate deployment, monitor model health | -| Data Scientist | Data Lead | Validate model quality, approve for production | -| DevOps | Platform Eng | Setup infrastructure, manage resources | - -## Validation Gates (3) - -1. **Model Quality** β€” Accuracy, precision, recall metrics meet SLO -2. **Data Quality** β€” No data drift detected; training/production data distribution aligned -3. **Business Readiness** β€” A/B test plan ready, rollback strategy defined - -## Input Requirements - -- **Trained model artifact** β€” Model checkpoint, weights, configuration -- **Performance metrics** β€” Baseline accuracy, latency, throughput expectations -- **Data validation** β€” Training dataset description, expected data distribution -- **Serving infrastructure** β€” Compute requirements (GPU/CPU), latency targets - -## Output Deliverable - -- **MLOps Deployment Report** - - Model version and metadata - - Performance validation summary - - Serving infrastructure setup - - Monitoring dashboard and alerts - - Data drift detection configuration - -## Success Criteria - -1. Model passes all quality gates before deployment -2. Serving infrastructure deployed and load-tested -3. Monitoring and alerting configured and validated -4. Rollback strategy tested and documented -5. 
Team trained on model updates and incident response - -## Next Steps After Workflow - -- Monitor model performance daily for first week -- Track data drift metrics; alert if detected -- Plan model retraining based on performance degradation -- Document lessons learned in MLOps runbook - ---- - -**Navigation**: [← Back to 5-specialized](../), [Next: Step 01 β†’](steps/step-01-model-validation.md) diff --git a/src/psm/workflows/bmad-psm-production-readiness/SKILL.md b/src/psm/workflows/bmad-psm-production-readiness/SKILL.md deleted file mode 100644 index d3444d26a..000000000 --- a/src/psm/workflows/bmad-psm-production-readiness/SKILL.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -name: bmad-psm-production-readiness -description: 'Run production readiness review across 9 dimensions. Use when the user says "are we ready for production" or "PRR" or "go-live check"' ---- - -Follow the instructions in [workflow.md](workflow.md). diff --git a/src/psm/workflows/bmad-psm-production-readiness/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-production-readiness/bmad-skill-manifest.yaml deleted file mode 100644 index d0f08abdb..000000000 --- a/src/psm/workflows/bmad-psm-production-readiness/bmad-skill-manifest.yaml +++ /dev/null @@ -1 +0,0 @@ -type: skill diff --git a/src/psm/workflows/bmad-psm-production-readiness/production-readiness.template.md b/src/psm/workflows/bmad-psm-production-readiness/production-readiness.template.md deleted file mode 100644 index 5ba0f47d9..000000000 --- a/src/psm/workflows/bmad-psm-production-readiness/production-readiness.template.md +++ /dev/null @@ -1,367 +0,0 @@ ---- -template_name: production-readiness-checklist -template_version: "1.0.0" -created_date: 2026-03-17 -description: Production Readiness Review checklist and report template ---- - -# Production Readiness Review (PRR) - -**Service**: {{SERVICE_NAME}} -**Owner**: {{SERVICE_OWNER}} -**Reviewer**: {{SRE_LEAD}} (Minh) -**Review Date**: {{DATE}} -**Target Go-Live**: {{TARGET_DATE}} - ---- 
- -## Executive Summary - -{{1-2 paragraphs summarizing the readiness assessment, decision, and key findings}} - -**Overall Assessment**: {{READY | CONDITIONAL | NOT_READY}} - -**Timeline**: Service {{can | can conditionally | cannot}} proceed to production {{on {{DATE}}}} - ---- - -## Production Readiness Scorecard - -### 9-Dimension Assessment - -| # | Dimension | Score | Status | Key Finding | -|---|-----------|-------|--------|-------------| -| 1 | Reliability | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 2 | Observability | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 3 | Performance | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 4 | Security | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 5 | Capacity | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 6 | Data | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 7 | Runbooks | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 8 | Dependencies | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | -| 9 | Rollback | {{GREEN|YELLOW|RED}} | βœ…/⚠️/❌ | {{Brief finding}} | - -**Summary**: {{X}} GREEN, {{Y}} YELLOW, {{Z}} RED - ---- - -## Detailed Findings by Dimension - -### 1. Reliability - -**Goal**: Service meets SLO targets with documented failure modes and incident response plan. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 2. Observability - -**Goal**: Service has comprehensive logging, metrics, tracing, and dashboards for operational visibility. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 3. 
Performance - -**Goal**: Service meets latency/throughput targets and scales under expected load. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 4. Security - -**Goal**: Authentication, authorization, encryption, and secrets management are implemented. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 5. Capacity - -**Goal**: Resource requirements defined with growth headroom and cost acceptable. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 6. Data - -**Goal**: Data governance, backup, retention, and disaster recovery documented and tested. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 7. Runbooks - -**Goal**: Incident response, deployment, troubleshooting procedures documented and drilled. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 8. Dependencies - -**Goal**: External/internal dependencies mapped, versioned, with fallback strategies. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -### 9. 
Rollback - -**Goal**: Safe rollback strategy tested; deployment is reversible. - -**Findings**: - -- [ ] {{Finding 1}} ({{Status}}) -- [ ] {{Finding 2}} ({{Status}}) -- [ ] {{Finding 3}} ({{Status}}) - -**Assessment**: {{Detailed narrative, 3-5 sentences}} - -**Score**: {{GREEN|YELLOW|RED}} - ---- - -## Critical Blockers (P0) - -{{If any P0 blockers exist:}} - -Service **CANNOT** proceed to production until these are resolved: - -### P0 Blocker #1: {{ISSUE_TITLE}} - -- **Dimension**: {{Which dimension}} -- **Description**: {{What's the problem}} -- **Impact**: {{Why it's critical}} -- **Resolution**: {{How to fix}} -- **Owner**: {{Who must fix it}} -- **Deadline**: {{When it must be done}} -- **Acceptance**: {{How we verify it's fixed}} - -### P0 Blocker #2: {{ISSUE_TITLE}} - -{{Repeat format}} - ---- - -## Risks to Manage (P1) - -Service can proceed with documented monitoring and contingency plans: - -### P1 Risk #1: {{ISSUE_TITLE}} - -- **Dimension**: {{Which dimension}} -- **Description**: {{What's the problem}} -- **Impact**: {{If it happens, what's the consequence}} -- **Likelihood**: {{HIGH|MEDIUM|LOW}} -- **Mitigation**: {{How we'll manage it}} -- **Monitoring**: {{What metrics to watch}} -- **Contingency**: {{What we'll do if it occurs}} -- **Owner**: {{Who owns this risk}} -- **Target Fix**: {{Timeline to resolve permanently}} - -### P1 Risk #2: {{ISSUE_TITLE}} - -{{Repeat format}} - ---- - -## Recommendations - -**High Priority** (Next sprint): -- {{Recommendation 1}} -- {{Recommendation 2}} - -**Medium Priority** (Within 1 month): -- {{Recommendation 1}} -- {{Recommendation 2}} - -**Nice to Have** (Backlog): -- {{Recommendation 1}} -- {{Recommendation 2}} - ---- - -## Final Decision - -### Decision - -**{{ βœ… GO | ⚠️ CONDITIONAL-GO | ❌ NO-GO }}** - -### Rationale - -{{Explain the decision. Why can/can't we proceed?}} - -### Conditions (If CONDITIONAL-GO) - -If proceeding despite P1 risks, document conditions: - -1. 
**{{Condition 1}}**: {{Description}} - - Owner: {{Who oversees this}} - - Success Criteria: {{How we verify it}} - - Escalation: {{Who to contact if issues}} - -2. **{{Condition 2}}**: {{Description}} - - Owner: {{Who oversees this}} - - Success Criteria: {{How we verify it}} - - Escalation: {{Who to contact if issues}} - -### Deployment Timeline - -{{If GO or CONDITIONAL-GO:}} - -- **Approved for deployment**: {{DATE}} -- **Earliest go-live**: {{DATE}} -- **Recommended window**: {{DATE/TIME}} -- **On-call coverage required**: {{YES|NO}} -- **Emergency rollback plan**: {{REFERENCE TO RUNBOOK}} - ---- - -## Sign-offs & Approvals - -### Approval Chain - -- [ ] **SRE Lead** ({{NAME}}) β€” Review completed and findings approved - - Signature: ________________________ Date: __________ - -- [ ] **Architecture Lead** ({{NAME}}) β€” Architecture validated - - Signature: ________________________ Date: __________ - -- [ ] **Service Owner** ({{NAME}}) β€” Acknowledged findings and committed to actions - - Signature: ________________________ Date: __________ - -- [ ] **VP Engineering** ({{NAME}}) β€” Risk accepted (if CONDITIONAL-GO) - - Signature: ________________________ Date: __________ - ---- - -## Post-Production Plan - -### First 24 Hours - -- [ ] SRE on-call monitoring closely -- [ ] Daily standup with service team -- [ ] Monitor for any unusual patterns -- [ ] Be ready to rollback if needed - -### First Week - -- [ ] Daily metrics review -- [ ] Watch for data drift or unusual behavior -- [ ] Follow up on any P1 risks - -### Ongoing - -- [ ] Monthly PRR follow-ups to verify improvements -- [ ] Track action items to completion -- [ ] Update this PRR if significant changes made - ---- - -## Action Items - -| ID | Action | Owner | Deadline | Type | Status | -|----|--------|-------|----------|------|--------| -| A1 | {{Action}} | {{Name}} | {{Date}} | {{BLOCKER|RISK|RECOMMENDATION}} | ☐ | -| A2 | {{Action}} | {{Name}} | {{Date}} | {{BLOCKER|RISK|RECOMMENDATION}} | ☐ | -| A3 
| {{Action}} | {{Name}} | {{Date}} | {{BLOCKER|RISK|RECOMMENDATION}} | ☐ | - ---- - -## Appendix - -### A. Load Test Results - -[Link to or summary of load test results showing service meets performance targets] - -### B. Security Review Results - -[Link to or summary of security audit findings] - -### C. Architecture Diagrams - -[Include or link to system architecture, data flow, and deployment topology] - -### D. SLO Definition - -[Document the agreed-upon SLO targets for availability, latency, error rate] - -### E. Runbooks - -[Link to or list of key runbooks: incident response, deployment, rollback, troubleshooting] - ---- - -**Report prepared by**: {{SRE_LEAD}} -**Report date**: {{DATE}} -**Last updated**: {{DATE}} diff --git a/src/psm/workflows/bmad-psm-production-readiness/workflow.md b/src/psm/workflows/bmad-psm-production-readiness/workflow.md deleted file mode 100644 index b64ed8e6e..000000000 --- a/src/psm/workflows/bmad-psm-production-readiness/workflow.md +++ /dev/null @@ -1,92 +0,0 @@ ---- -workflow_id: PRR001 -workflow_name: Production Readiness Review -description: Validate service is ready for production using comprehensive readiness checklist -entry_point: steps/step-01-init-checklist.md -phase: 3-run -lead_agent: "Minh (SRE)" -status: "active" -created_date: 2026-03-17 -version: "1.0.0" -estimated_duration: "2-3 hours" -outputFile: '{output_folder}/psm-artifacts/prr-{{project_name}}-{{date}}.md' ---- - -# Workflow: Production Readiness Review (PRR) - -## Goal -Validate and certify that a service meets production readiness standards across 9 key dimensions before deployment. - -## Overview - -This workflow systematically evaluates a service against production readiness criteria defined in the Production Systems BMAD skill framework. Using SRE expertise and architectural patterns, the workflow: - -1. **Initializes** the PRR process with service context and dimensional overview -2. 
**Deep reviews** each dimension (reliability, observability, performance, security, capacity, data, runbooks, dependencies, rollback) -3. **Renders final decision** with GO/NO-GO/CONDITIONAL-GO recommendation - -## Execution Path - -``` -START - ↓ -[Step 01] Init Checklist (Load framework, gather service context, present dimensions) - ↓ -[Step 02] Deep Review (Score each dimension, identify blockers, recommendations) - ↓ -[Step 03] Final Decision (Scorecard, decision, action items, DONE) - ↓ -END -``` - -## Key Roles - -| Role | Agent | Responsibility | -|------|-------|-----------------| -| Lead | Minh (SRE) | Navigate workflow, coordinate review, make final call | -| Subject Matter | Service Owner | Provide service context, clarify architecture | -| Review Committee | Arch, SecOps, MLOps | Contribute expertise on specific dimensions | - -## Dimensions Evaluated (9) - -1. **Reliability** β€” SLA/SLO definition, error budgets, failure modes, incident response -2. **Observability** β€” Logging, metrics, tracing, dashboards, alerting -3. **Performance** β€” Latency targets, throughput, P99 tail behavior, optimization opportunities -4. **Security** β€” Auth/authz, secrets management, encryption, audit logging, compliance -5. **Capacity** β€” Resource limits, scaling policies, burst capacity, cost projections -6. **Data** β€” Schema versioning, backup/restore, data governance, retention policies -7. **Runbooks** β€” Incident runbooks, operational playbooks, troubleshooting guides -8. **Dependencies** β€” External services, internal libraries, database versioning, API contracts -9. **Rollback** β€” Rollback strategy, canary deployment, feature flags, smoke tests - -## Input Requirements - -- **Service name and owner** β€” Which service are we evaluating? 
-- **Current architecture** — High-level design, tech stack, topology -- **Existing metrics/dashboards** — Links to monitoring, SLO definitions -- **Known gaps/risks** — Already identified issues to address - -## Output Deliverable - -- **Production Readiness Checklist** (template: `production-readiness.template.md`) - - Scorecard with 9 dimensions (red/yellow/green) - - Blockers and recommendations per dimension - - Final GO/NO-GO/CONDITIONAL-GO decision - - Explicit action items with owners and deadlines - -## Success Criteria - -1. All 9 dimensions evaluated with clear rationale -2. Issues categorized as P0 blockers (must fix) or P1 risks (should fix) -3. Team alignment on decision (documented in PRR report) -4. Action plan with clear accountability and timeline - -## Next Steps After Workflow - -- If **GO**: Proceed to deployment; document in CHANGELOG -- If **NO-GO**: Reschedule PRR once blockers are addressed; track in backlog -- If **CONDITIONAL-GO**: Deploy with documented caveats; set up monitoring for risk areas - ---- - -**Navigation**: [← Back to 3-run](../), [Next: Step 01 →](steps/step-01-init-checklist.md) diff --git a/src/psm/workflows/bmad-psm-quick-diagnose/SKILL.md b/src/psm/workflows/bmad-psm-quick-diagnose/SKILL.md deleted file mode 100644 index be9f69f77..000000000 --- a/src/psm/workflows/bmad-psm-quick-diagnose/SKILL.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -name: bmad-psm-quick-diagnose -description: 'Quick diagnosis of production issue with minimal latency. Use when the user says "something is broken" or "quick diagnose" or "what is happening?"' ---- - -Follow the instructions in [workflow.md](workflow.md).
diff --git a/src/psm/workflows/bmad-psm-quick-diagnose/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-quick-diagnose/bmad-skill-manifest.yaml deleted file mode 100644 index d0f08abdb..000000000 --- a/src/psm/workflows/bmad-psm-quick-diagnose/bmad-skill-manifest.yaml +++ /dev/null @@ -1 +0,0 @@ -type: skill diff --git a/src/psm/workflows/bmad-psm-quick-diagnose/workflow.md b/src/psm/workflows/bmad-psm-quick-diagnose/workflow.md deleted file mode 100644 index dc88a7fff..000000000 --- a/src/psm/workflows/bmad-psm-quick-diagnose/workflow.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -workflow_id: QD001 -workflow_name: Quick Diagnose -description: Fast diagnosis of production issue with root cause and fix suggestion -entry_point: steps/step-01-gather.md -phase: quick-flow -lead_agent: "Minh (SRE)" -status: "active" -created_date: 2026-03-17 -version: "1.0.0" -estimated_duration: "15-25 minutes" -outputFile: '{output_folder}/psm-artifacts/quick-diagnose-{{date}}.md' ---- - -# Workflow: Quick Diagnose Production Issue - -## Goal -Rapidly diagnose production issues by gathering symptom data, checking metrics, and suggesting fixes. - -## Overview - -Quick Diagnose is a lightweight workflow for time-sensitive production troubleshooting: - -1. **Gathers** symptom description and quick metrics check -2. **Diagnoses** root cause using observability data -3. **Suggests** fix or mitigation immediately - -## Execution Path - -``` -START - ↓ -[Step 01] Gather Context (What's broken? Check metrics) - ↓ -[Step 02] Diagnose & Fix (Root cause analysis β†’ fix suggestion β†’ verify) - ↓ -END -``` - -## Key Roles - -| Role | Agent | -|------|-------| -| Lead | Minh (SRE) | - -## Input Requirements - -- **Symptom description** β€” What is failing? (error message, behavior, timeline) -- **Affected service/component** β€” What system is broken? -- **Timeline** β€” When did it start? Is it ongoing? -- **Impact** β€” How many users affected? Is revenue impacted? 
- -## Output Deliverable - -- **Quick Diagnosis Report** (markdown, 1-2 pages) - - Symptom analysis - - Root cause hypothesis - - Immediate mitigation (if needed) - - Fix suggestion with effort - - Follow-up actions - -## Success Criteria - -1. Root cause identified within 15-20 minutes -2. Immediate mitigation available (if needed) -3. Fix suggestion documented with clear steps -4. Team knows what to do next - -## Quick Diagnose vs Full Production Readiness Review - -| Aspect | Quick Diagnose | Full PRR | -|--------|---|---| -| Trigger | Active incident | Pre-deployment | -| Duration | 15-25 min | 2-3 hours | -| Scope | Single issue | All 9 dimensions | -| Goal | Fix now | Prevent issues | - ---- - -**Navigation**: [← Back to quick-flow](../), [Next: Step 01 β†’](steps/step-01-gather.md) diff --git a/src/psm/workflows/bmad-psm-security-audit/SKILL.md b/src/psm/workflows/bmad-psm-security-audit/SKILL.md deleted file mode 100644 index 7da323eb1..000000000 --- a/src/psm/workflows/bmad-psm-security-audit/SKILL.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -name: bmad-psm-security-audit -description: 'Run comprehensive security audit and threat assessment. Use when the user says "security audit" or "vulnerability assessment" or "security review"' ---- - -Follow the instructions in [workflow.md](workflow.md). 
diff --git a/src/psm/workflows/bmad-psm-security-audit/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-security-audit/bmad-skill-manifest.yaml deleted file mode 100644 index d0f08abdb..000000000 --- a/src/psm/workflows/bmad-psm-security-audit/bmad-skill-manifest.yaml +++ /dev/null @@ -1 +0,0 @@ -type: skill diff --git a/src/psm/workflows/bmad-psm-security-audit/security-audit-report.template.md b/src/psm/workflows/bmad-psm-security-audit/security-audit-report.template.md deleted file mode 100644 index a4127603e..000000000 --- a/src/psm/workflows/bmad-psm-security-audit/security-audit-report.template.md +++ /dev/null @@ -1,502 +0,0 @@ ---- -template_name: security-audit-report -template_version: "1.0.0" -created_date: 2026-03-17 -description: Security audit report with findings, severity levels, and remediation plan ---- - -# Security Audit Report - -**Service**: {{SERVICE_NAME}} -**Service Owner**: {{SERVICE_OWNER}} -**Auditor**: {{SECURITY_LEAD}} (HΓ ) -**Audit Date**: {{START_DATE}} β€” {{END_DATE}} -**Report Date**: {{REPORT_DATE}} -**Scope**: {{SCOPE_DESCRIPTION}} - ---- - -## Executive Summary - -This security audit evaluated {{SERVICE_NAME}} against security best practices and compliance requirements. The assessment identified {{X}} findings across {{Y}} security domains. 
- -**Overall Security Posture**: {{COMPLIANT | FINDINGS | CRITICAL}} - -{{1-2 paragraph summary of key findings, critical issues if any, and recommendations}} - ---- - -## Audit Scope - -### Services Reviewed - -- {{Service 1}} ({{Description}}) -- {{Service 2}} ({{Description}}) -- {{Service 3}} ({{Description}}) - -### Assessment Domains - -- ✅ Authentication & Authorization -- ✅ API Security -- ✅ Secrets Management -- ✅ Encryption (in-transit & at-rest) -- ✅ PII & Data Protection - -### Exclusions - -{{Any out-of-scope areas:}} -- {{Item}} (reason) -- {{Item}} (reason) - ---- - -## Findings Summary - -### By Severity - -| Severity | Count | Trend | -|----------|-------|-------| -| **Critical** | {{X}} | {{↑/→/↓}} | -| **High** | {{Y}} | {{↑/→/↓}} | -| **Medium** | {{Z}} | {{↑/→/↓}} | -| **Low** | {{W}} | {{↑/→/↓}} | -| **Total** | {{X+Y+Z+W}} | | - -### By Domain - -| Domain | Critical | High | Medium | Low | Status | -|--------|----------|------|--------|-----|--------| -| Auth & Authz | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | -| API Security | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | -| Secrets Mgmt | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | -| Encryption | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | -| PII & Data | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | - ---- - -## Critical Severity Findings - -### [F1] {{Finding Title}} - -**Severity**: CRITICAL (CVSS {{9.0-10.0}}) -**Domain**: {{Which domain}} -**Status**: {{Open | In Progress | Resolved}} - -**Description**: -{{Detailed description of the vulnerability, how it could be exploited, and the impact}} - -**Evidence**: -- {{Evidence 1}} -- {{Evidence 2}} -- {{Testing confirmation}} - -**Impact**: -- {{Business impact}} -- {{Technical impact}} -- {{Compliance impact}} - -**Remediation**: -1. {{Step 1}} ({{Estimated time}}) -2. {{Step 2}} ({{Estimated time}}) -3. {{Step 3}} ({{Estimated time}}) - -**Owner**: {{Name}} -**Target Fix Date**: {{DATE}} -**Effort**: {{Est.
hours/days}} -**Verification**: {{How we'll confirm it's fixed}} - ---- - -### [F2] {{Finding Title}} - -{{Repeat Critical severity format}} - ---- - -## High Severity Findings - -### [F3] {{Finding Title}} - -**Severity**: HIGH (CVSS {{7.0-8.9}}) -**Domain**: {{Which domain}} -**Status**: {{Open | In Progress | Resolved}} - -**Description**: {{Brief description}} - -**Impact**: {{Why it matters}} - -**Remediation**: -1. {{Step 1}} -2. {{Step 2}} - -**Owner**: {{Name}} -**Target Date**: {{DATE}} - ---- - -### [F4] {{Finding Title}} - -{{Repeat High severity format}} - ---- - -## Medium Severity Findings - -### [F5] {{Finding Title}} - -**Severity**: MEDIUM (CVSS {{4.0-6.9}}) -**Domain**: {{Which domain}} -**Description**: {{Brief description}} -**Remediation**: {{Brief fix}} -**Owner**: {{Name}} | **Target Date**: {{DATE}} - ---- - -### [F6] {{Finding Title}} - -{{Repeat Medium severity format}} - ---- - -## Low Severity Findings - -### [F7] {{Finding Title}} - -**Severity**: LOW (CVSS {{0.1-3.9}}) -**Description**: {{Brief description}} -**Remediation**: {{Brief fix}} - ---- - -### [F8] {{Finding Title}} - -{{Repeat Low severity format}} - ---- - -## Domain-Specific Assessment - -### Domain 1: Authentication & Authorization - -**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} - -**Strengths**: -- {{Positive finding 1}} -- {{Positive finding 2}} - -**Gaps**: -- {{Gap 1}} — {{Impact}} -- {{Gap 2}} — {{Impact}} - -**Recommendations**: -1. {{Recommendation 1}} -2. {{Recommendation 2}} - ---- - -### Domain 2: API Security - -**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} - -**Strengths**: -- {{Positive finding 1}} -- {{Positive finding 2}} - -**Gaps**: -- {{Gap 1}} — {{Impact}} -- {{Gap 2}} — {{Impact}} - -**Recommendations**: -1. {{Recommendation 1}} -2.
{{Recommendation 2}} - ---- - -### Domain 3: Secrets Management - -**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} - -**Strengths**: -- {{Positive finding 1}} -- {{Positive finding 2}} - -**Gaps**: -- {{Gap 1}} β€” {{Impact}} -- {{Gap 2}} β€” {{Impact}} - -**Recommendations**: -1. {{Recommendation 1}} -2. {{Recommendation 2}} - ---- - -### Domain 4: Encryption - -**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} - -**Strengths**: -- {{Positive finding 1}} -- {{Positive finding 2}} - -**Gaps**: -- {{Gap 1}} β€” {{Impact}} -- {{Gap 2}} β€” {{Impact}} - -**Recommendations**: -1. {{Recommendation 1}} -2. {{Recommendation 2}} - ---- - -### Domain 5: PII & Data Protection - -**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} - -**Strengths**: -- {{Positive finding 1}} -- {{Positive finding 2}} - -**Gaps**: -- {{Gap 1}} β€” {{Impact}} -- {{Gap 2}} β€” {{Impact}} - -**Recommendations**: -1. {{Recommendation 1}} -2. {{Recommendation 2}} - ---- - -## Compliance Assessment - -### GDPR (General Data Protection Regulation) - -**Applicable**: {{YES | NO | PARTIAL}} -**Status**: {{COMPLIANT | NON-COMPLIANT | CONDITIONAL}} - -| Requirement | Status | Finding | Gap Fix | -|-------------|--------|---------|---------| -| Data Encryption | {{βœ…/❌}} | {{Description}} | {{Remediation}} | -| Access Control | {{βœ…/❌}} | {{Description}} | {{Remediation}} | -| Retention Policy | {{βœ…/❌}} | {{Description}} | {{Remediation}} | -| Right to Deletion | {{βœ…/❌}} | {{Description}} | {{Remediation}} | -| Data Processing Agreement | {{βœ…/❌}} | {{Description}} | {{Remediation}} | - -**Timeline to Compliance**: {{DATE or "Already compliant"}} - ---- - -### PCI-DSS (Payment Card Industry Data Security Standard) - -**Applicable**: {{YES | NO | PARTIAL}} -**Status**: {{COMPLIANT | NON-COMPLIANT | CONDITIONAL}} - -| Requirement | Status | Finding | Gap Fix | -|-------------|--------|---------|---------| -| TLS 1.2+ | {{βœ…/❌}} | {{Description}} | {{Remediation}} | -| Secrets Management | {{βœ…/❌}} | 
{{Description}} | {{Remediation}} | -| Input Validation | {{βœ…/❌}} | {{Description}} | {{Remediation}} | - -**Timeline to Compliance**: {{DATE or "Already compliant"}} - ---- - -### SOC 2 Type II - -**Applicable**: {{YES | NO | PARTIAL}} -**Status**: {{COMPLIANT | NON-COMPLIANT | CONDITIONAL}} - -**Gap Summary**: {{Description of gaps or "No gaps identified"}} - -**Timeline**: {{When audit can be conducted}} - ---- - -### Other Regulations - -{{Any other applicable standards (HIPAA, FINRA, etc.)}} - ---- - -## Remediation Roadmap - -### Critical Path (Week 1-2) - -**All Critical findings must be fixed before production deployment.** - -- [ ] {{F1}} β€” Owner: {{Name}}, Deadline: {{DATE}} -- [ ] {{F2}} β€” Owner: {{Name}}, Deadline: {{DATE}} - -**Milestone**: Security re-scan on {{DATE}} to verify fixes - ---- - -### Phase 2 (Week 3-4) - -Complete High-severity findings: - -- [ ] {{F3}} β€” Owner: {{Name}}, Deadline: {{DATE}} -- [ ] {{F4}} β€” Owner: {{Name}}, Deadline: {{DATE}} - -**Milestone**: Second security review on {{DATE}} - ---- - -### Phase 3 (Weeks 5-8) - -Address Medium-severity findings (can be post-production with monitoring): - -- [ ] {{F5}} β€” Owner: {{Name}}, Target: {{DATE}} -- [ ] {{F6}} β€” Owner: {{Name}}, Target: {{DATE}} - ---- - -### Backlog (Next Sprint) - -Low-severity items: - -- [ ] {{F7}} β€” {{Brief description}} -- [ ] {{F8}} β€” {{Brief description}} - ---- - -## Remediation Status Tracking - -| Finding | Owner | Deadline | Status | Last Update | Notes | -|---------|-------|----------|--------|-------------|-------| -| F1 | {{Name}} | {{Date}} | πŸ”΄ Pending | {{Date}} | {{Notes}} | -| F2 | {{Name}} | {{Date}} | 🟑 In Progress | {{Date}} | {{Notes}} | -| F3 | {{Name}} | {{Date}} | 🟒 Complete | {{Date}} | {{Notes}} | - ---- - -## Post-Audit Monitoring - -### Controls to Monitor - -{{If service proceeds to production despite findings:}} - -- **{{Control 1}}** β€” Monitor via {{method}}, alert if {{threshold}} -- **{{Control 2}}** β€” 
Monitor via {{method}}, alert if {{threshold}} -- **{{Control 3}}** β€” Monitor via {{method}}, alert if {{threshold}} - -### Incident Response - -If a security incident occurs: -1. Activate incident response team -2. Notify {{Escalation contacts}} -3. Follow {{Incident response runbook}} -4. Conduct post-incident security review - ---- - -## Risk Assessment Matrix - -``` - LIKELIHOOD - Low Med High - CRITICAL H C C -IMPACT - HIGH M H C - MEDIUM L M H - LOW L L M - -Legend: C=Critical, H=High, M=Medium, L=Low -``` - -**Our findings map**: -- {{F1}} β€” {{Position on matrix}} -- {{F2}} β€” {{Position on matrix}} - ---- - -## Positive Findings - -**Strengths to maintain:** - -- {{Positive 1}} β€” Keep doing this -- {{Positive 2}} β€” Keep doing this -- {{Positive 3}} β€” Keep doing this - ---- - -## Recommendations Summary - -### Immediate (Critical) -- {{Fix all Critical findings}} ({{effort}}) - -### Short-term (High Priority) -- {{Fix all High findings}} ({{effort}}) -- {{Implement automated scanning}} ({{effort}}) -- {{Setup security monitoring}} ({{effort}}) - -### Medium-term -- {{Implement {{technology}} for {{purpose}}}} ({{effort}}) -- {{Security training for team}} ({{effort}}) - -### Long-term (Next 6 Months) -- {{Major security initiative}} ({{effort}}) -- {{Penetration testing}} ({{effort}}) - ---- - -## Sign-offs & Approvals - -### Audit Approval - -- [ ] **Security Lead** ({{AUDITOR_NAME}}) - - Signature: ________________________ Date: __________ - - Assessment complete and findings documented - -### Service Owner Acknowledgment - -- [ ] **Service Owner** ({{SERVICE_OWNER}}) - - Signature: ________________________ Date: __________ - - Acknowledged findings and committed to remediation - -### Compliance Officer Review - -- [ ] **Compliance Officer** ({{NAME}}) - - Signature: ________________________ Date: __________ - - Compliance requirements verified - -### Executive Approval (If Production Clearance Needed) - -- [ ] **VP Engineering / Security** 
({{NAME}}) - - Signature: ________________________ Date: __________ - - Risk accepted; approved for production - ---- - -## Distribution - -- [x] Shared with: {{Service team, Leadership, Compliance}} -- [x] Date shared: {{DATE}} -- [x] Follow-up review scheduled: {{DATE}} - ---- - -## Appendix: Testing Evidence - -### Code Review Findings - -``` -{{Code snippets demonstrating vulnerabilities}} -``` - -### Configuration Issues - -``` -{{Configuration examples showing gaps}} -``` - -### Dependencies Scan - -``` -{{Vulnerable dependencies identified}} -``` - ---- - -**Report Prepared By**: {{AUDITOR_NAME}} -**Report Date**: {{DATE}} -**Review Status**: Draft | Final | Approved diff --git a/src/psm/workflows/bmad-psm-security-audit/workflow.md b/src/psm/workflows/bmad-psm-security-audit/workflow.md deleted file mode 100644 index 8ad8a8c3b..000000000 --- a/src/psm/workflows/bmad-psm-security-audit/workflow.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -workflow_id: SA001 -workflow_name: Security Audit -description: Comprehensive security review using security patterns, config management, and compliance framework -entry_point: steps/step-01-scope.md -phase: 4-cross -lead_agent: "HΓ  (Security)" -status: "active" -created_date: 2026-03-17 -version: "1.0.0" -estimated_duration: "2-3 hours" -outputFile: '{output_folder}/psm-artifacts/security-audit-{{project_name}}-{{date}}.md' ---- - -# Workflow: Security Audit - -## Goal -Perform comprehensive security evaluation using Production Systems BMAD framework, covering threat modeling, vulnerability assessment, compliance, and security controls. - -## Overview - -Security audit is a critical cross-functional workflow that evaluates service security posture before production deployment or for ongoing compliance verification. The audit: - -1. **Scopes** the audit engagement, defines threat model, and identifies compliance requirements -2. 
**Executes** detailed security assessment across multiple domains (authentication, data protection, infrastructure, API security) -3. **Reports** findings with severity levels, remediation recommendations, and compliance status - -## Execution Path - -``` -START - ↓ -[Step 01] Scope & Threat Model (Define audit scope, identify threats, compliance reqs) - ↓ -[Step 02] Security Assessment (Execute checklist across domains, identify vulns) - ↓ -[Step 03] Security Report (Findings report, severity, recommendations, compliance) - ↓ -END -``` - -## Key Roles - -| Role | Agent | Responsibility | -|------|-------|-----------------| -| Lead | HΓ  (Security) | Lead audit, coordinate assessment, synthesize findings | -| Subject Matter | Service Owner + Platform Eng | Provide architecture, answer security questions | -| Compliance | Security/Compliance Team | Validate compliance mapping, sign-off | - -## Assessment Domains (5) - -1. **Authentication & Authorization** β€” Identity verification, access control, session management -2. **API Security** β€” Input validation, rate limiting, API key management, CORS -3. **Secrets Management** β€” Credential storage, rotation, access logging -4. **Encryption** β€” In-transit (TLS), at-rest, key management -5. 
**PII & Data Protection** β€” Classification, access controls, audit logging, retention - -## Input Requirements - -- **Service architecture diagram** β€” Components, data flows, external integrations -- **Authentication/authorization approach** β€” OAuth2, JWT, SAML, custom -- **Secrets storage mechanism** β€” Vault, cloud provider, environment variables -- **Compliance requirements** β€” GDPR, CCPA, SOC2, industry-specific -- **Known security controls** β€” WAF, TLS config, authentication libraries - -## Output Deliverable - -- **Security Audit Report** (template: `security-audit-report.template.md`) - - Audit scope and threat model - - Findings organized by domain with severity (Critical/High/Medium/Low) - - Remediation recommendations with priority and effort - - Compliance status matrix - - Sign-off - -## Success Criteria - -1. All security domains assessed with clear findings -2. Severity levels assigned (using CVSS or similar framework) -3. Remediation plan with owners and deadlines -4. Compliance requirements verified (if applicable) -5. Team alignment on security posture - -## Next Steps After Workflow - -- If **COMPLIANT**: Document in security registry; schedule periodic re-audit -- If **NON-COMPLIANT**: Add remediation items to backlog; track closure -- If **CRITICAL ISSUES**: Consider production pause until resolved - ---- - -**Navigation**: [← Back to 4-cross](../), [Next: Step 01 β†’](steps/step-01-scope.md) diff --git a/src/psm/workflows/bmad-psm-setup-new-service/SKILL.md b/src/psm/workflows/bmad-psm-setup-new-service/SKILL.md deleted file mode 100644 index 6b43cff8a..000000000 --- a/src/psm/workflows/bmad-psm-setup-new-service/SKILL.md +++ /dev/null @@ -1,6 +0,0 @@ ---- -name: bmad-psm-setup-new-service -description: 'Set up new production service from architecture through deployment. Use when the user says "new service" or "setup service" or "new microservice"' ---- - -Follow the instructions in [workflow.md](workflow.md). 
diff --git a/src/psm/workflows/bmad-psm-setup-new-service/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-setup-new-service/bmad-skill-manifest.yaml deleted file mode 100644 index d0f08abdb..000000000 --- a/src/psm/workflows/bmad-psm-setup-new-service/bmad-skill-manifest.yaml +++ /dev/null @@ -1 +0,0 @@ -type: skill diff --git a/src/psm/workflows/bmad-psm-setup-new-service/workflow.md b/src/psm/workflows/bmad-psm-setup-new-service/workflow.md deleted file mode 100644 index 35b2d6a6a..000000000 --- a/src/psm/workflows/bmad-psm-setup-new-service/workflow.md +++ /dev/null @@ -1,116 +0,0 @@ ---- -workflow_id: W-SETUP-SVC-001 -workflow_name: Setup Production Service for BMAD -version: 6.2.0 -lead_agent: "Architect Khang" -supporting_agents: ["SRE Minh", "Mary Analyst"] -phase: "1-Analysis → 2-Planning → 3-Solutioning → 4-Implementation" -created_date: 2026-03-17 -last_modified: 2026-03-17 -config_file: "_config/config.yaml" -estimated_duration: "12-20 hours" -outputFile: '{output_folder}/psm-artifacts/service-setup-{{project_name}}-{{date}}.md' ---- - -# Setup Production Service Workflow — BMAD Pattern - -## Metadata & Context - -**Goal**: Xây dựng production-grade service từ scratch, với đầy đủ architecture, API design, deployment pipeline, reliability patterns, security, và production readiness. - -**Lead Team**: -- SRE Minh (Reliability, Infrastructure, Operations) -- Architect Khang (System Design, Technology Selection) -- Mary Analyst (Requirements, Risk Assessment) - -**Success Criteria**: -- ✓ Architecture design document approved -- ✓ API contracts defined & validated -- ✓ Database schema designed & indexed -- ✓ CI/CD pipeline operational -- ✓ Resilience & observability in place -- ✓ Security & compliance verified -- ✓ Production readiness checklist passed - -## Workflow Overview - -Workflow này đi qua 6 bước atomic, mỗi bước focus vào một domain riêng: - -1.
**Step-01-Architecture** → Requirements + Architecture Pattern Selection -2. **Step-02-API-Database** → API Design + Database Selection + Schema -3. **Step-03-Build-Deploy** → CI/CD + Containerization + Testing Strategy -4. **Step-04-Reliability** → Resilience Patterns + Observability + Error Handling -5. **Step-05-Security-Infra** → Auth/Authz + Secrets + K8s Config -6. **Step-06-Readiness** → PRR Checklist + Runbook + Go/No-Go Decision - -## Configuration Loading - -Tự động load từ `_config/config.yaml`: - -```yaml -project_context: - user_name: "[loaded from config]" - organization: "[loaded from config]" - environment: "production" - -workflow_defaults: - communication_language: "Vietnamese" - output_folder: "./outputs/setup-new-service-{service_name}" - timestamp: "2026-03-17" -``` - -## Execution Model - -### Entry Point Logic - -``` -1. Check if workflow.md exists in outputs folder - → If NEW: Start from step-01-architecture.md - → If RESUME: Load progress.yaml → auto-skip completed steps - → If PARTIAL: Load step-N-context.yaml → resume from step N - -2. For each step: - a) Load step-{N}-{name}.md - b) Load referenced SKILL files (auto-parse "Load:" directives) - c) Execute MENU [A][C] options - d) Save step output to step-{N}-output.md - e) Move to next step - -3. Final: Generate comprehensive outputs in outputs folder -``` - -### State Tracking - -Output document frontmatter tracks progress: - -```yaml -workflow_progress: - step_01_architecture: "completed" - step_02_api_database: "completed" - step_03_build_deploy: "in_progress" - step_04_reliability: "pending" - step_05_security_infra: "pending" - step_06_readiness: "pending" - last_updated: "2026-03-17T14:30:00Z" - current_agent: "Architect Khang" -``` - -## Mandatory Workflow Rules - -1. **No skipping steps** — Mỗi step phải được execute theo order -2. **Validate assumptions** — Mỗi decision phải được document -3. 
**Cross-phase collaboration** — Architects + SRE + Analysts work together -4. **Output artifacts** — Mỗi step produce tangible output documents -5. **Handoff protocol** — Context được transfer giữa steps rõ ràng - -## Navigation - -Hãy chọn cách bắt đầu: - -- **[NEW]** — Bắt đầu workflow mới → Load step-01 -- **[RESUME]** — Quay lại workflow đã từng chạy (detect progress) -- **[SKIP-TO]** — Nhảy tới step cụ thể (dev-only, requires confirmation) - ---- - -**Tiếp tục bằng cách chọn [NEW] hoặc [RESUME]**