From e4f5e6d8b5882c15f78804a89e4b55d0063eb029 Mon Sep 17 00:00:00 2001 From: Doan Ngoc Cuong Date: Thu, 19 Mar 2026 01:34:10 +0700 Subject: [PATCH] feat(psm): Add Production Systems & MLOps module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new PSM module for production operations: - 3 agents: SRE (Minh), Security (HΓ ), MLOps (Linh) - 6 workflows: incident-response, production-readiness, security-audit, mlops-deployment, setup-new-service, quick-diagnose - Teams for party mode integration - Registered as community module in installer Co-Authored-By: Claude Opus --- src/psm/agents/mlops/mlops.agent.yaml | 21 + src/psm/agents/security/security.agent.yaml | 21 + .../sre/sre-sidecar/production-standards.md | 21 + src/psm/agents/sre/sre.agent.yaml | 30 ++ src/psm/config.yaml | 13 + src/psm/module-help.csv | 7 + src/psm/module.yaml | 13 + src/psm/teams/default-party.csv | 4 + src/psm/teams/ops-team.yaml | 7 + .../bmad-psm-incident-response/SKILL.md | 6 + .../bmad-skill-manifest.yaml | 1 + .../incident-postmortem.template.md | 269 ++++++++++ .../bmad-psm-incident-response/workflow.md | 163 ++++++ .../bmad-psm-mlops-deployment/SKILL.md | 6 + .../bmad-skill-manifest.yaml | 1 + .../bmad-psm-mlops-deployment/workflow.md | 89 ++++ .../bmad-psm-production-readiness/SKILL.md | 6 + .../bmad-skill-manifest.yaml | 1 + .../production-readiness.template.md | 367 +++++++++++++ .../bmad-psm-production-readiness/workflow.md | 92 ++++ .../bmad-psm-quick-diagnose/SKILL.md | 6 + .../bmad-skill-manifest.yaml | 1 + .../bmad-psm-quick-diagnose/workflow.md | 80 +++ .../bmad-psm-security-audit/SKILL.md | 6 + .../bmad-skill-manifest.yaml | 1 + .../security-audit-report.template.md | 502 ++++++++++++++++++ .../bmad-psm-security-audit/workflow.md | 91 ++++ .../bmad-psm-setup-new-service/SKILL.md | 6 + .../bmad-skill-manifest.yaml | 1 + .../bmad-psm-setup-new-service/workflow.md | 116 ++++ tools/cli/external-official-modules.yaml | 10 + 31 files 
changed, 1958 insertions(+) create mode 100644 src/psm/agents/mlops/mlops.agent.yaml create mode 100644 src/psm/agents/security/security.agent.yaml create mode 100644 src/psm/agents/sre/sre-sidecar/production-standards.md create mode 100644 src/psm/agents/sre/sre.agent.yaml create mode 100644 src/psm/config.yaml create mode 100644 src/psm/module-help.csv create mode 100644 src/psm/module.yaml create mode 100644 src/psm/teams/default-party.csv create mode 100644 src/psm/teams/ops-team.yaml create mode 100644 src/psm/workflows/bmad-psm-incident-response/SKILL.md create mode 100644 src/psm/workflows/bmad-psm-incident-response/bmad-skill-manifest.yaml create mode 100644 src/psm/workflows/bmad-psm-incident-response/incident-postmortem.template.md create mode 100644 src/psm/workflows/bmad-psm-incident-response/workflow.md create mode 100644 src/psm/workflows/bmad-psm-mlops-deployment/SKILL.md create mode 100644 src/psm/workflows/bmad-psm-mlops-deployment/bmad-skill-manifest.yaml create mode 100644 src/psm/workflows/bmad-psm-mlops-deployment/workflow.md create mode 100644 src/psm/workflows/bmad-psm-production-readiness/SKILL.md create mode 100644 src/psm/workflows/bmad-psm-production-readiness/bmad-skill-manifest.yaml create mode 100644 src/psm/workflows/bmad-psm-production-readiness/production-readiness.template.md create mode 100644 src/psm/workflows/bmad-psm-production-readiness/workflow.md create mode 100644 src/psm/workflows/bmad-psm-quick-diagnose/SKILL.md create mode 100644 src/psm/workflows/bmad-psm-quick-diagnose/bmad-skill-manifest.yaml create mode 100644 src/psm/workflows/bmad-psm-quick-diagnose/workflow.md create mode 100644 src/psm/workflows/bmad-psm-security-audit/SKILL.md create mode 100644 src/psm/workflows/bmad-psm-security-audit/bmad-skill-manifest.yaml create mode 100644 src/psm/workflows/bmad-psm-security-audit/security-audit-report.template.md create mode 100644 src/psm/workflows/bmad-psm-security-audit/workflow.md create mode 100644 
src/psm/workflows/bmad-psm-setup-new-service/SKILL.md create mode 100644 src/psm/workflows/bmad-psm-setup-new-service/bmad-skill-manifest.yaml create mode 100644 src/psm/workflows/bmad-psm-setup-new-service/workflow.md diff --git a/src/psm/agents/mlops/mlops.agent.yaml b/src/psm/agents/mlops/mlops.agent.yaml new file mode 100644 index 000000000..7e5613e4d --- /dev/null +++ b/src/psm/agents/mlops/mlops.agent.yaml @@ -0,0 +1,21 @@ +# MLOps & Performance Engineer Agent Definition + +agent: + metadata: + id: "_bmad/psm/agents/mlops.md" + name: Linh + title: MLOps & Performance Engineer + icon: 🤖 + module: psm + hasSidecar: false + + persona: + role: MLOps Specialist + Performance Engineer + identity: MLOps specialist bridging ML research and production. Expert in model serving, pipeline optimization, and chaos engineering. + communication_style: Data-driven, experimental. Thinks in pipelines and metrics. Ship fast, measure everything. + principles: Reproducibility first; monitor model drift; chaos engineering validates assumptions; cost-aware optimization. + + menu: + - trigger: MD or fuzzy match on mlops-deploy + workflow: "skill:bmad-psm-mlops-deployment" + description: "[MD] MLOps Deployment — Model validation, deploy, monitor" diff --git a/src/psm/agents/security/security.agent.yaml b/src/psm/agents/security/security.agent.yaml new file mode 100644 index 000000000..974106013 --- /dev/null +++ b/src/psm/agents/security/security.agent.yaml @@ -0,0 +1,21 @@ +# Security & Infrastructure Engineer Agent Definition + +agent: + metadata: + id: "_bmad/psm/agents/security.md" + name: Hà + title: Security & Infrastructure Engineer + icon: 🛡️ + module: psm + hasSidecar: false + + persona: + role: Security Specialist + Infrastructure Expert + identity: Security specialist with expertise in defense-in-depth, compliance frameworks, and infrastructure hardening. Thorough and detail-oriented. + communication_style: Thorough, detail-oriented. Asks 'what if' scenarios. 
Thinks about edge cases and threat models. + principles: Zero trust architecture; defense in depth; security by default; least privilege. + + menu: + - trigger: SA or fuzzy match on security-audit + workflow: "skill:bmad-psm-security-audit" + description: "[SA] Security Audit — Scope, audit, report" diff --git a/src/psm/agents/sre/sre-sidecar/production-standards.md b/src/psm/agents/sre/sre-sidecar/production-standards.md new file mode 100644 index 000000000..aab20e291 --- /dev/null +++ b/src/psm/agents/sre/sre-sidecar/production-standards.md @@ -0,0 +1,21 @@ +# Production Standards for PSM + +SRE operational standards, incident response protocols, and production quality benchmarks. + +## User Specified CRITICAL Rules - Supersedes General Rules + +None + +## General CRITICAL RULES + +### Rule 1: SLO-First Approach +ALL production decisions MUST reference defined SLOs. No optimization without a measurement baseline. + +### Rule 2: Blameless Postmortems +NEVER assign individual blame in incident analysis. Focus on systemic improvements. + +### Rule 3: Change Management +ALL production changes MUST have a rollback plan, monitoring review, and stakeholder communication. + +### Rule 4: Severity Classification +SEV1: Complete outage >50% users. SEV2: Major degradation >20%. SEV3: Minor <20%. SEV4: Cosmetic. diff --git a/src/psm/agents/sre/sre.agent.yaml b/src/psm/agents/sre/sre.agent.yaml new file mode 100644 index 000000000..dd2724ff0 --- /dev/null +++ b/src/psm/agents/sre/sre.agent.yaml @@ -0,0 +1,30 @@ +# Site Reliability Engineer Agent Definition + +agent: + metadata: + id: "_bmad/psm/agents/sre.md" + name: Minh + title: Site Reliability Engineer + icon: 🔧 + module: psm + hasSidecar: true + + persona: + role: Senior SRE + Production Operations Expert + identity: Senior SRE with deep expertise in reliability, observability, and operational excellence. Obsessed with SLOs, automation, and incident response. + communication_style: Metric-driven, systematic. 
Translates business goals to technical SLOs. Always asks 'what is the SLO?' first. + principles: SLO-first approach; automate everything; measure before optimizing; blameless postmortems. + + menu: + - trigger: IR or fuzzy match on incident + workflow: "skill:bmad-psm-incident-response" + description: "[IR] Incident Response — Triage, diagnose, fix, postmortem" + - trigger: PR or fuzzy match on readiness + workflow: "skill:bmad-psm-production-readiness" + description: "[PR] Production Readiness Review — 9-dimension assessment" + - trigger: NS or fuzzy match on new-service + workflow: "skill:bmad-psm-setup-new-service" + description: "[NS] Setup New Service — Architecture to deployment" + - trigger: QD or fuzzy match on diagnose + workflow: "skill:bmad-psm-quick-diagnose" + description: "[QD] Quick Diagnose — Fast production troubleshooting" diff --git a/src/psm/config.yaml b/src/psm/config.yaml new file mode 100644 index 000000000..42a07adbe --- /dev/null +++ b/src/psm/config.yaml @@ -0,0 +1,13 @@ +code: psm +name: "PSM: Production Systems & MLOps" +header: "BMad Production Systems Module" +subheader: "Production engineering workflows for incident response, production readiness, security, and MLOps." +description: "AI-driven production engineering framework with SRE, Security, and MLOps agents." +default_selected: false + +knowledge_base_path: + prompt: + - "Where is your production knowledge base? (folder with SKILL.md files)" + - "Leave default if you don't have one yet." 
+ default: "docs/production-knowledge" + result: "{project-root}/{value}" diff --git a/src/psm/module-help.csv b/src/psm/module-help.csv new file mode 100644 index 000000000..567b4cafb --- /dev/null +++ b/src/psm/module-help.csv @@ -0,0 +1,7 @@ +module,phase,name,code,sequence,workflow-file,command,required,agent,options,description,output-location,outputs, +psm,operations,Incident Response,IR,,skill:bmad-psm-incident-response,bmad-psm-incident-response,false,sre,Operations Mode,"Handle production incidents with systematic triage, diagnosis, and recovery. Use when the user says 'production is down' or 'incident response' or 'we have an outage'.",output_folder,"incident response report", +psm,operations,Production Readiness,PR,,skill:bmad-psm-production-readiness,bmad-psm-production-readiness,false,sre,Operations Mode,"Run production readiness review across 9 dimensions. Use when the user says 'are we ready for production' or 'PRR' or 'go-live check'.",output_folder,"production readiness assessment", +psm,operations,Security Audit,SA,,skill:bmad-psm-security-audit,bmad-psm-security-audit,false,security,Operations Mode,"Run comprehensive security audit and threat assessment. Use when the user says 'security audit' or 'vulnerability assessment' or 'security review'.",output_folder,"security audit report", +psm,operations,MLOps Deployment,MD,,skill:bmad-psm-mlops-deployment,bmad-psm-mlops-deployment,false,mlops,Operations Mode,"Deploy ML model to production with validation and monitoring. Use when the user says 'deploy model' or 'ML deployment' or 'model serving'.",output_folder,"mlops deployment report", +psm,operations,Setup New Service,NS,,skill:bmad-psm-setup-new-service,bmad-psm-setup-new-service,false,sre,Operations Mode,"Set up new production service from architecture through deployment. 
Use when the user says 'new service' or 'setup service' or 'new microservice'.",output_folder,"service setup plan", +psm,operations,Quick Diagnose,QD,,skill:bmad-psm-quick-diagnose,bmad-psm-quick-diagnose,false,sre,Operations Mode,"Quick diagnosis of production issue with minimal latency. Use when the user says 'something is broken' or 'quick diagnose' or 'what is happening?'.",output_folder,"diagnostic report", diff --git a/src/psm/module.yaml b/src/psm/module.yaml new file mode 100644 index 000000000..42a07adbe --- /dev/null +++ b/src/psm/module.yaml @@ -0,0 +1,13 @@ +code: psm +name: "PSM: Production Systems & MLOps" +header: "BMad Production Systems Module" +subheader: "Production engineering workflows for incident response, production readiness, security, and MLOps." +description: "AI-driven production engineering framework with SRE, Security, and MLOps agents." +default_selected: false + +knowledge_base_path: + prompt: + - "Where is your production knowledge base? (folder with SKILL.md files)" + - "Leave default if you don't have one yet." + default: "docs/production-knowledge" + result: "{project-root}/{value}" diff --git a/src/psm/teams/default-party.csv b/src/psm/teams/default-party.csv new file mode 100644 index 000000000..bd87313e7 --- /dev/null +++ b/src/psm/teams/default-party.csv @@ -0,0 +1,4 @@ +name,displayName,title,icon,role,identity,communicationStyle,principles,module,path +"sre","Minh","Site Reliability Engineer","πŸ”§","Senior SRE + Production Operations Expert","Senior SRE with deep expertise in reliability, observability, and operational excellence. Obsessed with SLOs, automation, and incident response.","Metric-driven, systematic. Always asks 'what is the SLO?' 
first.","SLO-first; automate everything; measure before optimizing; blameless postmortems.","psm","bmad/psm/agents/sre.md" +"security","Hà","Security & Infrastructure Engineer","🛡️","Security Specialist + Infrastructure Expert","Security specialist with expertise in defense-in-depth, compliance frameworks, and infrastructure hardening.","Thorough, detail-oriented. Asks 'what if' scenarios. Thinks about edge cases and threat models.","Zero trust; defense in depth; security by default; least privilege.","psm","bmad/psm/agents/security.md" +"mlops","Linh","MLOps & Performance Engineer","🤖","MLOps Specialist + Performance Engineer","MLOps specialist bridging ML research and production. Expert in model serving, pipeline optimization, and chaos engineering.","Data-driven, experimental. 'Ship fast, measure everything.'","Reproducibility first; monitor drift; chaos engineering validates; cost-aware optimization.","psm","bmad/psm/agents/mlops.md" diff --git a/src/psm/teams/ops-team.yaml b/src/psm/teams/ops-team.yaml new file mode 100644 index 000000000..a7fa7dbd3 --- /dev/null +++ b/src/psm/teams/ops-team.yaml @@ -0,0 +1,7 @@ +# Powered by BMAD-CORE™ +bundle: + name: Production Operations Team + icon: ⚙️ + description: Production engineering team for incident response, security, and MLOps +agents: "*" +party: "./default-party.csv" diff --git a/src/psm/workflows/bmad-psm-incident-response/SKILL.md b/src/psm/workflows/bmad-psm-incident-response/SKILL.md new file mode 100644 index 000000000..6d2fb39ef --- /dev/null +++ b/src/psm/workflows/bmad-psm-incident-response/SKILL.md @@ -0,0 +1,6 @@ +--- +name: bmad-psm-incident-response +description: 'Handle production incidents with systematic triage, diagnosis, and recovery. Use when the user says "production is down" or "incident response" or "we have an outage"' +--- + +Follow the instructions in [workflow.md](workflow.md). 
diff --git a/src/psm/workflows/bmad-psm-incident-response/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-incident-response/bmad-skill-manifest.yaml new file mode 100644 index 000000000..d0f08abdb --- /dev/null +++ b/src/psm/workflows/bmad-psm-incident-response/bmad-skill-manifest.yaml @@ -0,0 +1 @@ +type: skill diff --git a/src/psm/workflows/bmad-psm-incident-response/incident-postmortem.template.md b/src/psm/workflows/bmad-psm-incident-response/incident-postmortem.template.md new file mode 100644 index 000000000..13904eb17 --- /dev/null +++ b/src/psm/workflows/bmad-psm-incident-response/incident-postmortem.template.md @@ -0,0 +1,269 @@ +--- +template_name: incident-postmortem +template_version: "1.0.0" +created_date: 2026-03-17 +description: Standard postmortem template for incident analysis and learning +--- + +# Incident Postmortem: {{INCIDENT_TITLE}} + +**Date**: {{INCIDENT_DATE}} +**Duration**: {{START_TIME}} β€” {{END_TIME}} ({{DURATION_MINUTES}} minutes) +**Severity**: {{SEV1|SEV2|SEV3}} ({{IMPACT_DESCRIPTION}}) +**Lead**: {{INCIDENT_COMMANDER_NAME}} +**Facilitator**: {{POSTMORTEM_FACILITATOR_NAME}} + +--- + +## Summary + +[1-2 paragraph executive summary of what happened, impact, and resolution] + +**Timeline at a glance**: +- T-0:00 β€” Normal operation +- T-{{TIME1}} β€” {{EVENT1}} +- T-{{TIME2}} β€” {{EVENT2}} +- T-{{RESOLUTION_TIME}} β€” Incident resolved + +**Impact**: {{METRIC1}} affected {{X}} users, {{METRIC2}}, {{METRIC3}} + +--- + +## Detailed Timeline + +| Time | Event | Notes | +|------|-------|-------| +| {{T}} | {{What happened}} | {{Who detected it}} | +| {{T+X}} | {{Next event}} | {{Action taken}} | +| {{T+Y}} | {{Root cause identified}} | {{By whom}} | +| {{T+Z}} | {{Fix applied}} | {{Verification steps}} | +| {{T+Final}} | {{Incident resolved}} | {{Verification}} | + +--- + +## Root Cause Analysis + +### Primary Cause + +**{{ROOT_CAUSE_TITLE}}** + +{{Detailed explanation of the root cause}} + +**How it happened**: +1. 
{{Precondition 1}} (why the system was vulnerable) +2. {{Trigger event}} (what caused the failure) +3. {{Failure cascade}} (why it got worse) +4. {{Detection lag}} (why it took X minutes to detect) + +**Evidence**: +- {{Log entry or metric showing the issue}} +- {{Related system behavior}} +- {{Impact indicator}} + +### Contributing Factors + +- {{Factor 1}} β€” {{Brief explanation}} +- {{Factor 2}} β€” {{Brief explanation}} +- {{Factor 3}} β€” {{Brief explanation}} + +### Why Didn't We Catch This? + +- {{Missing monitoring}} β€” {{What metric would have alerted}} +- {{Testing gap}} β€” {{What test would have failed}} +- {{Documentation gap}} β€” {{What runbook would have helped}} +- {{Knowledge gap}} β€” {{What training would have helped}} + +--- + +## Impact Assessment + +### User Impact + +- **Duration**: {{START_TIME}} β€” {{END_TIME}} ({{DURATION}} minutes) +- **Scale**: {{X}}% of {{METRIC}} (e.g., 5% of payment requests) +- **Users Affected**: {{APPROX_COUNT}} users +- **Revenue Impact**: {{$X}} (if applicable) +- **Customer Escalations**: {{NUMBER}} tickets opened + +**User-facing symptoms**: +- {{Symptom 1}} (e.g., "Checkout returns 500 error") +- {{Symptom 2}} (e.g., "Page loads slowly") +- {{Symptom 3}} + +### Operational Impact + +- **System Recovery**: {{SERVICE/METRIC}} took {{TIME}} to recover +- **Cascading Effects**: {{SERVICE_X}} also affected due to {{reason}} +- **On-call Load**: {{NUMBER}} pages, {{NUMBER}} escalations +- **Data Loss**: {{None | {{Description}}}} + +--- + +## Resolution & Recovery + +### Immediate Actions Taken + +1. **{{Time T+X}}** β€” {{Action 1}} + - Rationale: {{Why this helped}} + - Result: {{What changed}} + +2. **{{Time T+Y}}** β€” {{Action 2}} + - Rationale: {{Why this helped}} + - Result: {{What changed}} + +3. 
**{{Time T+Z}}** β€” {{Root Fix Applied}} + - Details: {{Technical description}} + - Verification: {{How we confirmed it worked}} + +### Rollback/Rollforward Decision + +**Decision**: {{Rollback to version X | Rollforward with fix | Hybrid approach}} + +**Rationale**: {{Explain why this was the right choice}} + +**Verification**: {{How we confirmed the fix worked}} + +--- + +## Lessons Learned + +### What Went Well + +- {{Thing we did right}} β€” This prevented {{worse outcome}} +- {{Thing we did right}} β€” Team coordination was excellent +- {{Thing we did right}} β€” Monitoring caught {{something}} + +### What We Can Improve + +| Issue | Category | Severity | Recommendation | Owner | +|-------|----------|----------|-----------------|-------| +| {{We didn't detect it for X minutes}} | Observability | HIGH | Add alert for {{metric}} when > {{threshold}} | DevOps | +| {{Runbook was outdated}} | Runbooks | MEDIUM | Update {{runbook}} with new architecture | SRE | +| {{New service not in alerting system}} | Process | MEDIUM | Add new services to alert config automatically | Platform | +| {{Team didn't know about new feature}} | Knowledge | LOW | Document new features in wiki | Tech Lead | + +--- + +## Action Items + +### Critical (Must Complete Before Similar Incident) + +- [ ] **{{Action 1}}** β€” {{Description}} + - Owner: {{NAME}} + - Deadline: {{DATE}} (within 1 week) + - Acceptance: {{How we verify it's done}} + +- [ ] **{{Action 2}}** β€” {{Description}} + - Owner: {{NAME}} + - Deadline: {{DATE}} (within 1 week) + - Acceptance: {{How we verify it's done}} + +### High Priority (Target Next 2 Weeks) + +- [ ] {{Action}} β€” Owner: {{NAME}}, Deadline: {{DATE}} +- [ ] {{Action}} β€” Owner: {{NAME}}, Deadline: {{DATE}} +- [ ] {{Action}} β€” Owner: {{NAME}}, Deadline: {{DATE}} + +### Medium Priority (Target This Sprint) + +- [ ] {{Action}} β€” Owner: {{NAME}} +- [ ] {{Action}} β€” Owner: {{NAME}} + +### Backlog (Good to Have) + +- [ ] {{Action}} β€” {{Description}} +- 
[ ] {{Action}} — {{Description}} + +--- + +## Prevention Measures + +### Short-term (1-2 Weeks) + +1. **{{Mitigation 1}}** — Prevents {{this exact incident}} from happening again + - How: {{Technical approach}} + - Effort: {{Estimate}} + - Timeline: {{When}} + +2. **{{Mitigation 2}}** — Catches similar issues earlier + - How: {{Technical approach}} + - Effort: {{Estimate}} + - Timeline: {{When}} + +### Long-term (Next Quarter) + +1. **{{Large architectural change}}** — Eliminates root cause class + - Rationale: {{Why this is better}} + - Effort: {{Estimate}} + - Timeline: {{When}} + +--- + +## Incident Stats + +``` +MTTD (Mean Time To Detect): {{MINUTES}} minutes + - Automatic detection: {{If applicable, how}} + - Manual detection: {{Who found it}} + +MTTR (Mean Time To Resolve): {{MINUTES}} minutes + - Investigation time: {{MINUTES}} + - Fix implementation time: {{MINUTES}} + - Verification time: {{MINUTES}} + +Severity: {{SEV1|SEV2|SEV3}} ({{Criteria}}) +``` + +--- + +## Distribution & Follow-up + +- [ ] Postmortem shared with: {{TEAM_LIST}} +- [ ] Customer communication sent: {{YES|NO|TEMPLATE_USED}} +- [ ] Action items tracked in: {{JIRA/BACKLOG}} +- [ ] Follow-up review scheduled: {{DATE}} + +**Follow-up Review**: {{DATE}} with {{ATTENDEES}} +- Confirm all critical action items completed +- Verify prevention measures working +- Check for recurring patterns + +--- + +## Appendix: Supporting Evidence + +### Logs + +``` +[Relevant log entries showing the incident] + +{{TIMESTAMP}} ERROR: {{MESSAGE}} +{{TIMESTAMP}} ERROR: {{MESSAGE}} +``` + +### Metrics + +[Include screenshots or links to metric dashboards showing the incident] + +- Error rate spike: [Chart or metric] +- Latency spike: [Chart or metric] +- Traffic pattern: [Chart or metric] + +### Configuration Changes + +```yaml +# Changes made before incident +- {{Change 1}} ({{TIMESTAMP}}) +- {{Change 2}} ({{TIMESTAMP}}) +``` + +--- + +**Document Completed By**: {{NAME}} +**Date**: {{DATE}} +**Review 
Status**: Draft | Final | Approved + +**Approvals**: +- [ ] Incident Commander: {{NAME}} {{DATE}} +- [ ] Service Owner: {{NAME}} {{DATE}} +- [ ] VP Engineering (if SEV1): {{NAME}} {{DATE}} diff --git a/src/psm/workflows/bmad-psm-incident-response/workflow.md b/src/psm/workflows/bmad-psm-incident-response/workflow.md new file mode 100644 index 000000000..2c7fa4541 --- /dev/null +++ b/src/psm/workflows/bmad-psm-incident-response/workflow.md @@ -0,0 +1,163 @@ +--- +workflow_id: W-INCIDENT-001 +workflow_name: Production Incident Response +version: 6.2.0 +lead_agent: "SRE Minh" +supporting_agents: ["Architect Khang", "Mary Analyst"] +phase: "3-Run: Emergency Response & Recovery" +created_date: 2026-03-17 +last_modified: 2026-03-17 +config_file: "_config/config.yaml" +estimated_duration: "15 minutes to 2 hours (depending on severity)" +outputFile: '{output_folder}/psm-artifacts/incident-{{project_name}}-{{date}}.md' +--- + +# Production Incident Response Workflow β€” BMAD Pattern + +## Metadata & Context + +**Goal**: Triage, diagnose, resolve production incidents through systematic diagnosis and apply fixes with verification. This is the most critical workflow - minimize MTTR (Mean Time To Recovery) while maintaining system stability. + +**Lead Team**: +- SRE Minh (Incident Command, Recovery Orchestration) +- Architect Khang (Root Cause Analysis, System-wide Impact) +- Mary Analyst (Impact Assessment, Post-Incident Review) + +**Success Criteria**: +- βœ“ Incident severity classified within 5 minutes +- βœ“ Root cause identified within first triage pass +- βœ“ Fix applied and verified +- βœ“ System metrics returned to baseline +- βœ“ Incident postmortem documented with action items +- βœ“ Prevention measures identified + +## Workflow Overview + +Workflow nΓ y di qua 4 bΖ°α»›c atomic, mα»—i bΖ°α»›c focus vΓ o mα»™t phase khΓ‘c nhau: + +1. **Step-01-Triage** β†’ Gather initial info, assess severity, classify impact +2. 
**Step-02-Diagnose** β†’ Systematic diagnosis using observability data (logs, metrics, traces) +3. **Step-03-Fix** β†’ Apply fix, verify resolution, validate recovery +4. **Step-04-Postmortem** β†’ Document incident, identify action items, prevent recurrence + +## Configuration Loading + +Tα»± Δ‘α»™ng load tα»« `_config/config.yaml`: + +```yaml +project_context: + organization: "[loaded from config]" + environment: "production" + incident_channel: "slack:#incidents" + +workflow_defaults: + communication_language: "Vietnamese-English" + severity_levels: ["SEV1", "SEV2", "SEV3", "SEV4"] + escalation_contacts: "[loaded from config]" + on_call_engineer: "[loaded from config]" +``` + +## Workflow Architecture - Micro-File Design + +BMAD pattern: Mα»—i step lΓ  mα»™t file riΓͺng, load just-in-time. Workflow chain: + +``` +workflow.md (entry point) + ↓ +step-01-triage.md (classify severity, initial assessment) + ↓ +step-02-diagnose.md (root cause analysis) + ↓ +step-03-fix.md (apply fix, verify) + ↓ +step-04-postmortem.md (document, prevent) + ↓ +incident-response-summary.md (final output) +``` + +**Key Benefits**: +- Single-step focus β€” engineer concentrates on one phase +- Knowledge isolation β€” load only relevant SKILL docs per step +- State tracking β€” save progress after each step +- Easy resumption β€” if interrupted, restart from exact step + +## Skill References + +Workflow nΓ y load knowledge tα»«: + +- **5.07 Reliability & Resilience** β†’ Circuit breaker patterns, fallback strategies, timeout management +- **5.08 Observability & Monitoring** β†’ Structured logging, metrics queries, distributed tracing +- **5.09 Error Handling & Recovery** β†’ Error classification, graceful degradation patterns +- **5.10 Production Readiness** β†’ Incident prevention checklist, alerting setup +- **5.14 Documentation & Runbooks** β†’ Postmortem templates, incident reports + +## Execution Model + +### Entry Point Logic + +``` +1. 
Check if incident session exists + β†’ If NEW incident: Start from step-01-triage.md + β†’ If ONGOING: Load incident-session.yaml β†’ continue from last completed step + β†’ If RESOLVED: Load postmortem template + +2. For each step: + a) Load step-{N}-{name}.md + b) Load referenced SKILL files (auto-parse "Load:" directives) + c) Execute MENU [A][C] options + d) Save step output to step-{N}-output.md + incident-context.yaml + e) Move to next step or conclude + +3. Final: Generate incident report + postmortem in outputs folder +``` + +### State Tracking + +Incident session frontmatter tracks progress: + +```yaml +incident_context: + incident_id: "INC-2026-03-17-001" + severity: "SEV1" | "SEV2" | "SEV3" | "SEV4" + status: "triage" β†’ "diagnosing" β†’ "recovering" β†’ "resolved" β†’ "postmortem" + affected_services: ["service-1", "service-2"] + started_at: "2026-03-17T14:30:00Z" + timeline: + detected_at: "2026-03-17T14:30:00Z" + triage_completed_at: "2026-03-17T14:35:00Z" + root_cause_identified_at: "2026-03-17T14:50:00Z" + fix_applied_at: "2026-03-17T15:10:00Z" + resolved_at: "2026-03-17T15:15:00Z" + current_step: "step-02-diagnose" + last_updated: "2026-03-17T14:50:00Z" + incident_commander: "SRE Minh" +``` + +## Mandatory Workflow Rules + +1. **Speed first** β€” Triage must complete in < 5 minutes +2. **Root cause identification** β€” Must identify root cause before fix attempt +3. **Verify before declaring resolved** β€” Check metrics + user reports +4. **Document everything** β€” Every action logged for postmortem +5. **Escalation protocol** β€” SEV1 β†’ Page on-call architect immediately +6. **Communication** β€” Update stakeholders every 5-10 minutes +7. 
**No flying blind** — All fixes must reference observability data + +## Severity Scale + +- **SEV1** — Service completely down, revenue impact, > 1% users affected → Page all on-call +- **SEV2** — Major degradation, significant users affected, partial functionality down +- **SEV3** — Moderate impact, some users affected, workaround possible +- **SEV4** — Minor issue, limited users, can defer to business hours + +## Navigation + +Hãy chọn cách bắt đầu: + +- **[NEW-INC]** — Report new incident → Load step-01-triage +- **[RESUME-INC]** — Continue existing incident (detect progress from incident-session.yaml) +- **[ESCALATE]** — Escalate to on-call architect + +--- + +**Hãy báo cáo tình trạng incident hoặc chọn [NEW-INC] để bắt đầu triage** diff --git a/src/psm/workflows/bmad-psm-mlops-deployment/SKILL.md b/src/psm/workflows/bmad-psm-mlops-deployment/SKILL.md new file mode 100644 index 000000000..e24eade5c --- /dev/null +++ b/src/psm/workflows/bmad-psm-mlops-deployment/SKILL.md @@ -0,0 +1,6 @@ +--- +name: bmad-psm-mlops-deployment +description: 'Deploy ML model to production with validation and monitoring. Use when the user says "deploy model" or "ML deployment" or "model serving"' +--- + +Follow the instructions in [workflow.md](workflow.md). 
diff --git a/src/psm/workflows/bmad-psm-mlops-deployment/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-mlops-deployment/bmad-skill-manifest.yaml new file mode 100644 index 000000000..d0f08abdb --- /dev/null +++ b/src/psm/workflows/bmad-psm-mlops-deployment/bmad-skill-manifest.yaml @@ -0,0 +1 @@ +type: skill diff --git a/src/psm/workflows/bmad-psm-mlops-deployment/workflow.md b/src/psm/workflows/bmad-psm-mlops-deployment/workflow.md new file mode 100644 index 000000000..a682e6344 --- /dev/null +++ b/src/psm/workflows/bmad-psm-mlops-deployment/workflow.md @@ -0,0 +1,89 @@ +--- +workflow_id: MLOPS001 +workflow_name: MLOps Deployment +description: Deploy ML model to production with validation, serving, and monitoring +entry_point: steps/step-01-model-validation.md +phase: 5-specialized +lead_agent: "Linh (MLOps)" +status: "active" +created_date: 2026-03-17 +version: "1.0.0" +estimated_duration: "3-4 hours" +outputFile: '{output_folder}/psm-artifacts/mlops-deploy-{{project_name}}-{{date}}.md' +--- + +# Workflow: MLOps Deployment + +## Goal +Deploy machine learning models to production with comprehensive validation, infrastructure setup, and post-deployment monitoring. + +## Overview + +MLOps deployment ensures ML models are production-ready and continuously monitored for performance and data drift. The workflow: + +1. **Validates** model quality, performance metrics, and data drift detection +2. **Deploys** model to serving infrastructure with versioning and A/B testing +3. 
**Monitors** model performance, data drift, and cost metrics post-deployment + +## Execution Path + +``` +START + ↓ +[Step 01] Model Validation (Check metrics, data drift, A/B test plan) + ↓ +[Step 02] Deploy Model (Setup serving, infrastructure, GPU optimization) + ↓ +[Step 03] Monitor (Langfuse/MLflow, drift detection, cost tracking) + ↓ +END +``` + +## Key Roles + +| Role | Agent | Responsibility | +|------|-------|-----------------| +| Lead | Linh (MLOps) | Coordinate deployment, monitor model health | +| Data Scientist | Data Lead | Validate model quality, approve for production | +| DevOps | Platform Eng | Setup infrastructure, manage resources | + +## Validation Gates (3) + +1. **Model Quality** β€” Accuracy, precision, recall metrics meet SLO +2. **Data Quality** β€” No data drift detected; training/production data distribution aligned +3. **Business Readiness** β€” A/B test plan ready, rollback strategy defined + +## Input Requirements + +- **Trained model artifact** β€” Model checkpoint, weights, configuration +- **Performance metrics** β€” Baseline accuracy, latency, throughput expectations +- **Data validation** β€” Training dataset description, expected data distribution +- **Serving infrastructure** β€” Compute requirements (GPU/CPU), latency targets + +## Output Deliverable + +- **MLOps Deployment Report** + - Model version and metadata + - Performance validation summary + - Serving infrastructure setup + - Monitoring dashboard and alerts + - Data drift detection configuration + +## Success Criteria + +1. Model passes all quality gates before deployment +2. Serving infrastructure deployed and load-tested +3. Monitoring and alerting configured and validated +4. Rollback strategy tested and documented +5. 
Team trained on model updates and incident response + +## Next Steps After Workflow + +- Monitor model performance daily for first week +- Track data drift metrics; alert if detected +- Plan model retraining based on performance degradation +- Document lessons learned in MLOps runbook + +--- + +**Navigation**: [← Back to 5-specialized](../), [Next: Step 01 β†’](steps/step-01-model-validation.md) diff --git a/src/psm/workflows/bmad-psm-production-readiness/SKILL.md b/src/psm/workflows/bmad-psm-production-readiness/SKILL.md new file mode 100644 index 000000000..d3444d26a --- /dev/null +++ b/src/psm/workflows/bmad-psm-production-readiness/SKILL.md @@ -0,0 +1,6 @@ +--- +name: bmad-psm-production-readiness +description: 'Run production readiness review across 9 dimensions. Use when the user says "are we ready for production" or "PRR" or "go-live check"' +--- + +Follow the instructions in [workflow.md](workflow.md). diff --git a/src/psm/workflows/bmad-psm-production-readiness/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-production-readiness/bmad-skill-manifest.yaml new file mode 100644 index 000000000..d0f08abdb --- /dev/null +++ b/src/psm/workflows/bmad-psm-production-readiness/bmad-skill-manifest.yaml @@ -0,0 +1 @@ +type: skill diff --git a/src/psm/workflows/bmad-psm-production-readiness/production-readiness.template.md b/src/psm/workflows/bmad-psm-production-readiness/production-readiness.template.md new file mode 100644 index 000000000..5ba0f47d9 --- /dev/null +++ b/src/psm/workflows/bmad-psm-production-readiness/production-readiness.template.md @@ -0,0 +1,367 @@ +--- +template_name: production-readiness-checklist +template_version: "1.0.0" +created_date: 2026-03-17 +description: Production Readiness Review checklist and report template +--- + +# Production Readiness Review (PRR) + +**Service**: {{SERVICE_NAME}} +**Owner**: {{SERVICE_OWNER}} +**Reviewer**: {{SRE_LEAD}} (Minh) +**Review Date**: {{DATE}} +**Target Go-Live**: {{TARGET_DATE}} + +--- + +## 
Executive Summary + +{{1-2 paragraphs summarizing the readiness assessment, decision, and key findings}} + +**Overall Assessment**: {{READY | CONDITIONAL | NOT_READY}} + +**Timeline**: Service {{can | can conditionally | cannot}} proceed to production on {{DATE}} + +--- + +## Production Readiness Scorecard + +### 9-Dimension Assessment + +| # | Dimension | Score | Status | Key Finding | +|---|-----------|-------|--------|-------------| +| 1 | Reliability | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 2 | Observability | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 3 | Performance | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 4 | Security | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 5 | Capacity | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 6 | Data | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 7 | Runbooks | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 8 | Dependencies | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | +| 9 | Rollback | {{GREEN\|YELLOW\|RED}} | ✅/⚠️/❌ | {{Brief finding}} | + +**Summary**: {{X}} GREEN, {{Y}} YELLOW, {{Z}} RED + +--- + +## Detailed Findings by Dimension + +### 1. Reliability + +**Goal**: Service meets SLO targets with documented failure modes and incident response plan. + +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 2. Observability + +**Goal**: Service has comprehensive logging, metrics, tracing, and dashboards for operational visibility. + +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 3. Performance + +**Goal**: Service meets latency/throughput targets and scales under expected load.
+ +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 4. Security + +**Goal**: Authentication, authorization, encryption, and secrets management are implemented. + +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 5. Capacity + +**Goal**: Resource requirements defined with growth headroom and cost acceptable. + +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 6. Data + +**Goal**: Data governance, backup, retention, and disaster recovery documented and tested. + +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 7. Runbooks + +**Goal**: Incident response, deployment, troubleshooting procedures documented and drilled. + +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 8. Dependencies + +**Goal**: External/internal dependencies mapped, versioned, with fallback strategies. + +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +### 9. Rollback + +**Goal**: Safe rollback strategy tested; deployment is reversible. 
+ +**Findings**: + +- [ ] {{Finding 1}} ({{Status}}) +- [ ] {{Finding 2}} ({{Status}}) +- [ ] {{Finding 3}} ({{Status}}) + +**Assessment**: {{Detailed narrative, 3-5 sentences}} + +**Score**: {{GREEN|YELLOW|RED}} + +--- + +## Critical Blockers (P0) + +{{If any P0 blockers exist:}} + +Service **CANNOT** proceed to production until these are resolved: + +### P0 Blocker #1: {{ISSUE_TITLE}} + +- **Dimension**: {{Which dimension}} +- **Description**: {{What's the problem}} +- **Impact**: {{Why it's critical}} +- **Resolution**: {{How to fix}} +- **Owner**: {{Who must fix it}} +- **Deadline**: {{When it must be done}} +- **Acceptance**: {{How we verify it's fixed}} + +### P0 Blocker #2: {{ISSUE_TITLE}} + +{{Repeat format}} + +--- + +## Risks to Manage (P1) + +Service can proceed with documented monitoring and contingency plans: + +### P1 Risk #1: {{ISSUE_TITLE}} + +- **Dimension**: {{Which dimension}} +- **Description**: {{What's the problem}} +- **Impact**: {{If it happens, what's the consequence}} +- **Likelihood**: {{HIGH|MEDIUM|LOW}} +- **Mitigation**: {{How we'll manage it}} +- **Monitoring**: {{What metrics to watch}} +- **Contingency**: {{What we'll do if it occurs}} +- **Owner**: {{Who owns this risk}} +- **Target Fix**: {{Timeline to resolve permanently}} + +### P1 Risk #2: {{ISSUE_TITLE}} + +{{Repeat format}} + +--- + +## Recommendations + +**High Priority** (Next sprint): +- {{Recommendation 1}} +- {{Recommendation 2}} + +**Medium Priority** (Within 1 month): +- {{Recommendation 1}} +- {{Recommendation 2}} + +**Nice to Have** (Backlog): +- {{Recommendation 1}} +- {{Recommendation 2}} + +--- + +## Final Decision + +### Decision + +**{{ βœ… GO | ⚠️ CONDITIONAL-GO | ❌ NO-GO }}** + +### Rationale + +{{Explain the decision. Why can/can't we proceed?}} + +### Conditions (If CONDITIONAL-GO) + +If proceeding despite P1 risks, document conditions: + +1. 
**{{Condition 1}}**: {{Description}} + - Owner: {{Who oversees this}} + - Success Criteria: {{How we verify it}} + - Escalation: {{Who to contact if issues}} + +2. **{{Condition 2}}**: {{Description}} + - Owner: {{Who oversees this}} + - Success Criteria: {{How we verify it}} + - Escalation: {{Who to contact if issues}} + +### Deployment Timeline + +{{If GO or CONDITIONAL-GO:}} + +- **Approved for deployment**: {{DATE}} +- **Earliest go-live**: {{DATE}} +- **Recommended window**: {{DATE/TIME}} +- **On-call coverage required**: {{YES|NO}} +- **Emergency rollback plan**: {{REFERENCE TO RUNBOOK}} + +--- + +## Sign-offs & Approvals + +### Approval Chain + +- [ ] **SRE Lead** ({{NAME}}) — Review completed and findings approved + - Signature: ________________________ Date: __________ + +- [ ] **Architecture Lead** ({{NAME}}) — Architecture validated + - Signature: ________________________ Date: __________ + +- [ ] **Service Owner** ({{NAME}}) — Acknowledged findings and committed to actions + - Signature: ________________________ Date: __________ + +- [ ] **VP Engineering** ({{NAME}}) — Risk accepted (if CONDITIONAL-GO) + - Signature: ________________________ Date: __________ + +--- + +## Post-Production Plan + +### First 24 Hours + +- [ ] SRE on-call monitoring closely +- [ ] Daily standup with service team +- [ ] Monitor for any unusual patterns +- [ ] Be ready to roll back if needed + +### First Week + +- [ ] Daily metrics review +- [ ] Watch for data drift or unusual behavior +- [ ] Follow up on any P1 risks + +### Ongoing + +- [ ] Monthly PRR follow-ups to verify improvements +- [ ] Track action items to completion +- [ ] Update this PRR if significant changes made + +--- + +## Action Items + +| ID | Action | Owner | Deadline | Type | Status | +|----|--------|-------|----------|------|--------| +| A1 | {{Action}} | {{Name}} | {{Date}} | {{BLOCKER\|RISK\|RECOMMENDATION}} | ☐ | +| A2 | {{Action}} | {{Name}} | {{Date}} | {{BLOCKER\|RISK\|RECOMMENDATION}} | ☐ | +| A3
| {{Action}} | {{Name}} | {{Date}} | {{BLOCKER\|RISK\|RECOMMENDATION}} | ☐ | + +--- + +## Appendix + +### A. Load Test Results + +[Link to or summary of load test results showing service meets performance targets] + +### B. Security Review Results + +[Link to or summary of security audit findings] + +### C. Architecture Diagrams + +[Include or link to system architecture, data flow, and deployment topology] + +### D. SLO Definition + +[Document the agreed-upon SLO targets for availability, latency, error rate] + +### E. Runbooks + +[Link to or list of key runbooks: incident response, deployment, rollback, troubleshooting] + +--- + +**Report prepared by**: {{SRE_LEAD}} +**Report date**: {{DATE}} +**Last updated**: {{DATE}} diff --git a/src/psm/workflows/bmad-psm-production-readiness/workflow.md b/src/psm/workflows/bmad-psm-production-readiness/workflow.md new file mode 100644 index 000000000..b64ed8e6e --- /dev/null +++ b/src/psm/workflows/bmad-psm-production-readiness/workflow.md @@ -0,0 +1,92 @@ +--- +workflow_id: PRR001 +workflow_name: Production Readiness Review +description: Validate service is ready for production using comprehensive readiness checklist +entry_point: steps/step-01-init-checklist.md +phase: 3-run +lead_agent: "Minh (SRE)" +status: "active" +created_date: 2026-03-17 +version: "1.0.0" +estimated_duration: "2-3 hours" +outputFile: '{output_folder}/psm-artifacts/prr-{{project_name}}-{{date}}.md' +--- + +# Workflow: Production Readiness Review (PRR) + +## Goal +Validate and certify that a service meets production readiness standards across 9 key dimensions before deployment. + +## Overview + +This workflow systematically evaluates a service against production readiness criteria defined in the Production Systems BMAD skill framework. Using SRE expertise and architectural patterns, the workflow: + +1. **Initializes** the PRR process with service context and dimensional overview +2.
**Deep reviews** each dimension (reliability, observability, performance, security, capacity, data, runbooks, dependencies, rollback) +3. **Renders final decision** with GO/NO-GO/CONDITIONAL-GO recommendation + +## Execution Path + +``` +START + ↓ +[Step 01] Init Checklist (Load framework, gather service context, present dimensions) + ↓ +[Step 02] Deep Review (Score each dimension, identify blockers, recommendations) + ↓ +[Step 03] Final Decision (Scorecard, decision, action items, DONE) + ↓ +END +``` + +## Key Roles + +| Role | Agent | Responsibility | +|------|-------|-----------------| +| Lead | Minh (SRE) | Navigate workflow, coordinate review, make final call | +| Subject Matter | Service Owner | Provide service context, clarify architecture | +| Review Committee | Arch, SecOps, MLOps | Contribute expertise on specific dimensions | + +## Dimensions Evaluated (9) + +1. **Reliability** β€” SLA/SLO definition, error budgets, failure modes, incident response +2. **Observability** β€” Logging, metrics, tracing, dashboards, alerting +3. **Performance** β€” Latency targets, throughput, P99 tail behavior, optimization opportunities +4. **Security** β€” Auth/authz, secrets management, encryption, audit logging, compliance +5. **Capacity** β€” Resource limits, scaling policies, burst capacity, cost projections +6. **Data** β€” Schema versioning, backup/restore, data governance, retention policies +7. **Runbooks** β€” Incident runbooks, operational playbooks, troubleshooting guides +8. **Dependencies** β€” External services, internal libraries, database versioning, API contracts +9. **Rollback** β€” Rollback strategy, canary deployment, feature flags, smoke tests + +## Input Requirements + +- **Service name and owner** β€” Which service are we evaluating? 
+- **Current architecture** β€” High-level design, tech stack, topology +- **Existing metrics/dashboards** β€” Links to monitoring, SLO definitions +- **Known gaps/risks** β€” Already identified issues to address + +## Output Deliverable + +- **Production Readiness Checklist** (template: `production-readiness.template.md`) + - Scorecard with 9 dimensions (red/yellow/green) + - Blockers and recommendations per dimension + - Final GO/NO-GO/CONDITIONAL-GO decision + - Explicit action items with owners and deadlines + +## Success Criteria + +1. All 9 dimensions evaluated with clear rationale +2. Blockers categorized as P0 (must fix) or P1 (should fix) +3. Team alignment on decision (documented in PRR report) +4. Action plan with clear accountability and timeline + +## Next Steps After Workflow + +- If **GO**: Proceed to deployment; document in CHANGELOG +- If **NO-GO**: Reschedule PRR once blockers addressed; track in backlog +- If **CONDITIONAL-GO**: Deploy with documented caveats; setup monitoring for risk areas + +--- + +**Navigation**: [← Back to 3-run](../), [Next: Step 01 β†’](steps/step-01-init-checklist.md) diff --git a/src/psm/workflows/bmad-psm-quick-diagnose/SKILL.md b/src/psm/workflows/bmad-psm-quick-diagnose/SKILL.md new file mode 100644 index 000000000..be9f69f77 --- /dev/null +++ b/src/psm/workflows/bmad-psm-quick-diagnose/SKILL.md @@ -0,0 +1,6 @@ +--- +name: bmad-psm-quick-diagnose +description: 'Quick diagnosis of production issue with minimal latency. Use when the user says "something is broken" or "quick diagnose" or "what is happening?"' +--- + +Follow the instructions in [workflow.md](workflow.md). 
diff --git a/src/psm/workflows/bmad-psm-quick-diagnose/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-quick-diagnose/bmad-skill-manifest.yaml new file mode 100644 index 000000000..d0f08abdb --- /dev/null +++ b/src/psm/workflows/bmad-psm-quick-diagnose/bmad-skill-manifest.yaml @@ -0,0 +1 @@ +type: skill diff --git a/src/psm/workflows/bmad-psm-quick-diagnose/workflow.md b/src/psm/workflows/bmad-psm-quick-diagnose/workflow.md new file mode 100644 index 000000000..dc88a7fff --- /dev/null +++ b/src/psm/workflows/bmad-psm-quick-diagnose/workflow.md @@ -0,0 +1,80 @@ +--- +workflow_id: QD001 +workflow_name: Quick Diagnose +description: Fast diagnosis of production issue with root cause and fix suggestion +entry_point: steps/step-01-gather.md +phase: quick-flow +lead_agent: "Minh (SRE)" +status: "active" +created_date: 2026-03-17 +version: "1.0.0" +estimated_duration: "15-25 minutes" +outputFile: '{output_folder}/psm-artifacts/quick-diagnose-{{date}}.md' +--- + +# Workflow: Quick Diagnose Production Issue + +## Goal +Rapidly diagnose production issues by gathering symptom data, checking metrics, and suggesting fixes. + +## Overview + +Quick Diagnose is a lightweight workflow for time-sensitive production troubleshooting: + +1. **Gathers** symptom description and quick metrics check +2. **Diagnoses** root cause using observability data +3. **Suggests** fix or mitigation immediately + +## Execution Path + +``` +START + ↓ +[Step 01] Gather Context (What's broken? Check metrics) + ↓ +[Step 02] Diagnose & Fix (Root cause analysis β†’ fix suggestion β†’ verify) + ↓ +END +``` + +## Key Roles + +| Role | Agent | +|------|-------| +| Lead | Minh (SRE) | + +## Input Requirements + +- **Symptom description** β€” What is failing? (error message, behavior, timeline) +- **Affected service/component** β€” What system is broken? +- **Timeline** β€” When did it start? Is it ongoing? +- **Impact** β€” How many users affected? Is revenue impacted? 
+ +## Output Deliverable + +- **Quick Diagnosis Report** (markdown, 1-2 pages) + - Symptom analysis + - Root cause hypothesis + - Immediate mitigation (if needed) + - Fix suggestion with effort + - Follow-up actions + +## Success Criteria + +1. Root cause identified within 15-20 minutes +2. Immediate mitigation available (if needed) +3. Fix suggestion documented with clear steps +4. Team knows what to do next + +## Quick Diagnose vs Full Production Readiness Review + +| Aspect | Quick Diagnose | Full PRR | +|--------|---|---| +| Trigger | Active incident | Pre-deployment | +| Duration | 15-25 min | 2-3 hours | +| Scope | Single issue | All 9 dimensions | +| Goal | Fix now | Prevent issues | + +--- + +**Navigation**: [← Back to quick-flow](../), [Next: Step 01 β†’](steps/step-01-gather.md) diff --git a/src/psm/workflows/bmad-psm-security-audit/SKILL.md b/src/psm/workflows/bmad-psm-security-audit/SKILL.md new file mode 100644 index 000000000..7da323eb1 --- /dev/null +++ b/src/psm/workflows/bmad-psm-security-audit/SKILL.md @@ -0,0 +1,6 @@ +--- +name: bmad-psm-security-audit +description: 'Run comprehensive security audit and threat assessment. Use when the user says "security audit" or "vulnerability assessment" or "security review"' +--- + +Follow the instructions in [workflow.md](workflow.md). 
diff --git a/src/psm/workflows/bmad-psm-security-audit/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-security-audit/bmad-skill-manifest.yaml new file mode 100644 index 000000000..d0f08abdb --- /dev/null +++ b/src/psm/workflows/bmad-psm-security-audit/bmad-skill-manifest.yaml @@ -0,0 +1 @@ +type: skill diff --git a/src/psm/workflows/bmad-psm-security-audit/security-audit-report.template.md b/src/psm/workflows/bmad-psm-security-audit/security-audit-report.template.md new file mode 100644 index 000000000..a4127603e --- /dev/null +++ b/src/psm/workflows/bmad-psm-security-audit/security-audit-report.template.md @@ -0,0 +1,502 @@ +--- +template_name: security-audit-report +template_version: "1.0.0" +created_date: 2026-03-17 +description: Security audit report with findings, severity levels, and remediation plan +--- + +# Security Audit Report + +**Service**: {{SERVICE_NAME}} +**Service Owner**: {{SERVICE_OWNER}} +**Auditor**: {{SECURITY_LEAD}} (HΓ ) +**Audit Date**: {{START_DATE}} β€” {{END_DATE}} +**Report Date**: {{REPORT_DATE}} +**Scope**: {{SCOPE_DESCRIPTION}} + +--- + +## Executive Summary + +This security audit evaluated {{SERVICE_NAME}} against security best practices and compliance requirements. The assessment identified {{X}} findings across {{Y}} security domains. 
+ +**Overall Security Posture**: {{COMPLIANT | FINDINGS | CRITICAL}} + +{{1-2 paragraph summary of key findings, critical issues if any, and recommendations}} + +--- + +## Audit Scope + +### Services Reviewed + +- {{Service 1}} ({{Description}}) +- {{Service 2}} ({{Description}}) +- {{Service 3}} ({{Description}}) + +### Assessment Domains + +- ✅ Authentication & Authorization +- ✅ API Security +- ✅ Secrets Management +- ✅ Encryption (in-transit & at-rest) +- ✅ PII & Data Protection + +### Exclusions + +{{Any out-of-scope areas:}} +- {{Item}} (reason) +- {{Item}} (reason) + +--- + +## Findings Summary + +### By Severity + +| Severity | Count | Trend | +|----------|-------|-------| +| **Critical** | {{X}} | {{↑/→/↓}} | +| **High** | {{Y}} | {{↑/→/↓}} | +| **Medium** | {{Z}} | {{↑/→/↓}} | +| **Low** | {{W}} | {{↑/→/↓}} | +| **Total** | {{X+Y+Z+W}} | | + +### By Domain + +| Domain | Critical | High | Medium | Low | Status | +|--------|----------|------|--------|-----|--------| +| Auth & Authz | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | +| API Security | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | +| Secrets Mgmt | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | +| Encryption | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | +| PII & Data | {{#}} | {{#}} | {{#}} | {{#}} | ✅/⚠️/❌ | + +--- + +## Critical Severity Findings + +### [F1] {{Finding Title}} + +**Severity**: CRITICAL (CVSS {{9.0-10.0}}) +**Domain**: {{Which domain}} +**Status**: {{Open | In Progress | Resolved}} + +**Description**: +{{Detailed description of the vulnerability, how it could be exploited, and the impact}} + +**Evidence**: +- {{Evidence 1}} +- {{Evidence 2}} +- {{Testing confirmation}} + +**Impact**: +- {{Business impact}} +- {{Technical impact}} +- {{Compliance impact}} + +**Remediation**: +1. {{Step 1}} ({{Estimated time}}) +2. {{Step 2}} ({{Estimated time}}) +3. {{Step 3}} ({{Estimated time}}) + +**Owner**: {{Name}} +**Target Fix Date**: {{DATE}} +**Effort**: {{Est.
hours/days}} +**Verification**: {{How we'll confirm it's fixed}} + +--- + +### [F2] {{Finding Title}} + +{{Repeat Critical severity format}} + +--- + +## High Severity Findings + +### [F3] {{Finding Title}} + +**Severity**: HIGH (CVSS {{7.0-8.9}}) +**Domain**: {{Which domain}} +**Status**: {{Open | In Progress | Resolved}} + +**Description**: {{Brief description}} + +**Impact**: {{Why it matters}} + +**Remediation**: +1. {{Step 1}} +2. {{Step 2}} + +**Owner**: {{Name}} +**Target Date**: {{DATE}} + +--- + +### [F4] {{Finding Title}} + +{{Repeat High severity format}} + +--- + +## Medium Severity Findings + +### [F5] {{Finding Title}} + +**Severity**: MEDIUM (CVSS {{4.0-6.9}}) +**Domain**: {{Which domain}} +**Description**: {{Brief description}} +**Remediation**: {{Brief fix}} +**Owner**: {{Name}} | **Target Date**: {{DATE}} + +--- + +### [F6] {{Finding Title}} + +{{Repeat Medium severity format}} + +--- + +## Low Severity Findings + +### [F7] {{Finding Title}} + +**Severity**: LOW (CVSS {{0.1-3.9}}) +**Description**: {{Brief description}} +**Remediation**: {{Brief fix}} + +--- + +### [F8] {{Finding Title}} + +{{Repeat Low severity format}} + +--- + +## Domain-Specific Assessment + +### Domain 1: Authentication & Authorization + +**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} + +**Strengths**: +- {{Positive finding 1}} +- {{Positive finding 2}} + +**Gaps**: +- {{Gap 1}} — {{Impact}} +- {{Gap 2}} — {{Impact}} + +**Recommendations**: +1. {{Recommendation 1}} +2. {{Recommendation 2}} + +--- + +### Domain 2: API Security + +**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} + +**Strengths**: +- {{Positive finding 1}} +- {{Positive finding 2}} + +**Gaps**: +- {{Gap 1}} — {{Impact}} +- {{Gap 2}} — {{Impact}} + +**Recommendations**: +1. {{Recommendation 1}} +2.
{{Recommendation 2}} + +--- + +### Domain 3: Secrets Management + +**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} + +**Strengths**: +- {{Positive finding 1}} +- {{Positive finding 2}} + +**Gaps**: +- {{Gap 1}} β€” {{Impact}} +- {{Gap 2}} β€” {{Impact}} + +**Recommendations**: +1. {{Recommendation 1}} +2. {{Recommendation 2}} + +--- + +### Domain 4: Encryption + +**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} + +**Strengths**: +- {{Positive finding 1}} +- {{Positive finding 2}} + +**Gaps**: +- {{Gap 1}} β€” {{Impact}} +- {{Gap 2}} β€” {{Impact}} + +**Recommendations**: +1. {{Recommendation 1}} +2. {{Recommendation 2}} + +--- + +### Domain 5: PII & Data Protection + +**Status**: {{COMPLIANT | FINDINGS | CRITICAL}} + +**Strengths**: +- {{Positive finding 1}} +- {{Positive finding 2}} + +**Gaps**: +- {{Gap 1}} β€” {{Impact}} +- {{Gap 2}} β€” {{Impact}} + +**Recommendations**: +1. {{Recommendation 1}} +2. {{Recommendation 2}} + +--- + +## Compliance Assessment + +### GDPR (General Data Protection Regulation) + +**Applicable**: {{YES | NO | PARTIAL}} +**Status**: {{COMPLIANT | NON-COMPLIANT | CONDITIONAL}} + +| Requirement | Status | Finding | Gap Fix | +|-------------|--------|---------|---------| +| Data Encryption | {{βœ…/❌}} | {{Description}} | {{Remediation}} | +| Access Control | {{βœ…/❌}} | {{Description}} | {{Remediation}} | +| Retention Policy | {{βœ…/❌}} | {{Description}} | {{Remediation}} | +| Right to Deletion | {{βœ…/❌}} | {{Description}} | {{Remediation}} | +| Data Processing Agreement | {{βœ…/❌}} | {{Description}} | {{Remediation}} | + +**Timeline to Compliance**: {{DATE or "Already compliant"}} + +--- + +### PCI-DSS (Payment Card Industry Data Security Standard) + +**Applicable**: {{YES | NO | PARTIAL}} +**Status**: {{COMPLIANT | NON-COMPLIANT | CONDITIONAL}} + +| Requirement | Status | Finding | Gap Fix | +|-------------|--------|---------|---------| +| TLS 1.2+ | {{βœ…/❌}} | {{Description}} | {{Remediation}} | +| Secrets Management | {{βœ…/❌}} | 
{{Description}} | {{Remediation}} | +| Input Validation | {{βœ…/❌}} | {{Description}} | {{Remediation}} | + +**Timeline to Compliance**: {{DATE or "Already compliant"}} + +--- + +### SOC 2 Type II + +**Applicable**: {{YES | NO | PARTIAL}} +**Status**: {{COMPLIANT | NON-COMPLIANT | CONDITIONAL}} + +**Gap Summary**: {{Description of gaps or "No gaps identified"}} + +**Timeline**: {{When audit can be conducted}} + +--- + +### Other Regulations + +{{Any other applicable standards (HIPAA, FINRA, etc.)}} + +--- + +## Remediation Roadmap + +### Critical Path (Week 1-2) + +**All Critical findings must be fixed before production deployment.** + +- [ ] {{F1}} β€” Owner: {{Name}}, Deadline: {{DATE}} +- [ ] {{F2}} β€” Owner: {{Name}}, Deadline: {{DATE}} + +**Milestone**: Security re-scan on {{DATE}} to verify fixes + +--- + +### Phase 2 (Week 3-4) + +Complete High-severity findings: + +- [ ] {{F3}} β€” Owner: {{Name}}, Deadline: {{DATE}} +- [ ] {{F4}} β€” Owner: {{Name}}, Deadline: {{DATE}} + +**Milestone**: Second security review on {{DATE}} + +--- + +### Phase 3 (Weeks 5-8) + +Address Medium-severity findings (can be post-production with monitoring): + +- [ ] {{F5}} β€” Owner: {{Name}}, Target: {{DATE}} +- [ ] {{F6}} β€” Owner: {{Name}}, Target: {{DATE}} + +--- + +### Backlog (Next Sprint) + +Low-severity items: + +- [ ] {{F7}} β€” {{Brief description}} +- [ ] {{F8}} β€” {{Brief description}} + +--- + +## Remediation Status Tracking + +| Finding | Owner | Deadline | Status | Last Update | Notes | +|---------|-------|----------|--------|-------------|-------| +| F1 | {{Name}} | {{Date}} | πŸ”΄ Pending | {{Date}} | {{Notes}} | +| F2 | {{Name}} | {{Date}} | 🟑 In Progress | {{Date}} | {{Notes}} | +| F3 | {{Name}} | {{Date}} | 🟒 Complete | {{Date}} | {{Notes}} | + +--- + +## Post-Audit Monitoring + +### Controls to Monitor + +{{If service proceeds to production despite findings:}} + +- **{{Control 1}}** β€” Monitor via {{method}}, alert if {{threshold}} +- **{{Control 2}}** β€” 
Monitor via {{method}}, alert if {{threshold}} +- **{{Control 3}}** β€” Monitor via {{method}}, alert if {{threshold}} + +### Incident Response + +If a security incident occurs: +1. Activate incident response team +2. Notify {{Escalation contacts}} +3. Follow {{Incident response runbook}} +4. Conduct post-incident security review + +--- + +## Risk Assessment Matrix + +``` + LIKELIHOOD + Low Med High + CRITICAL H C C +IMPACT + HIGH M H C + MEDIUM L M H + LOW L L M + +Legend: C=Critical, H=High, M=Medium, L=Low +``` + +**Our findings map**: +- {{F1}} β€” {{Position on matrix}} +- {{F2}} β€” {{Position on matrix}} + +--- + +## Positive Findings + +**Strengths to maintain:** + +- {{Positive 1}} β€” Keep doing this +- {{Positive 2}} β€” Keep doing this +- {{Positive 3}} β€” Keep doing this + +--- + +## Recommendations Summary + +### Immediate (Critical) +- {{Fix all Critical findings}} ({{effort}}) + +### Short-term (High Priority) +- {{Fix all High findings}} ({{effort}}) +- {{Implement automated scanning}} ({{effort}}) +- {{Setup security monitoring}} ({{effort}}) + +### Medium-term +- {{Implement {{technology}} for {{purpose}}}} ({{effort}}) +- {{Security training for team}} ({{effort}}) + +### Long-term (Next 6 Months) +- {{Major security initiative}} ({{effort}}) +- {{Penetration testing}} ({{effort}}) + +--- + +## Sign-offs & Approvals + +### Audit Approval + +- [ ] **Security Lead** ({{AUDITOR_NAME}}) + - Signature: ________________________ Date: __________ + - Assessment complete and findings documented + +### Service Owner Acknowledgment + +- [ ] **Service Owner** ({{SERVICE_OWNER}}) + - Signature: ________________________ Date: __________ + - Acknowledged findings and committed to remediation + +### Compliance Officer Review + +- [ ] **Compliance Officer** ({{NAME}}) + - Signature: ________________________ Date: __________ + - Compliance requirements verified + +### Executive Approval (If Production Clearance Needed) + +- [ ] **VP Engineering / Security** 
({{NAME}}) + - Signature: ________________________ Date: __________ + - Risk accepted; approved for production + +--- + +## Distribution + +- [x] Shared with: {{Service team, Leadership, Compliance}} +- [x] Date shared: {{DATE}} +- [x] Follow-up review scheduled: {{DATE}} + +--- + +## Appendix: Testing Evidence + +### Code Review Findings + +``` +{{Code snippets demonstrating vulnerabilities}} +``` + +### Configuration Issues + +``` +{{Configuration examples showing gaps}} +``` + +### Dependencies Scan + +``` +{{Vulnerable dependencies identified}} +``` + +--- + +**Report Prepared By**: {{AUDITOR_NAME}} +**Report Date**: {{DATE}} +**Review Status**: Draft | Final | Approved diff --git a/src/psm/workflows/bmad-psm-security-audit/workflow.md b/src/psm/workflows/bmad-psm-security-audit/workflow.md new file mode 100644 index 000000000..8ad8a8c3b --- /dev/null +++ b/src/psm/workflows/bmad-psm-security-audit/workflow.md @@ -0,0 +1,91 @@ +--- +workflow_id: SA001 +workflow_name: Security Audit +description: Comprehensive security review using security patterns, config management, and compliance framework +entry_point: steps/step-01-scope.md +phase: 4-cross +lead_agent: "HΓ  (Security)" +status: "active" +created_date: 2026-03-17 +version: "1.0.0" +estimated_duration: "2-3 hours" +outputFile: '{output_folder}/psm-artifacts/security-audit-{{project_name}}-{{date}}.md' +--- + +# Workflow: Security Audit + +## Goal +Perform comprehensive security evaluation using Production Systems BMAD framework, covering threat modeling, vulnerability assessment, compliance, and security controls. + +## Overview + +Security audit is a critical cross-functional workflow that evaluates service security posture before production deployment or for ongoing compliance verification. The audit: + +1. **Scopes** the audit engagement, defines threat model, and identifies compliance requirements +2. 
**Executes** detailed security assessment across five domains (authentication & authorization, API security, secrets management, encryption, PII & data protection) +3. **Reports** findings with severity levels, remediation recommendations, and compliance status + +## Execution Path + +``` +START + ↓ +[Step 01] Scope & Threat Model (Define audit scope, identify threats, compliance reqs) + ↓ +[Step 02] Security Assessment (Execute checklist across domains, identify vulns) + ↓ +[Step 03] Security Report (Findings report, severity, recommendations, compliance) + ↓ +END +``` + +## Key Roles + +| Role | Agent | Responsibility | +|------|-------|-----------------| +| Lead | Hà (Security) | Lead audit, coordinate assessment, synthesize findings | +| Subject Matter | Service Owner + Platform Eng | Provide architecture, answer security questions | +| Compliance | Security/Compliance Team | Validate compliance mapping, sign-off | + +## Assessment Domains (5) + +1. **Authentication & Authorization** — Identity verification, access control, session management +2. **API Security** — Input validation, rate limiting, API key management, CORS +3. **Secrets Management** — Credential storage, rotation, access logging +4. **Encryption** — In-transit (TLS), at-rest, key management +5.
**PII & Data Protection** — Classification, access controls, audit logging, retention + +## Input Requirements + +- **Service architecture diagram** — Components, data flows, external integrations +- **Authentication/authorization approach** — OAuth2, JWT, SAML, custom +- **Secrets storage mechanism** — Vault, cloud provider, environment variables +- **Compliance requirements** — GDPR, CCPA, SOC2, industry-specific +- **Known security controls** — WAF, TLS config, authentication libraries + +## Output Deliverable + +- **Security Audit Report** (template: `security-audit-report.template.md`) + - Audit scope and threat model + - Findings organized by domain with severity (Critical/High/Medium/Low) + - Remediation recommendations with priority and effort + - Compliance status matrix + - Sign-off + +## Success Criteria + +1. All security domains assessed with clear findings +2. Severity levels assigned (using CVSS or similar framework) +3. Remediation plan with owners and deadlines +4. Compliance requirements verified (if applicable) +5. Team alignment on security posture + +## Next Steps After Workflow + +- If **COMPLIANT**: Document in security registry; schedule periodic re-audit +- If **NON-COMPLIANT**: Add remediation items to backlog; track closure +- If **CRITICAL ISSUES**: Consider production pause until resolved + +--- + +**Navigation**: [← Back to 4-cross](../), [Next: Step 01 →](steps/step-01-scope.md) diff --git a/src/psm/workflows/bmad-psm-setup-new-service/SKILL.md b/src/psm/workflows/bmad-psm-setup-new-service/SKILL.md new file mode 100644 index 000000000..6b43cff8a --- /dev/null +++ b/src/psm/workflows/bmad-psm-setup-new-service/SKILL.md @@ -0,0 +1,6 @@ +--- +name: bmad-psm-setup-new-service +description: 'Set up new production service from architecture through deployment. Use when the user says "new service" or "setup service" or "new microservice"' +--- + +Follow the instructions in [workflow.md](workflow.md).
diff --git a/src/psm/workflows/bmad-psm-setup-new-service/bmad-skill-manifest.yaml b/src/psm/workflows/bmad-psm-setup-new-service/bmad-skill-manifest.yaml new file mode 100644 index 000000000..d0f08abdb --- /dev/null +++ b/src/psm/workflows/bmad-psm-setup-new-service/bmad-skill-manifest.yaml @@ -0,0 +1 @@ +type: skill diff --git a/src/psm/workflows/bmad-psm-setup-new-service/workflow.md b/src/psm/workflows/bmad-psm-setup-new-service/workflow.md new file mode 100644 index 000000000..35b2d6a6a --- /dev/null +++ b/src/psm/workflows/bmad-psm-setup-new-service/workflow.md @@ -0,0 +1,116 @@ +--- +workflow_id: W-SETUP-SVC-001 +workflow_name: Setup Production Service for BMAD +version: 6.2.0 +lead_agent: "Architect Khang" +supporting_agents: ["SRE Minh", "Mary Analyst"] +phase: "1-Analysis → 2-Planning → 3-Solutioning → 4-Implementation" +created_date: 2026-03-17 +last_modified: 2026-03-17 +config_file: "_config/config.yaml" +estimated_duration: "12-20 hours" +outputFile: '{output_folder}/psm-artifacts/service-setup-{{project_name}}-{{date}}.md' +--- + +# Setup Production Service Workflow — BMAD Pattern + +## Metadata & Context + +**Goal**: Xây dựng production-grade service từ scratch, với đầy đủ architecture, API design, deployment pipeline, reliability patterns, security, và production readiness. + +**Lead Team**: +- SRE Minh (Reliability, Infrastructure, Operations) +- Architect Khang (System Design, Technology Selection) +- Mary Analyst (Requirements, Risk Assessment) + +**Success Criteria**: +- ✓ Architecture design document approved +- ✓ API contracts defined & validated +- ✓ Database schema designed & indexed +- ✓ CI/CD pipeline operational +- ✓ Resilience & observability in place +- ✓ Security & compliance verified +- ✓ Production readiness checklist passed + +## Workflow Overview + +Workflow này đi qua 6 bước atomic, mỗi bước focus vào một domain riêng: + +1.
**Step-01-Architecture** → Requirements + Architecture Pattern Selection +2. **Step-02-API-Database** → API Design + Database Selection + Schema +3. **Step-03-Build-Deploy** → CI/CD + Containerization + Testing Strategy +4. **Step-04-Reliability** → Resilience Patterns + Observability + Error Handling +5. **Step-05-Security-Infra** → Auth/Authz + Secrets + K8s Config +6. **Step-06-Readiness** → PRR Checklist + Runbook + Go/No-Go Decision + +## Configuration Loading + +Tự động load từ `_config/config.yaml`: + +```yaml +project_context: + user_name: "[loaded from config]" + organization: "[loaded from config]" + environment: "production" + +workflow_defaults: + communication_language: "Vietnamese" + output_folder: "./outputs/setup-new-service-{service_name}" + timestamp: "2026-03-17" +``` + +## Execution Model + +### Entry Point Logic + +``` +1. Check if workflow.md exists in outputs folder + → If NEW: Start from step-01-architecture.md + → If RESUME: Load progress.yaml → auto-skip completed steps + → If PARTIAL: Load step-N-context.yaml → resume from step N + +2. For each step: + a) Load step-{N}-{name}.md + b) Load referenced SKILL files (auto-parse "Load:" directives) + c) Execute MENU [A][C] options + d) Save step output to step-{N}-output.md + e) Move to next step + +3. Final: Generate comprehensive outputs in outputs folder +``` + +### State Tracking + +Output document frontmatter tracks progress: + +```yaml +workflow_progress: + step_01_architecture: "completed" + step_02_api_database: "completed" + step_03_build_deploy: "in_progress" + step_04_reliability: "pending" + step_05_security_infra: "pending" + step_06_readiness: "pending" + last_updated: "2026-03-17T14:30:00Z" + current_agent: "Architect Khang" +``` + +## Mandatory Workflow Rules + +1. **No skipping steps** — Mỗi step phải được execute theo order +2. **Validate assumptions** — Mỗi decision phải được document +3.
**Cross-phase collaboration** — Architects + SRE + Analysts work together +4. **Output artifacts** — Mỗi step produce tangible output documents +5. **Handoff protocol** — Context được transfer giữa steps rõ ràng + +## Navigation + +Hãy chọn cách bắt đầu: + +- **[NEW]** — Bắt đầu workflow mới → Load step-01 +- **[RESUME]** — Quay lại workflow đã từng chạy (detect progress) +- **[SKIP-TO]** — Nhảy tới step cụ thể (dev-only, requires confirmation) + +--- + +**Tiếp tục bằng cách chọn [NEW] hoặc [RESUME]** diff --git a/tools/cli/external-official-modules.yaml b/tools/cli/external-official-modules.yaml index 6a2fa259d..9f26fe8c4 100644 --- a/tools/cli/external-official-modules.yaml +++ b/tools/cli/external-official-modules.yaml @@ -42,6 +42,16 @@ modules: type: bmad-org npmPackage: bmad-method-test-architecture-enterprise + bmad-production-systems: + url: https://github.com/DoanNgocCuong/bmad-module-production-systems + module-definition: src/module.yaml + code: psm + name: "Production Systems & MLOps (BMad Community Module)" + description: "Production engineering with SRE, Security, and MLOps agents for incident response, PRR, and deployment" + defaultSelected: false + type: community + npmPackage: bmad-production-systems + whiteport-design-studio: url: https://github.com/bmad-code-org/bmad-method-wds-expansion module-definition: src/module.yaml