3-shake · nwiizo · Feb 3, 2026 · Feb 3, 2026 · Feb 3, 2026 · Feb 3, 2026
@@ -1,5 +1,5 @@
 system:
-  debug: 
+  debug:
     log_level: debug
 
 ai:
@@ -11,7 +11,22 @@ ai:
     project: "<YOUR_PROJECT_ID>"
     location: "us-central1"
     model: "gemini-2.0-flash-001"
-
+
+  anthropic:  # Anthropic provider settings; "anthropic" is also listed under fallback.providers
+    model: "claude-sonnet-4-20250514"  # Model ID used for the Anthropic provider
+
+  # Fallback configuration: on failure, automatically switches to the next provider listed in `providers`
+  fallback:
+    enabled: false  # Set to true to enable automatic failover
+    providers:      # Tried in list order: the first entry is the primary, the rest are fallbacks
+      - openai
+      - anthropic
+      # - vertexai  # Uncomment to use VertexAI as a fallback
+    retry:
+      max_retries: 2     # Retries per provider before moving on to the next one
+      delay_ms: 1000     # Delay between retries, in milliseconds
+      timeout_ms: 30000  # Request timeout in milliseconds (not yet implemented, so currently unused)
+
   commands:
     - describe:
         description: "Generate a detailed description of the Issue."
@@ -74,3 +89,323 @@ ai:
           ---
           The following is the GitHub Issue and comments on it. Please analyze:
         require_intent: false
+    - postmortem:  # NOTE(review): presumably invoked as "/postmortem", like /describe - confirm
+        description: "Generate a postmortem document from the incident timeline."
+        system_prompt: |  # Literal block: every line below, including "#"/"---" lines, is prompt text, not YAML
+          You are an SRE expert specializing in incident management and postmortem documentation.
+          Generate a comprehensive postmortem document based on the incident Issue and its comments.
+
+          Analyze the conversation timeline to extract:
+          1. Key events and their timestamps
+          2. People involved and their actions
+          3. Root cause (if identified)
+          4. Resolution steps taken
+          5. Lessons learned
+
+          Output in the following structured Markdown format:
+
+          ## Postmortem: [Incident Title]
+
+          ### Incident Summary
+          | Field | Value |
+          |-------|-------|
+          | **Incident ID** | Issue #[number] |
+          | **Date** | [Extract from issue/comments] |
+          | **Duration** | [Estimate from timeline] |
+          | **Severity** | [Estimate: Sev1/Sev2/Sev3/Sev4] |
+          | **Impact** | [Describe user/system impact] |
+
+          ### Timeline
+          | Time | Event | Actor |
+          |------|-------|-------|
+          | [timestamp] | [event description] | [user/system] |
+          | ... | ... | ... |
+
+          ### Root Cause
+          **Direct Cause**: [What directly caused the incident]
+
+          **Contributing Factors**:
+          - [Factor 1]
+          - [Factor 2]
+
+          ### Response & Resolution
+          1. [Step 1: What was done]
+          2. [Step 2: What was done]
+          3. [Resolution: How was it fixed]
+
+          ### What Went Well
+          - [Positive aspect 1]
+          - [Positive aspect 2]
+
+          ### What Could Be Improved
+          - [Improvement area 1]
+          - [Improvement area 2]
+
+          ### Action Items
+          | Priority | Action | Owner | Due Date |
+          |----------|--------|-------|----------|
+          | P1 | [Immediate action] | TBD | [date] |
+          | P2 | [Short-term fix] | TBD | [date] |
+          | P3 | [Long-term improvement] | TBD | [date] |
+
+          ### Lessons Learned
+          - [Key takeaway 1]
+          - [Key takeaway 2]
+
+          ---
+          *This postmortem was auto-generated by alert-menta. Please review and update with additional details.*
+
+          ---
+          The following is the GitHub Issue and comments representing the incident timeline:
+        require_intent: false  # NOTE(review): presumably "no extra user-supplied intent text needed" - confirm
+    - runbook:  # NOTE(review): presumably invoked as "/runbook", like /describe - confirm
+        description: "Generate a runbook with step-by-step response procedures for the incident."
+        system_prompt: |  # Literal block: every line below, including "#" lines and ``` fences, is prompt text
+          You are an SRE expert who creates operational runbooks. Based on the incident information,
+          generate a step-by-step runbook that guides the responder through the investigation and resolution process.
+
+          Your runbook should be:
+          1. Actionable - each step should be specific and executable
+          2. Progressive - start with diagnosis, then containment, then resolution
+          3. Safe - include validation steps and rollback procedures
+          4. Complete - cover the full incident lifecycle
+
+          Output in the following structured Markdown format:
+
+          ## Runbook: [Incident Type]
+
+          ### Overview
+          - **Incident Type**: [e.g., High CPU, Memory Leak, API 5xx]
+          - **Severity**: [Sev1/Sev2/Sev3/Sev4]
+          - **Estimated Time**: [e.g., 15-30 minutes]
+          - **Required Access**: [e.g., kubectl, AWS Console, Database]
+
+          ### Prerequisites
+          - [ ] Access to [required systems]
+          - [ ] Required tools installed: [list tools]
+          - [ ] Communication channel established
+
+          ### Step 1: Initial Assessment
+          **Goal**: Understand the scope and impact
+
+          ```bash
+          # Commands to run (if applicable)
+          [diagnostic commands]
+          ```
+
+          **Expected Output**: [What to look for]
+          **Decision Point**: If [condition], proceed to Step 2. If [other condition], escalate.
+
+          ### Step 2: Containment
+          **Goal**: Limit the blast radius
+
+          - [ ] [Action 1]
+          - [ ] [Action 2]
+
+          **Validation**: [How to verify containment worked]
+
+          ### Step 3: Investigation
+          **Goal**: Identify root cause
+
+          | Check | Command/Action | What to Look For |
+          |-------|----------------|------------------|
+          | [Check 1] | [command] | [expected result] |
+          | [Check 2] | [command] | [expected result] |
+
+          ### Step 4: Resolution
+          **Goal**: Fix the issue
+
+          **Option A** (Recommended): [Primary fix]
+          ```bash
+          [fix commands]
+          ```
+
+          **Option B** (Fallback): [Alternative fix]
+
+          **Validation**: [How to verify the fix worked]
+
+          ### Step 5: Verification
+          **Goal**: Confirm system is healthy
+
+          - [ ] [Health check 1]
+          - [ ] [Health check 2]
+          - [ ] Monitor for [duration] to ensure stability
+
+          ### Escalation Criteria
+          Escalate to [team/person] if:
+          - [ ] Issue not resolved within [time]
+          - [ ] [Severity condition]
+          - [ ] [Impact condition]
+
+          ### Rollback Procedure
+          If the fix causes issues:
+          1. [Rollback step 1]
+          2. [Rollback step 2]
+
+          ### Post-Incident
+          - [ ] Update incident timeline
+          - [ ] Notify stakeholders
+          - [ ] Schedule postmortem if needed
+
+          ---
+          *This runbook was auto-generated by alert-menta. Adapt the steps based on your specific environment.*
+
+          ---
+          The following is the GitHub Issue describing the incident. Generate an appropriate runbook:
+        require_intent: false  # NOTE(review): presumably "no extra user-supplied intent text needed" - confirm
+    - timeline:  # NOTE(review): presumably invoked as "/timeline", like /describe - confirm
+        description: "Generate a chronological timeline of incident response activities."
+        system_prompt: |  # Literal block: every line below, including "#" lines and emoji, is prompt text
+          You are an SRE expert who creates incident timelines. Analyze the Issue and its comments to create a chronological timeline of all events and activities.
+
+          Extract the following from the conversation:
+          1. When was the incident first reported/detected
+          2. Who was involved and when they joined
+          3. What commands were executed (e.g., /describe, /analysis)
+          4. Key findings and decisions
+          5. Actions taken and their results
+          6. When the incident was resolved (if applicable)
+
+          Use emojis to categorize events:
+          - 🚨 Alert/Incident creation
+          - 👀 Investigation started
+          - 🔍 Analysis/Investigation
+          - 💡 Root cause identified
+          - 🔧 Fix/mitigation applied
+          - ✅ Resolved/Verified
+          - 📝 Documentation/Postmortem
+          - ⚠️ Escalation
+          - 💬 Communication/Update
+
+          Output in the following structured Markdown format:
+
+          ## Incident Timeline
+
+          ### Summary
+          - **Incident**: [Brief description]
+          - **Status**: [Ongoing/Resolved]
+          - **Duration**: [Time from start to resolution or current time]
+
+          ### Timeline
+
+          | Time | Event | Actor | Details |
+          |------|-------|-------|---------|
+          | [HH:MM] | 🚨 Incident Created | @[user]/bot | [Brief description] |
+          | [HH:MM] | 👀 Investigation Started | @[user] | [What they did] |
+          | ... | ... | ... | ... |
+
+          ### Key Metrics
+          - **Time to Acknowledge (TTA)**: [Time from creation to first human response]
+          - **Time to Resolve (TTR)**: [Time from creation to resolution, if resolved]
+          - **Number of Responders**: [Count]
+
+          ### Current Status
+          [Summary of current situation and next steps if incident is ongoing]
+
+          ---
+          *Timeline auto-generated by alert-menta from Issue comments.*
+
+          ---
+          The following is the GitHub Issue and comments. Generate the incident timeline:
+        require_intent: false  # NOTE(review): presumably "no extra user-supplied intent text needed" - confirm
+    # Example command with structured output (#64): the model is asked for a JSON object matching "schema" below
+    - triage:  # Command entry under ai.commands; a separate node from the top-level "triage:" section
+        description: "Automatically triage the incident with structured JSON output."
+        system_prompt: |  # Literal block: the two lines below are prompt text
+          You are an SRE expert. Analyze the incident and provide a structured triage assessment.
+          Return ONLY a valid JSON object with the specified schema.
+        require_intent: false  # NOTE(review): presumably "no extra user-supplied intent text needed" - confirm
+        structured_output:
+          enabled: true
+          schema_name: "incident_triage"
+          schema:  # JSON-Schema-style description of the expected object
+            type: object
+            properties:
+              severity:
+                type: string
+                enum: ["critical", "high", "medium", "low"]
+                description: "Incident severity level"
+              category:
+                type: string
+                enum: ["infrastructure", "application", "database", "network", "security", "other"]
+                description: "Incident category"
+              summary:
+                type: string
+                description: "Brief summary of the incident"
+              affected_services:
+                type: array
+                items:
+                  type: string
+                description: "List of affected services"
+              recommended_actions:
+                type: array
+                items:
+                  type: string
+                description: "Recommended immediate actions"
+              estimated_impact:
+                type: string
+                description: "Estimated user/business impact"
+            required:  # Only these three properties are mandatory; the other three may be omitted
+              - severity
+              - category
+              - summary
+          fallback_to_text: true  # NOTE(review): presumably falls back to plain-text output if structured output fails - confirm
+
+# First Response Guide settings (#62)
+# Automatically posts an incident response guide when issues with specific labels are created
+first_response:
+  enabled: false  # Set to true to enable automatic first response guides
+  trigger_labels:  # Issue labels that trigger the guide
+    - incident
+    - alert
+    - outage
+  # Optional Slack channel for notifications. Quoted because a bare "#" would start a YAML comment.
+  # NOTE(review): notifications.slack.channel also names a channel - confirm which one applies here.
+  slack_channel: "#incidents"
+  guides:  # One entry per severity level (high / medium / low)
+    - severity: high
+      auto_notify:  # Handles to notify for this severity; [] means nobody
+        - "@sre-team"  # Quoted because "@" cannot start a plain YAML scalar
+      # template: ".alert-menta/guides/high-severity.md"  # Optional: custom template
+    - severity: medium
+      auto_notify: []
+    - severity: low
+      auto_notify: []
+  escalation:
+    timeout_minutes: 15  # Escalation timeout in minutes (for high severity)
+    notify_target: "@oncall"  # NOTE(review): presumably the handle notified once the timeout elapses - confirm
+
+# Auto-Triage settings (#63)
+# Automatically labels new issues and posts the triage result as a comment
+triage:  # Top-level auto-triage settings; a separate node from the "triage" entry under ai.commands
+  enabled: false  # Set to true to enable auto-triage
+  auto_label: true  # Automatically add labels based on the triage result
+  auto_comment: true  # Post the triage result as a comment
+  confidence_threshold: 0.7  # Minimum confidence required to apply labels (0.0-1.0)
+  labels:  # Candidate labels in two groups; each "criteria" is free text describing when the label fits
+    priority:
+      - name: "priority:critical"
+        criteria: "Production service outage, data loss risk"
+      - name: "priority:high"
+        criteria: "User impact, requires urgent attention"
+      - name: "priority:medium"
+        criteria: "Feature degradation but workaround exists"
+      - name: "priority:low"
+        criteria: "Improvement request, minor issue"
+    category:
+      - name: "type:bug"
+        criteria: "Bug report for existing functionality"
+      - name: "type:feature"
+        criteria: "New feature request or enhancement"
+      - name: "type:docs"
+        criteria: "Documentation update or fix"
+      - name: "type:incident"
+        criteria: "Incident report, alert, or outage"
+
+# Slack notification settings
+notifications:
+  slack:
+    enabled: false  # Set to true to enable Slack notifications
+    webhook_url: ""  # Slack Incoming Webhook URL (or use -slack-webhook-url flag); a secret - do not commit a real one
+    channel: ""  # Optional: overrides the webhook's default channel (e.g., "#incidents")
+    notify_on:  # Events that produce a Slack notification
+      - command_response  # Notify when the AI responds to a command
+      # - incident_created  # Notify when a new incident is created (future feature)