{
  "schema_version": "1.0.0",
  "bot_id": "6.11",
  "bot_name": "IncidentCommander",
  "slug": "incidentcommander",
  "layer": "Governance",
  "layer_key": "gov",
  "bot_class": "Governance Service",
  "authority": [
    "Explain"
  ],
  "status": "planned",
  "readiness": "Spec started",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Governance",
    "bot_class": "Governance Service",
    "authority": "Explain",
    "runs_before": "Nothing \u2014 IncidentCommander is triggered by guard, monitor, or operator alerts",
    "runs_after": "A guard, monitor, or operator declares an incident",
    "applies_to": "Any declared incident affecting the Polytraders bot fleet",
    "default_mode": "shadow_only",
    "user_visible": "no",
    "developer_owner": "Polytraders core"
  },
  "purpose": "IncidentCommander coordinates halts, flattens, and post-mortems when a guard, monitor, or operator declares an incident. It records the incident timeline, dispatches auto-actions by severity, pages on-call, and tracks RCA completion.",
  "why_it_matters": [
    {
      "failure": "No centralised incident coordinator",
      "consequence": "Multiple bots may take conflicting halt/flatten actions; incident timeline is incoherent."
    },
    {
      "failure": "RCA not completed within SLA",
      "consequence": "Repeat incidents occur because root cause is never addressed; compliance audit finds gaps."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "None \u2014 IncidentCommander is a pure internal governance orchestrator",
      "source": "internal",
      "required": false,
      "use": "N/A"
    }
  ],
  "internal_inputs": [
    {
      "input": "Incident declaration event from any guard, monitor, or operator",
      "source": "internal",
      "required": true,
      "use": "Trigger incident workflow; dispatch auto-actions by severity."
    },
    {
      "input": "KillSwitch active flag",
      "source": "gov.killswitch",
      "required": false,
      "use": "Check whether KillSwitch is already active before dispatching the halt_all auto-action."
    }
  ],
  "raw_params": [
    "auto_actions_by_severity \u00b7 map",
    "require_rca_within_h \u00b7 int",
    "page_on_severity \u00b7 enum",
    "publish_status_externally \u00b7 bool"
  ],
  "parameters": [
    {
      "name": "auto_actions_by_severity",
      "default": {
        "P0": [
          "halt_all",
          "page_oncall"
        ],
        "P1": [
          "page_oncall"
        ],
        "P2": [
          "notify_slack"
        ]
      },
      "warning": null,
      "hard": null,
      "controls": "Map of severity level to list of auto-actions to dispatch.",
      "why_default_matters": "P0 incidents require immediate halt; P1 requires paging; P2 requires notification.",
      "threshold_logic": [
        {
          "condition": "severity=P0",
          "action": "Dispatch halt_all and page_oncall immediately"
        }
      ],
      "dev_check": "actions = p.auto_actions_by_severity.get(incident.severity, [])",
      "user_facing": ""
    },
    {
      "name": "require_rca_within_h",
      "default": 24,
      "warning": 12,
      "hard": 48,
      "controls": "Hours after incident resolution within which an RCA document must be filed.",
      "why_default_matters": "24-hour RCA deadline ensures timely learning while context is fresh.",
      "threshold_logic": [
        {
          "condition": "RCA not filed within require_rca_within_h hours of incident resolution",
          "action": "Emit RCA_OVERDUE alert"
        }
      ],
      "dev_check": "if now() - incident.resolved_at > hours(p.require_rca_within_h): emit('RCA_OVERDUE')",
      "user_facing": ""
    }
  ],
  "default_config": {
    "bot_id": "gov.incidentcommander",
    "version": "0.1.0",
    "mode": "shadow_only",
    "defaults": {
      "auto_actions_by_severity": {
        "P0": [
          "halt_all",
          "page_oncall"
        ],
        "P1": [
          "page_oncall"
        ],
        "P2": [
          "notify_slack"
        ]
      },
      "require_rca_within_h": 24,
      "page_on_severity": "P1",
      "publish_status_externally": false
    }
  },
  "implementation_flow": [
    "On incident declaration, assign incident_id (ULID) and record severity, scope, and declaring bot.",
    "Dispatch auto-actions from auto_actions_by_severity map for the declared severity.",
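    "Page on-call if the incident is at least as severe as the page_on_severity threshold (P0 is the most severe level, so with the default of P1 both P0 and P1 incidents page).",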
    "Record incident timeline events: declaration, auto-actions taken, acknowledgement, resolution.",
    "After resolution, start require_rca_within_h countdown; emit RCA_OVERDUE if RCA is not filed in time.",
    "Emit an OperationsReport with event_type INCIDENT_DECLARED, INCIDENT_RESOLVED, or RCA_FILED on the corresponding lifecycle event."
  ],
  "decision_logic": {
    "approve": "Not applicable \u2014 IncidentCommander does not approve trading orders.",
    "reshape_required": "Not applicable.",
    "reject": "Not applicable as a trading decision.",
    "warning_only": "Emits an RCA_OVERDUE warning if the RCA is not filed within require_rca_within_h hours of incident resolution."
  },
  "decision_output_schema": "OperationsReport",
  "decision_output_example": {
    "report_id": "ops_incidentcommander_01HX9Z",
    "bot_id": "gov.incidentcommander",
    "event_type": "INCIDENT_DECLARED",
    "incident_id": "inc_01HX9Z",
    "severity": "P1",
    "scope": [
      "risk.liquidityguard",
      "exec.smartrouter"
    ],
    "auto_actions_dispatched": [
      "page_oncall"
    ],
    "declared_at": "2026-05-09T10:00:00Z",
    "report_kind": "OperationsReport",
    "topic": "polytraders.reports.operations"
  },
  "developer_log": {
    "bot_id": "gov.incidentcommander",
    "event_type": "AUTO_ACTION_DISPATCHED",
    "incident_id": "inc_01HX9Z",
    "action": "page_oncall",
    "dispatched_at_ms": 1746792060000
  },
  "user_explanations": [
    {
      "situation": "Incident declared",
      "message": "A system incident has been declared. Automated responses have been triggered based on severity."
    },
    {
      "situation": "RCA overdue",
      "message": "The root cause analysis for a recent incident has not been filed within the required timeframe."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "On-call paging system is unavailable; P0/P1 incidents do not generate pages.",
    "false_positive_risk": "A transient alert triggers a P0 incident and halt-all auto-action unnecessarily.",
    "false_negative_risk": "An incident is declared at P2 when it should be P0; critical auto-actions are not dispatched.",
    "safe_fallback": "If paging system is unavailable, log PAGING_SYSTEM_UNAVAILABLE and attempt fallback notification via Slack.",
    "required_dependencies": [
      "On-call paging system",
      "Internal audit log store",
      "gov.killswitch"
    ]
  },
  "acceptance_tests": {
    "unit": [
      {
        "test": "P0 incident triggers halt_all and page_oncall",
        "setup": "severity=P0",
        "expected": "halt_all and page_oncall dispatched; INCIDENT_DECLARED OperationsReport emitted"
      },
      {
        "test": "RCA_OVERDUE emitted when RCA not filed in time",
        "setup": "incident resolved 25h ago, require_rca_within_h=24",
        "expected": "RCA_OVERDUE emitted"
      }
    ],
    "integration": [
      {
        "test": "Full incident lifecycle: declaration \u2192 auto-actions \u2192 resolution \u2192 RCA filed",
        "expected": "4 OperationsReport records with correct event_types"
      }
    ],
    "property": [
      {
        "property": "Every P0 incident triggers halt_all auto-action within 5 seconds",
        "required": "Always true \u2014 auto-actions are dispatched synchronously on declaration"
      }
    ]
  },
  "checklist_overrides": {},
  "legacy_goal": "Coordinate halts, flattens, and post-mortems when a guard, monitor, or operator declares an incident.",
  "legacy_pm_signals": [
    "Active incidents with severity, owner, and affected scope",
    "Auto-actions taken (pause / quarantine / flatten)",
    "Post-incident artefacts: timeline, RCA, follow-up tickets"
  ],
  "legacy_external_feeds": [
    "On-call paging system"
  ],
  "reporting_groups": [
    "governance_audit"
  ],
  "network": [
    "polygon"
  ],
  "api_surface": [
    "internal"
  ],
  "version": {
    "spec": "2.0.0",
    "implementation": "0.1.0",
    "schema": "2",
    "released": null,
    "planned_release": "Q3-2026"
  },
  "migration_history": [
    {
      "date": "2026-04-28",
      "from": "n/a",
      "to": "v2-spec",
      "reason": "Spec drafted post-CLOB-V2 cutover; bot not yet implemented",
      "action_taken": "Designed against V2 schema (pUSD, builder codes, V2 EIP-712 domain)"
    }
  ],
  "polymarket_v2_compat": {
    "clob_version": "v2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": false,
    "negrisk_aware": false,
    "multichain_ready": false,
    "sdk_used": "py-clob-client-v2",
    "settlement_contract": "CTFExchangeV2",
    "notes": "IncidentCommander is a pure governance orchestration service; interacts with no CLOB surfaces directly."
  },
  "reference_implementation": {
    "pseudocode": "// ---- INCIDENT DECLARATION ----\nFUNCTION declareIncident(declaration):\n  incident = {\n    id: generateULID(), severity: declaration.severity,\n    scope: declaration.scope, declared_by: declaration.declaring_bot,\n    declared_at: now(), status: 'active', timeline: []\n  }\n  postgres.insert('incidents', incident)\n  actions = config.auto_actions_by_severity.get(incident.severity, [])\n  FOR action IN actions:\n    dispatch(action, incident)\n    incident.timeline.append({action: action, at: now()})\n  EMIT OperationsReport(event_type='INCIDENT_DECLARED', incident_id=incident.id,\n    severity=incident.severity, auto_actions_dispatched=actions)\n\n// ---- RESOLUTION ----\nFUNCTION resolveIncident(incidentId, resolvedBy):\n  incident = postgres.get('incidents', incidentId)\n  incident.status = 'resolved'\n  incident.resolved_at = now()\n  incident.resolved_by = resolvedBy\n  postgres.upsert('incidents', incident)\n  scheduleRcaDeadline(incidentId, config.require_rca_within_h)\n  EMIT OperationsReport(event_type='INCIDENT_RESOLVED', incident_id=incidentId)\n\n// ---- RCA CHECK ----\nFUNCTION checkRcaDeadline(incidentId):\n  incident = postgres.get('incidents', incidentId)\n  IF incident.rca_filed IS NULL:\n    EMIT OperationsReport(event_type='RCA_OVERDUE', incident_id=incidentId)\n    alerting.emit('RCA_OVERDUE', {incident_id: incidentId})",
    "sdk_calls": [
      "postgres.insert('incidents', incident)",
      "postgres.upsert('incidents', incident)",
      "alerting.emit('RCA_OVERDUE', metadata)"
    ],
    "complexity": "O(1) per incident event; O(A) per declaration where A = auto-action count"
  },
  "wire_examples": {
    "input": {
      "label": "Incident declaration",
      "source": "risk.liquidityguard",
      "payload": {
        "declaring_bot": "risk.liquidityguard",
        "severity": "P1",
        "scope": [
          "exec.smartrouter"
        ],
        "declared_at": "2026-05-09T10:00:00Z"
      }
    },
    "output": {
      "label": "OperationsReport \u2014 INCIDENT_DECLARED",
      "payload": {
        "report_id": "ops_incidentcommander_01HX9Z",
        "event_type": "INCIDENT_DECLARED",
        "incident_id": "inc_01HX9Z",
        "severity": "P1",
        "report_kind": "OperationsReport",
        "topic": "polytraders.reports.operations"
      }
    }
  },
  "reason_codes": [
    {
      "code": "INCIDENT_DECLARED",
      "severity": "INFO",
      "meaning": "An incident was declared and auto-actions dispatched.",
      "action": "Log; emit OperationsReport.",
      "user_message": ""
    },
    {
      "code": "INCIDENT_RESOLVED",
      "severity": "INFO",
      "meaning": "An incident was resolved.",
      "action": "Log; start RCA countdown.",
      "user_message": ""
    },
    {
      "code": "RCA_OVERDUE",
      "severity": "WARN",
      "meaning": "RCA not filed within require_rca_within_h.",
      "action": "Emit WARN alert.",
      "user_message": ""
    },
    {
      "code": "PAGING_SYSTEM_UNAVAILABLE",
      "severity": "WARN",
      "meaning": "On-call paging system is unreachable.",
      "action": "Fallback to Slack; emit WARN.",
      "user_message": ""
    },
    {
      "code": "KILL_SWITCH_ACTIVE",
      "severity": "WARN",
      "meaning": "KillSwitch already active when halt_all dispatched.",
      "action": "Log; no duplicate halt needed.",
      "user_message": ""
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "polytraders_gov_incidentcommander_incidents_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "severity",
          "status"
        ],
        "meaning": "Total incidents by severity and status."
      },
      {
        "name": "polytraders_gov_incidentcommander_rca_overdue_total",
        "type": "counter",
        "unit": "count",
        "labels": [],
        "meaning": "Total RCA overdue events."
      },
      {
        "name": "polytraders_gov_incidentcommander_auto_actions_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "action"
        ],
        "meaning": "Total auto-actions dispatched by type."
      },
      {
        "name": "polytraders_gov_incidentcommander_active_incidents",
        "type": "gauge",
        "unit": "count",
        "labels": [
          "severity"
        ],
        "meaning": "Currently active incidents by severity."
      }
    ],
    "alerts": [
      {
        "name": "IncidentCommanderRcaOverdue",
        "condition": "rate(polytraders_gov_incidentcommander_rca_overdue_total[1h]) > 0",
        "severity": "P2",
        "runbook": "#runbook-incidentcommander-rca"
      },
      {
        "name": "IncidentCommanderActiveP0",
        "condition": "polytraders_gov_incidentcommander_active_incidents{severity='P0'} > 0",
        "severity": "P0",
        "runbook": "#runbook-incidentcommander-p0"
      },
      {
        "name": "IncidentCommanderPagingUnavailable",
        "condition": "absent(polytraders_gov_incidentcommander_auto_actions_total{action='page_oncall'})",
        "severity": "P1",
        "runbook": "#runbook-incidentcommander-paging"
      }
    ]
  },
  "state": {
    "store": "postgres",
    "shape": "incidents table: {id, severity, scope, declared_by, declared_at, status, resolved_at, rca_filed, timeline[]}",
    "ttl": "indefinite (incidents are never purged)",
    "recovery": "On restart, reload active incidents from Postgres; re-schedule RCA deadlines.",
    "size_estimate": "~5 KB per incident with full timeline; expected < 1000 incidents per year"
  },
  "concurrency": {
    "execution_model": "event-driven; one goroutine per active incident",
    "max_in_flight": 10,
    "idempotency_key": "incident_id",
    "timeout_ms": 5000,
    "backpressure": "queue",
    "locking": "Postgres row-level lock per incident_id"
  },
  "dependencies": {
    "depends_on": [
      {
        "bot_id": "gov.killswitch",
        "why": "IncidentCommander checks KillSwitch state before dispatching halt_all.",
        "contract": "KillSwitch state is queryable in < 100ms."
      }
    ],
    "emits_to": [
      {
        "bot_id": "internal.governance_audit",
        "what": "OperationsReport on every incident lifecycle event"
      }
    ],
    "sibling": [
      {
        "bot_id": "gov.parameterchangeauditor",
        "why": "ParameterChangeAuditor provides recent config changes to support RCA.",
        "contract": "Changes queryable by audited_bot and changed_at."
      }
    ],
    "external": [
      {
        "service": "On-call paging system",
        "endpoint": "https://paging.internal",
        "sla": "99.9%",
        "failure_mode": "Fallback to Slack notification; emit PAGING_SYSTEM_UNAVAILABLE."
      }
    ]
  },
  "security_surfaces": {
    "signs_orders": false,
    "private_key_access": "none",
    "abuse_vectors": [
      "Declaring a spurious P0 incident to trigger halt_all and disrupt trading"
    ],
    "mitigations": [
      "Incident declarations require an authenticated internal bot or operator identity",
      "Incident timeline is immutably logged; false declarations are auditable"
    ]
  },
  "failure_injection": [
    {
      "scenario": "PAGING_SYSTEM_DOWN",
      "how_to_inject": "Block TCP to paging.internal during P1 incident declaration",
      "expected_behaviour": "PAGING_SYSTEM_UNAVAILABLE emitted; fallback Slack notification sent",
      "recovery": "Automatic when paging system recovers."
    },
    {
      "scenario": "RCA_DEADLINE_EXCEEDED",
      "how_to_inject": "Resolve incident; do not file RCA; wait 25h",
      "expected_behaviour": "RCA_OVERDUE emitted; alert fired",
      "recovery": "File RCA; mark incident RCA-complete."
    },
    {
      "scenario": "SPURIOUS_P0_DECLARATION",
      "how_to_inject": "Send P0 incident declaration from a test bot",
      "expected_behaviour": "halt_all dispatched; INCIDENT_DECLARED emitted; on-call paged",
      "recovery": "Cancel halt via gov.killswitch; resolve incident."
    }
  ],
  "runbook": {
    "summary": "IncidentCommander incidents require immediate triage. P0 auto-actions halt trading; confirm the incident is genuine before clearing.",
    "oncall_actions": [
      {
        "alert": "IncidentCommanderActiveP0",
        "first_action": "Verify the declaring bot and incident scope; confirm halt is warranted.",
        "escalate_to": "Engineering lead immediately"
      },
      {
        "alert": "IncidentCommanderRcaOverdue",
        "first_action": "Identify incident owner; request RCA filing immediately.",
        "escalate_to": "Governance pod lead"
      }
    ],
    "manual_overrides": [
      {
        "name": "resolve-incident",
        "how": "polytraders gov incident resolve --id <id> --resolved-by <operator>",
        "when": "Incident is confirmed resolved and trading can resume."
      }
    ],
    "healthcheck": "/internal/health/incidentcommander \u2192 green if there are no active P0 incidents, the paging system is reachable, and no RCA is overdue; red if there is an active P0 incident or the paging system is unreachable"
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "P0 auto-action dispatch unit test passes",
        "how_measured": "CI",
        "threshold": "Pass"
      }
    ],
    "to_limited_live": [
      {
        "gate": "End-to-end incident lifecycle test completes in staging",
        "how_measured": "Integration test",
        "threshold": "Pass"
      }
    ],
    "to_general_live": [
      {
        "gate": "One production incident handled with full timeline and RCA filed",
        "how_measured": "Governance review",
        "threshold": "Pass"
      }
    ]
  },
  "reporting": {
    "emits_kinds": [
      "OperationsReport"
    ],
    "topics": [
      "polytraders.reports.operations"
    ],
    "cadence": "every-period",
    "retention_class": "1y",
    "sampling_rule": "batched-1/min",
    "bus_failure_action": "drop-after-buffer",
    "user_visible": "no",
    "consumes_kinds": []
  },
  "capital_impact": "Critical",
  "v3_status": {
    "phase": 7,
    "phase_name": "Governance & replay",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}