{
  "schema_version": "1.0.0",
  "bot_id": "6.15",
  "bot_name": "SLAMonitor",
  "slug": "slamonitor",
  "layer": "Governance",
  "layer_key": "gov",
  "bot_class": "Governance Service",
  "authority": [
    "Explain"
  ],
  "status": "planned",
  "readiness": "Spec started",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Governance",
    "bot_class": "Governance Service",
    "authority": "Explain",
    "runs_before": "Nothing \u2014 SLAMonitor is a passive observer; runs continuously on metrics",
    "runs_after": "Metrics are emitted by all bots in the fleet",
    "applies_to": "All service-level objectives defined for the Polytraders fleet",
    "default_mode": "shadow_only",
    "user_visible": "summary-only",
    "developer_owner": "Polytraders core"
  },
  "purpose": "SLAMonitor tracks service-level objectives committed to internally and to users, measures error-budget burn rate, and emits alerts when burn rate approaches the SLO budget limit. Retained 7 years as a compliance-grade availability record.",
  "why_it_matters": [
    {
      "failure": "No SLO tracking",
      "consequence": "Availability and latency regressions go undetected until users complain; SLA breach evidence is unavailable for compliance."
    },
    {
      "failure": "Error budget burn not tracked",
      "consequence": "The team consumes the entire error budget without realising it; no time left for planned maintenance."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "None \u2014 SLAMonitor consumes only internal metrics",
      "source": "internal",
      "required": false,
      "use": "N/A"
    }
  ],
  "internal_inputs": [
    {
      "input": "Prometheus/OpenMetrics scrape from all fleet bots",
      "source": "internal.metrics_store",
      "required": true,
      "use": "Compute SLO compliance and error-budget burn rate."
    },
    {
      "input": "ExecutionReport stream",
      "source": "internal.report_bus",
      "required": true,
      "use": "Track fill-quality SLOs (latency, fill rate) per strategy."
    }
  ],
  "raw_params": [
    "slo_definitions \u00b7 map",
    "burn_rate_alert_pct \u00b7 0\u2013100",
    "publish_to_user \u00b7 bool",
    "auto_freeze_on_breach \u00b7 bool"
  ],
  "parameters": [
    {
      "name": "slo_definitions",
      "default": {
        "fill_latency_ms_p99": 500,
        "fill_success_rate_pct": 99.5,
        "uptime_pct": 99.9
      },
      "warning": null,
      "hard": null,
      "controls": "Map of SLO name to target value.",
      "why_default_matters": "Default SLOs reflect the commitments in the Polytraders service agreement.",
      "threshold_logic": [
        {
          "condition": "metric_value violates slo_target",
          "action": "Increment error budget consumption; emit SLO_BREACH_DETECTED if budget exhausted"
        }
      ],
      "dev_check": "if metric_value > slo.target: budgetConsumer.record(slo.name)",
      "user_facing": "The system maintains targets for response speed and availability."
    },
    {
      "name": "burn_rate_alert_pct",
      "default": 5.0,
      "warning": 10,
      "hard": 20,
      "controls": "Alert when hourly error-budget burn rate exceeds this percentage of the monthly budget.",
      "why_default_matters": "5% hourly burn means the monthly budget would be exhausted in 20 hours.",
      "threshold_logic": [
        {
          "condition": "hourly_burn_rate > burn_rate_alert_pct",
          "action": "Emit SLO_BURN_RATE_EXCEEDED alert"
        }
      ],
      "dev_check": "if hourly_burn > p.burn_rate_alert_pct: emit('SLO_BURN_RATE_EXCEEDED')",
      "user_facing": "You'll be notified if service quality degrades significantly."
    }
  ],
  "default_config": {
    "bot_id": "gov.slamonitor",
    "version": "0.1.0",
    "mode": "shadow_only",
    "defaults": {
      "slo_definitions": {
        "fill_latency_ms_p99": 500,
        "fill_success_rate_pct": 99.5,
        "uptime_pct": 99.9
      },
      "burn_rate_alert_pct": 5.0,
      "publish_to_user": true,
      "auto_freeze_on_breach": false
    }
  },
  "implementation_flow": [
    "Scrape Prometheus metrics from all fleet bots every 60 seconds.",
    "For each SLO definition, compute current compliance and error-budget consumption.",
    "Compute hourly burn rate as (errors_in_last_hour / monthly_budget * 100).",
    "If burn_rate > burn_rate_alert_pct, emit SLO_BURN_RATE_EXCEEDED alert.",
    "If error budget is exhausted, emit SLO_BREACH_DETECTED and optionally freeze deployments.",
    "Emit SettlementReport(event_type=SLO_STATUS) every hour with all SLO compliance metrics.",
    "Retain SettlementReport records for 7 years as compliance-grade availability evidence."
  ],
  "decision_logic": {
    "approve": "Not applicable \u2014 SLAMonitor does not approve trading orders.",
    "reshape_required": "Not applicable.",
    "reject": "If auto_freeze_on_breach=true, freezes new deployments on SLO breach.",
    "warning_only": "Emits SLO_BURN_RATE_EXCEEDED when burn rate threshold is crossed."
  },
  "decision_output_schema": "SettlementReport",
  "decision_output_example": {
    "report_id": "stl_slamonitor_01HX9Z",
    "bot_id": "gov.slamonitor",
    "event_type": "SLO_STATUS",
    "window_start": "2026-05-09T09:00:00Z",
    "window_end": "2026-05-09T10:00:00Z",
    "slo_compliance": {
      "fill_latency_ms_p99": {
        "target": 500,
        "actual": 312,
        "compliant": true
      },
      "fill_success_rate_pct": {
        "target": 99.5,
        "actual": 99.8,
        "compliant": true
      },
      "uptime_pct": {
        "target": 99.9,
        "actual": 100.0,
        "compliant": true
      }
    },
    "error_budget_consumed_pct": 1.2,
    "hourly_burn_rate_pct": 0.8,
    "report_kind": "SettlementReport",
    "topic": "polytraders.reports.settlement",
    "retained_until": "2033-05-09"
  },
  "developer_log": {
    "bot_id": "gov.slamonitor",
    "event_type": "METRICS_SCRAPED",
    "slo_name": "fill_latency_ms_p99",
    "actual_value": 312,
    "target_value": 500,
    "compliant": true,
    "scraped_at_ms": 1746792060000
  },
  "user_explanations": [
    {
      "situation": "SLO status report published",
      "message": "Service quality is within the committed targets. All systems are operating normally."
    },
    {
      "situation": "SLO burn rate alert",
      "message": "Service quality has degraded and is consuming the error budget at a high rate. The team has been notified."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "Metrics store is unavailable; SLO compliance cannot be computed; error budget calculation stalls.",
    "false_positive_risk": "A transient spike in fill latency causes a burn-rate alert that resolves in < 5 minutes.",
    "false_negative_risk": "A sustained SLO degradation below the burn-rate threshold goes unalerted.",
    "safe_fallback": "If metrics store is unavailable, emit SLO_STATUS with slo_compliance=unknown and alert on data gap.",
    "required_dependencies": [
      "internal.metrics_store (Prometheus)",
      "internal.report_bus (ExecutionReport)",
      "Postgres SLO store"
    ]
  },
  "acceptance_tests": {
    "unit": [
      {
        "test": "Burn rate alert fires when hourly burn exceeds threshold",
        "setup": "hourly_burn=6.0, burn_rate_alert_pct=5.0",
        "expected": "SLO_BURN_RATE_EXCEEDED alert emitted"
      },
      {
        "test": "SLO status reports all compliant when all metrics within targets",
        "setup": "fill_latency=312, fill_success=99.8, uptime=100.0",
        "expected": "SettlementReport with all slo_compliance.compliant=true"
      }
    ],
    "integration": [
      {
        "test": "Hourly SettlementReport emitted with correct SLO compliance metrics",
        "expected": "SettlementReport on polytraders.reports.settlement every 60 minutes"
      }
    ],
    "property": [
      {
        "property": "Every SLO status report is retained for >= 2555 days",
        "required": "Always true"
      }
    ]
  },
  "checklist_overrides": {},
  "legacy_goal": "Track service-level objectives that we\u2019ve committed to internally and to users; alert on burn.",
  "legacy_pm_signals": [
    "Uptime, latency, fill-quality, and incident SLOs per strategy",
    "Error-budget consumption and burn-rate",
    "SLA breach history with linked incidents"
  ],
  "legacy_external_feeds": [],
  "reporting_groups": [
    "governance_audit"
  ],
  "network": [
    "polygon"
  ],
  "api_surface": [
    "internal"
  ],
  "version": {
    "spec": "2.0.0",
    "implementation": "0.1.0",
    "schema": "2",
    "released": null,
    "planned_release": "Q4-2026"
  },
  "migration_history": [
    {
      "date": "2026-04-28",
      "from": "n/a",
      "to": "v2-spec",
      "reason": "Spec drafted post-CLOB-V2 cutover; bot not yet implemented",
      "action_taken": "Designed against V2 schema (pUSD, builder codes, V2 EIP-712 domain)"
    }
  ],
  "polymarket_v2_compat": {
    "clob_version": "v2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": false,
    "negrisk_aware": false,
    "multichain_ready": false,
    "sdk_used": "py-clob-client-v2",
    "settlement_contract": "CTFExchangeV2",
    "notes": "SLAMonitor tracks service-level objectives across the Polytraders fleet; no CLOB calls. All latency and budget metrics are pUSD-free."
  },
  "reference_implementation": {
    "pseudocode": "// ---- SCRAPE LOOP (every 60s) ----\nFUNCTION scrapeAndEvaluate():\n  metrics = FETCH internal.metricsStore.GET({bots: 'all', window: '1m'})\n  IF metrics IS NULL:\n    EMIT SettlementReport(event_type='SLO_DATA_GAP')\n    alerting.emit('SLO_METRICS_UNAVAILABLE')\n    RETURN\n\n  compliance = {}\n  FOR slo IN config.slo_definitions:\n    actual = metrics.get(slo.name)\n    compliant = (actual <= slo.target) IF slo.type == 'max' ELSE (actual >= slo.target)\n    compliance[slo.name] = {target: slo.target, actual: actual, compliant: compliant}\n    IF NOT compliant:\n      errorBudget.record(slo.name, violation=True)\n\n// ---- HOURLY REPORT ----\nFUNCTION emitHourlyReport(windowStart, windowEnd):\n  burnRate = errorBudget.hourlyBurnRate()\n  IF burnRate > config.burn_rate_alert_pct:\n    alerting.emit('SLO_BURN_RATE_EXCEEDED', {burn_rate: burnRate})\n  IF errorBudget.exhausted():\n    alerting.emit('SLO_BREACH_DETECTED')\n    IF config.auto_freeze_on_breach:\n      deploymentManager.freeze()\n  EMIT SettlementReport(event_type='SLO_STATUS',\n    window_start=windowStart, window_end=windowEnd,\n    slo_compliance=compliance,\n    error_budget_consumed_pct=errorBudget.consumedPct(),\n    hourly_burn_rate_pct=burnRate,\n    retained_until=now() + days(2555))",
    "sdk_calls": [
      "internal.metricsStore.GET({bots, window})",
      "errorBudget.hourlyBurnRate()",
      "alerting.emit('SLO_BURN_RATE_EXCEEDED', metadata)"
    ],
    "complexity": "O(S) per scrape cycle where S = SLO count; O(1) for hourly report"
  },
  "wire_examples": {
    "input": {
      "label": "Prometheus metrics scrape",
      "source": "internal.metrics_store",
      "payload": {
        "fill_latency_ms_p99": 312,
        "fill_success_rate_pct": 99.8,
        "uptime_pct": 100.0,
        "scraped_at_ms": 1746792060000
      }
    },
    "output": {
      "label": "SettlementReport \u2014 SLO_STATUS",
      "payload": {
        "report_id": "stl_sla_01HX9Z",
        "event_type": "SLO_STATUS",
        "error_budget_consumed_pct": 1.2,
        "hourly_burn_rate_pct": 0.8,
        "report_kind": "SettlementReport",
        "topic": "polytraders.reports.settlement",
        "retained_until": "2033-05-09"
      }
    }
  },
  "reason_codes": [
    {
      "code": "SLO_STATUS",
      "severity": "INFO",
      "meaning": "Hourly SLO compliance report emitted.",
      "action": "Log and store.",
      "user_message": "Service quality is within committed targets."
    },
    {
      "code": "SLO_BURN_RATE_EXCEEDED",
      "severity": "WARN",
      "meaning": "Hourly error-budget burn rate exceeds burn_rate_alert_pct.",
      "action": "Emit alert; include in SLO_STATUS report.",
      "user_message": "Service quality has degraded; the team has been notified."
    },
    {
      "code": "SLO_BREACH_DETECTED",
      "severity": "HARD_REJECT",
      "meaning": "Error budget exhausted for the month.",
      "action": "Emit alert; optionally freeze deployments.",
      "user_message": ""
    },
    {
      "code": "SLO_METRICS_UNAVAILABLE",
      "severity": "WARN",
      "meaning": "Metrics store unavailable; SLO compliance unknown.",
      "action": "Emit SLO_DATA_GAP SettlementReport; alert.",
      "user_message": ""
    },
    {
      "code": "KILL_SWITCH_ACTIVE",
      "severity": "WARN",
      "meaning": "KillSwitch active; noted in SLO report as planned downtime.",
      "action": "Exclude kill-switch period from error budget consumption.",
      "user_message": ""
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "polytraders_gov_slamonitor_slo_compliance",
        "type": "gauge",
        "unit": "bool",
        "labels": [
          "slo_name"
        ],
        "meaning": "Current compliance status per SLO (1=compliant, 0=breaching)."
      },
      {
        "name": "polytraders_gov_slamonitor_error_budget_consumed_pct",
        "type": "gauge",
        "unit": "percent",
        "labels": [
          "slo_name"
        ],
        "meaning": "Percentage of monthly error budget consumed per SLO."
      },
      {
        "name": "polytraders_gov_slamonitor_burn_rate_hourly_pct",
        "type": "gauge",
        "unit": "percent",
        "labels": [],
        "meaning": "Current hourly burn rate as percentage of monthly budget."
      },
      {
        "name": "polytraders_gov_slamonitor_status_reports_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "status"
        ],
        "meaning": "Total SLO status reports emitted by status."
      }
    ],
    "alerts": [
      {
        "name": "SLAMonitorBurnRateHigh",
        "condition": "polytraders_gov_slamonitor_burn_rate_hourly_pct > 5",
        "severity": "P2",
        "runbook": "#runbook-slamonitor-burnrate"
      },
      {
        "name": "SLAMonitorBreach",
        "condition": "polytraders_gov_slamonitor_error_budget_consumed_pct > 100",
        "severity": "P1",
        "runbook": "#runbook-slamonitor-breach"
      },
      {
        "name": "SLAMonitorMetricsUnavailable",
        "condition": "absent(polytraders_gov_slamonitor_slo_compliance)",
        "severity": "P2",
        "runbook": "#runbook-slamonitor-metrics"
      }
    ]
  },
  "state": {
    "store": "postgres",
    "shape": "slo_status_reports table: {report_id, window_start, window_end, slo_compliance{}, error_budget_consumed_pct, hourly_burn_rate_pct, retained_until}",
    "ttl": "2555 days (7 years)",
    "recovery": "On restart, reload error budget state from last committed SettlementReport.",
    "size_estimate": "~5 KB per hourly report; ~45 MB per year at 24 reports/day"
  },
  "concurrency": {
    "execution_model": "single-threaded scrape loop + hourly report goroutine",
    "max_in_flight": 5,
    "idempotency_key": "window_start",
    "timeout_ms": 10000,
    "backpressure": "skip scrape if previous not complete",
    "locking": "Postgres unique constraint on window_start for hourly reports"
  },
  "dependencies": {
    "depends_on": [
      {
        "bot_id": "internal.metrics_store",
        "why": "All SLO compliance data is sourced from Prometheus metrics.",
        "contract": "Metrics available with < 60s staleness."
      }
    ],
    "emits_to": [
      {
        "bot_id": "internal.post_trade_archive",
        "what": "SettlementReport with hourly SLO compliance and 7-year retention"
      }
    ],
    "sibling": [
      {
        "bot_id": "gov.incidentcommander",
        "why": "SLAMonitor SLO breach events may trigger IncidentCommander declarations.",
        "contract": "SLO_BREACH_DETECTED event includes scope for IncidentCommander."
      }
    ],
    "external": [
      {
        "service": "Internal metrics store (Prometheus)",
        "endpoint": "https://metrics.internal",
        "sla": "99.9%",
        "failure_mode": "Emit SLO_DATA_GAP SettlementReport; alert; resume on recovery."
      }
    ]
  },
  "security_surfaces": {
    "signs_orders": false,
    "private_key_access": "none",
    "abuse_vectors": [
      "Manipulating metrics to suppress SLO breach detection"
    ],
    "mitigations": [
      "Metrics store is read-only for SLAMonitor; write access is restricted to fleet bots only"
    ]
  },
  "failure_injection": [
    {
      "scenario": "METRICS_STORE_UNAVAILABLE",
      "how_to_inject": "Block reads from internal.metrics_store",
      "expected_behaviour": "SLO_METRICS_UNAVAILABLE emitted; SLO_DATA_GAP SettlementReport; alert",
      "recovery": "Automatic resume when metrics store recovers."
    },
    {
      "scenario": "HIGH_BURN_RATE",
      "how_to_inject": "Inject 200 fill failures to exhaust fill_success_rate SLO budget",
      "expected_behaviour": "SLO_BURN_RATE_EXCEEDED alert; SLO_BREACH_DETECTED if budget exhausted",
      "recovery": "Investigate and resolve fill failures; error budget resets monthly."
    },
    {
      "scenario": "AUTO_FREEZE_ON_BREACH",
      "how_to_inject": "Set auto_freeze_on_breach=true; exhaust error budget",
      "expected_behaviour": "deployment manager frozen; SLO_BREACH_DETECTED emitted",
      "recovery": "Manual unfreeze after SLO remediation."
    }
  ],
  "runbook": {
    "summary": "SLAMonitor incidents require rapid response when error budget burns faster than planned. P1 if budget is exhausted; P2 for high burn rate.",
    "oncall_actions": [
      {
        "alert": "SLAMonitorBreach",
        "first_action": "Identify which SLO is breaching; correlate with recent deployments or incidents.",
        "escalate_to": "Engineering lead; consider declaring incident via IncidentCommander"
      },
      {
        "alert": "SLAMonitorBurnRateHigh",
        "first_action": "Review which metrics are degraded; check fill latency and success rate.",
        "escalate_to": "Governance pod lead"
      }
    ],
    "manual_overrides": [
      {
        "name": "reset-error-budget",
        "how": "polytraders gov slamonitor reset-budget --slo <name> --reason <reason>",
        "when": "After a planned maintenance window that should not count against the SLO budget."
      }
    ],
    "healthcheck": "/internal/health/slamonitor \u2192 green if Metrics store reachable; all SLOs compliant; burn rate < burn_rate_alert_pct; red if Metrics store unreachable or any SLO budget exhausted"
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "Burn rate calculation unit tests pass",
        "how_measured": "CI",
        "threshold": "Pass"
      }
    ],
    "to_limited_live": [
      {
        "gate": "Hourly SLO status report emitted correctly in staging with 3 SLO definitions",
        "how_measured": "Integration test",
        "threshold": "Pass"
      }
    ],
    "to_general_live": [
      {
        "gate": "30-day SLO report history retained in Postgres; compliance team sign-off",
        "how_measured": "Compliance review",
        "threshold": "Pass"
      }
    ]
  },
  "reporting": {
    "emits_kinds": [
      "SettlementReport"
    ],
    "topics": [
      "polytraders.reports.settlement"
    ],
    "cadence": "every-event",
    "retention_class": "7y",
    "sampling_rule": "emit-every",
    "bus_failure_action": "wal-then-retry",
    "user_visible": "summary-only",
    "consumes_kinds": [
      "ExecutionReport"
    ]
  },
  "capital_impact": "Indirect",
  "v3_status": {
    "phase": 3,
    "phase_name": "Reporting & event store",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}