{
  "schema_version": "1.0.0",
  "bot_id": "6.2",
  "bot_name": "Health & Heartbeat",
  "slug": "health-heartbeat",
  "layer": "Governance",
  "layer_key": "gov",
  "bot_class": "Governance Service",
  "authority": [
    "Explain"
  ],
  "status": "live",
  "readiness": "General live",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Governance",
    "bot_class": "Governance Service",
    "authority": "Explain",
    "runs_before": "Every bot lifecycle decision \u2014 HealthHeartbeat must confirm liveness before strategy logic executes",
    "runs_after": "System startup; triggered on CronRunner schedule (every heartbeat_interval_s)",
    "applies_to": "All 97 production bots across all layers",
    "default_mode": "general_live",
    "user_visible": "Advanced details only",
    "developer_owner": "Polytraders core \u2014 Governance pod"
  },
  "purpose": "HealthHeartbeat monitors the liveness of all 97 production bots by polling each bot's internal health endpoint at a configurable interval. If a bot misses missed_heartbeats_to_alert consecutive polls, HealthHeartbeat emits a page-severity alert and optionally triggers an auto-restart. It emits an OperationsReport after every sweep cycle summarising bot health across all layers. Internal-only \u2014 no external API surface.",
  "why_it_matters": [
    {
      "failure": "A bot crashes silently without HealthHeartbeat running",
      "consequence": "The dead bot's layer is unguarded. Risk votes, kill-switch checks, or execution guards may stop firing, allowing uncontrolled order flow."
    },
    {
      "failure": "Auto-restart fires for a bot in a crash-loop",
      "consequence": "Repeated restarts mask a systemic failure and exhaust restart budgets. Without a circuit breaker, the governance layer itself degrades."
    },
    {
      "failure": "Alert not fired on missed heartbeats",
      "consequence": "On-call is not paged. The dead bot may go unnoticed for hours, accumulating unmonitored risk exposure."
    },
    {
      "failure": "HealthHeartbeat itself is not monitored",
      "consequence": "The watchdog is unwatched. A dead HealthHeartbeat means all 97 bots run without liveness supervision."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "None \u2014 all inputs are internal",
      "source": "internal",
      "required": false,
      "use": "HealthHeartbeat does not consume any Polymarket API surface directly."
    }
  ],
  "internal_inputs": [
    {
      "input": "Bot health endpoints \u2014 GET /internal/health/<slug>",
      "source": "All 97 production bots",
      "required": true,
      "use": "Primary liveness signal. A 200 response within timeout_ms is a live heartbeat."
    },
    {
      "input": "Bot registry \u2014 list of all bot slugs, layers, and restart configs",
      "source": "Config store",
      "required": true,
      "use": "Defines the set of bots to monitor and their per-bot restart and alerting rules."
    },
    {
      "input": "Restart executor \u2014 internal command bus topic for restart triggers",
      "source": "Process manager",
      "required": false,
      "use": "When auto_restart=true, HealthHeartbeat publishes a restart command to the process manager after missed_heartbeats_to_alert consecutive misses."
    }
  ],
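  "internal_inputs_sketch": {
    "note": "Illustrative only: a minimal TypeScript shape for the registry entries and health responses described above. Field names beyond the documented slug/layer/restart config and the wire example payload are assumptions, not a confirmed schema.",
    "code": "// Hypothetical types for HealthHeartbeat's two primary inputs.\ninterface RegistryEntry {\n  slug: string;          // e.g. 'strat.some_strategy'\n  layer: string;         // e.g. 'Governance'\n  auto_restart: boolean; // per-bot restart rule from the config store\n}\n\n// Shape of a healthy poll response, taken from the wire example below.\ninterface HealthResponse {\n  slug: string;\n  status: string;           // 'ok' in the wire example\n  last_decision_ms: number; // useful for zombie detection (see failure_modes)\n  uptime_s: number;\n}"
  },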
  "raw_params": [
    "heartbeat_interval_s \u00b7 int",
    "missed_heartbeats_to_alert \u00b7 int",
    "auto_restart \u00b7 bool",
    "page_on_failure \u00b7 bool"
  ],
  "parameters": [
    {
      "name": "heartbeat_interval_s",
      "default": 30,
      "warning": 120,
      "hard": 300,
      "controls": "How often (in seconds) HealthHeartbeat polls each bot's health endpoint.",
      "why_default_matters": "30s gives a 90s detection window for a 3-miss threshold. Increasing beyond 120s delays alerting significantly.",
      "threshold_logic": [
        {
          "condition": "heartbeat_interval_s <= 30",
          "action": "Normal monitoring"
        },
        {
          "condition": "30\u2013120s",
          "action": "WARN \u2014 detection latency increased"
        },
        {
          "condition": "> 300s",
          "action": "Reject config change \u2014 PARAMETER_CHANGE_REQUIRES_APPROVAL"
        }
      ],
      "dev_check": "if (p.heartbeat_interval_s > p.hard) throw ConfigError('PARAMETER_CHANGE_REQUIRES_APPROVAL')",
      "user_facing": "The system checks that all components are running regularly."
    },
    {
      "name": "missed_heartbeats_to_alert",
      "default": 3,
      "warning": 5,
      "hard": 10,
      "controls": "Number of consecutive missed polls before an alert is fired.",
      "why_default_matters": "3 consecutive misses (90s at default interval) is enough to distinguish a transient blip from a real crash.",
      "threshold_logic": [
        {
          "condition": "missed <= 3",
          "action": "Normal tolerance"
        },
        {
          "condition": "4\u201310",
          "action": "WARN \u2014 alert latency increased"
        },
        {
          "condition": "> 10",
          "action": "Reject \u2014 PARAMETER_CHANGE_REQUIRES_APPROVAL"
        }
      ],
      "dev_check": "if (p.missed_heartbeats_to_alert > p.hard) throw ConfigError('PARAMETER_CHANGE_REQUIRES_APPROVAL')",
      "user_facing": "A component is flagged as unhealthy only after multiple consecutive check failures, to avoid false alarms."
    },
    {
      "name": "auto_restart",
      "default": true,
      "warning": null,
      "hard": null,
      "controls": "When true, HealthHeartbeat triggers a restart command after missed_heartbeats_to_alert consecutive failures. Respects a per-bot restart budget.",
      "why_default_matters": "Auto-restart recovers from transient crashes without manual intervention, minimising downtime for governance bots.",
      "threshold_logic": [
        {
          "condition": "auto_restart=true AND misses >= threshold",
          "action": "Publish restart command; emit HEALTH_HEARTBEAT_AUTO_RESTART"
        },
        {
          "condition": "restart_budget exhausted",
          "action": "Emit HEALTH_HEARTBEAT_RESTART_BUDGET_EXHAUSTED; page on-call without restarting"
        }
      ],
      "dev_check": "if (p.auto_restart && misses >= p.missed_heartbeats_to_alert) triggerRestart(bot_slug)",
      "user_facing": "If a component stops responding, the system will attempt to restart it automatically."
    },
    {
      "name": "page_on_failure",
      "default": true,
      "warning": null,
      "hard": null,
      "controls": "When true (locked), a page-severity alert is fired for any bot that exceeds the missed heartbeat threshold.",
      "why_default_matters": "Every bot that stops heartbeating is a potential live incident. Paging is mandatory.",
      "threshold_logic": [
        {
          "condition": "page_on_failure=true AND misses >= threshold",
          "action": "Fire page-severity alert"
        },
        {
          "condition": "page_on_failure=false",
          "action": "Not permitted \u2014 parameter is locked to true"
        }
      ],
      "dev_check": "if (!p.page_on_failure) throw ConfigError('PARAMETER_CHANGE_REQUIRES_APPROVAL')",
      "user_facing": "Critical system components are monitored by an on-call team."
    }
  ],
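  "parameter_validation_sketch": {
    "note": "A minimal sketch of config-load validation, assuming the dev_check convention used above; ConfigError is modelled as a plain Error and the warn/reject bounds are the documented warning and hard values.",
    "code": "interface HeartbeatParams {\n  heartbeat_interval_s: number;\n  missed_heartbeats_to_alert: number;\n  auto_restart: boolean;\n  page_on_failure: boolean;\n}\n\nfunction validateParams(p: HeartbeatParams): void {\n  // Hard maxima and the locked page_on_failure=true are rejected outright.\n  if (p.heartbeat_interval_s > 300 || p.missed_heartbeats_to_alert > 10 || !p.page_on_failure) {\n    throw new Error('PARAMETER_CHANGE_REQUIRES_APPROVAL');\n  }\n  // Warning bands only log; they do not block the config change.\n  if (p.heartbeat_interval_s > 120) console.warn('WARN: detection latency increased');\n  if (p.missed_heartbeats_to_alert > 5) console.warn('WARN: alert latency increased');\n}"
  },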
  "default_config": {
    "bot_id": "gov.health_heartbeat",
    "version": "2.0.0",
    "mode": "general_live",
    "defaults": {
      "heartbeat_interval_s": 30,
      "missed_heartbeats_to_alert": 3,
      "auto_restart": true,
      "page_on_failure": true
    },
    "locked": {
      "page_on_failure": {
        "immutable": true
      },
      "heartbeat_interval_s": {
        "max": 300
      },
      "missed_heartbeats_to_alert": {
        "max": 10
      }
    }
  },
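  "detection_window_example": {
    "note": "Worked example of the detection window implied by the defaults above; the extra phase-offset term is a reasoning aid, since a crash can land just after a successful poll.",
    "code": "// Nominal window quoted in why_default_matters:\n//   heartbeat_interval_s * missed_heartbeats_to_alert = 30s * 3 = 90s\n// Worst case adds up to one interval of phase offset:\n//   90s + 30s = 120s from crash to alert\nconst detectionWindowS = 30 * 3; // 90"
  },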
  "implementation_flow": [
    "On startup, load the bot registry from the config store; build a polling table keyed by bot_slug with miss_count=0.",
    "Every heartbeat_interval_s, iterate over all registered bots and issue GET /internal/health/<slug> with a timeout of heartbeat_interval_s/3.",
    "For each bot: if response is 200 within timeout, reset miss_count to 0 and emit INFO heartbeat.",
    "If response is non-200 or times out, increment miss_count.",
    "When miss_count >= missed_heartbeats_to_alert: emit page alert (HEALTH_HEARTBEAT_BOT_DOWN) and, if auto_restart=true, publish restart command to the process manager.",
    "Track restart budget per bot (default 3 restarts per 10 minutes). If budget is exhausted, emit HEALTH_HEARTBEAT_RESTART_BUDGET_EXHAUSTED and stop auto-restarting.",
    "After each full sweep, emit an OperationsReport summarising: total_bots, healthy_count, unhealthy_count, restarted_count, sweep_duration_ms.",
    "HealthHeartbeat itself is monitored by a watchdog process (deadman timer) that pages if no OperationsReport is emitted within 2x heartbeat_interval_s."
  ],
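  "restart_budget_sketch": {
    "note": "A minimal sketch of the sliding restart-budget window from step 6 of the flow (3 restarts per 10 minutes per bot); names are illustrative and the window resets exactly as in the reference pseudocode.",
    "code": "interface RestartBudget { count: number; windowStartMs: number }\n\nfunction mayRestart(budget: RestartBudget, nowMs: number): boolean {\n  if (nowMs - budget.windowStartMs > 600_000) { // 10-minute window elapsed\n    budget.count = 0;\n    budget.windowStartMs = nowMs;\n  }\n  if (budget.count >= 3) return false; // budget exhausted: page, do not restart\n  budget.count += 1;\n  return true;\n}"
  },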
  "decision_logic": {
    "approve": "Not applicable \u2014 HealthHeartbeat does not approve or reject trading decisions.",
    "reshape_required": "Not applicable.",
    "reject": "Not applicable as a trading decision.",
    "warning_only": "A single missed heartbeat increments the miss counter but does not fire an alert. Only consecutive misses at or above the threshold trigger an alert or restart."
  },
  "decision_output_schema": "OperationsReport",
  "decision_output_example": {
    "report_id": "ops_health_20260509T120000Z",
    "bot_id": "gov.health_heartbeat",
    "event_type": "HEALTH_SWEEP_COMPLETE",
    "total_bots": 97,
    "healthy_count": 96,
    "unhealthy_count": 1,
    "restarted_count": 1,
    "sweep_duration_ms": 840,
    "unhealthy_bots": [
      {
        "slug": "strat.some_strategy",
        "miss_count": 3,
        "action": "restarted"
      }
    ],
    "fired_at_ms": 1746792000000,
    "report_kind": "OperationsReport"
  },
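  "decision_output_type_sketch": {
    "note": "Illustrative TypeScript shape of the OperationsReport payload, inferred from the example above and the reference pseudocode; not a confirmed schema definition.",
    "code": "interface UnhealthyBot {\n  slug: string;\n  miss_count: number;\n  action: 'restarted' | 'budget_exhausted';\n}\n\ninterface OperationsReport {\n  report_id: string;\n  bot_id: string;\n  event_type: 'HEALTH_SWEEP_COMPLETE';\n  total_bots: number;\n  healthy_count: number;\n  unhealthy_count: number;\n  restarted_count: number;\n  sweep_duration_ms: number;\n  unhealthy_bots: UnhealthyBot[];\n  fired_at_ms: number;\n  report_kind: 'OperationsReport';\n}"
  },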
  "developer_log": {
    "bot_id": "gov.health_heartbeat",
    "event_type": "HEALTH_BOT_MISS",
    "slug": "strat.some_strategy",
    "miss_count": 2,
    "threshold": 3,
    "last_seen_ms": 1746791940000,
    "fired_at_ms": 1746791970000
  },
  "user_explanations": [
    {
      "situation": "All bots healthy",
      "message": "All system components passed their health checks. Everything is running normally."
    },
    {
      "situation": "A bot was auto-restarted",
      "message": "A component stopped responding and was automatically restarted. Trading and risk monitoring continued without interruption."
    },
    {
      "situation": "A bot is down and restart budget exhausted",
      "message": "A component is not responding and automatic restart attempts have been exhausted. The on-call team has been notified."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "HealthHeartbeat itself crashes, silently leaving all 97 bots unmonitored. Requires an external deadman watchdog.",
    "false_positive_risk": "A healthy bot's health endpoint returns 503 transiently (e.g., during a rolling restart), triggering a spurious miss counter increment.",
    "false_negative_risk": "A bot crashes but its health endpoint continues to respond 200 from a zombie process that has stopped processing events \u2014 HealthHeartbeat sees it as healthy.",
    "safe_fallback": "If HealthHeartbeat cannot reach a bot's health endpoint due to a network partition, it increments miss_count normally and fires the alert after the threshold. The bot is never silently marked healthy on connectivity loss.",
    "required_dependencies": [
      "Bot registry (config store)",
      "Internal health endpoints on all 97 bots",
      "Process manager (for auto-restart commands)",
      "Alerting / paging system",
      "Deadman watchdog for HealthHeartbeat itself"
    ]
  },
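  "liveness_check_sketch": {
    "note": "Sketch of a body-validating liveness check that narrows the zombie-process false negative above, per the security mitigation that responses are checked for a valid JSON body and not just HTTP status. The 10-minute staleness cutoff on last_decision_ms is a hypothetical illustration, not documented behaviour.",
    "code": "function isLive(status: number, body: { status?: string; last_decision_ms?: number }, nowMs: number): boolean {\n  if (status !== 200) return false;       // non-200 is always a miss\n  if (body.status !== 'ok') return false; // body must parse and report ok\n  // Hypothetical zombie check: responding but no recent decisions.\n  if (body.last_decision_ms !== undefined && nowMs - body.last_decision_ms > 10 * 60_000) {\n    return false;\n  }\n  return true;\n}"
  },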
  "acceptance_tests": {
    "unit": [
      {
        "test": "miss_count increments on non-200 response",
        "setup": "Mock health endpoint returns 503",
        "expected": "miss_count incremented; no alert below threshold"
      },
      {
        "test": "Alert fires at threshold",
        "setup": "miss_count == missed_heartbeats_to_alert",
        "expected": "HEALTH_HEARTBEAT_BOT_DOWN alert emitted; restart triggered if auto_restart=true"
      },
      {
        "test": "Restart budget enforced",
        "setup": "3 restarts in 10 minutes for same bot",
        "expected": "4th restart blocked; HEALTH_HEARTBEAT_RESTART_BUDGET_EXHAUSTED emitted"
      },
      {
        "test": "miss_count resets on recovery",
        "setup": "Bot returns to 200 after 2 misses",
        "expected": "miss_count reset to 0; HEALTH_HEARTBEAT_BOT_RECOVERED emitted"
      },
      {
        "test": "heartbeat_interval_s above hard maximum rejected",
        "setup": "heartbeat_interval_s=400",
        "expected": "ConfigError PARAMETER_CHANGE_REQUIRES_APPROVAL"
      }
    ],
    "integration": [
      {
        "test": "Full sweep of all 97 bots completes within heartbeat_interval_s",
        "expected": "OperationsReport emitted with total_bots=97 within configured interval"
      },
      {
        "test": "Auto-restart command delivered to process manager",
        "expected": "Restart command published; bot restarts; miss_count resets on recovery"
      }
    ],
    "property": [
      {
        "property": "Every missed heartbeat increments miss_count; no miss is silently dropped",
        "required": "Always true"
      },
      {
        "property": "An OperationsReport is emitted after every sweep cycle",
        "required": "Always true"
      }
    ]
  },
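  "acceptance_test_sketch": {
    "note": "A runnable sketch of the 'miss_count resets on recovery' unit case, using a hypothetical in-memory tracker rather than the project's real test harness.",
    "code": "import { strict as assert } from 'node:assert';\n\nconst missCounts = new Map<string, number>();\nconst record = (slug: string, ok: boolean) =>\n  missCounts.set(slug, ok ? 0 : (missCounts.get(slug) ?? 0) + 1);\n\nrecord('strat.some_strategy', false); // 503\nrecord('strat.some_strategy', false); // timeout\nassert.equal(missCounts.get('strat.some_strategy'), 2);\nrecord('strat.some_strategy', true);  // recovered with 200\nassert.equal(missCounts.get('strat.some_strategy'), 0);"
  },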
  "checklist_overrides": {},
  "legacy_goal": "Per-bot heartbeat; restart on crash; alert on drift.",
  "legacy_pm_signals": [
    "Order-reject rate, fill anomaly, latency spike"
  ],
  "legacy_external_feeds": [
    "On-call paging system"
  ],
  "reporting_groups": [
    "governance_audit"
  ],
  "network": [
    "polygon"
  ],
  "api_surface": [
    "internal"
  ],
  "reference_implementation": {
    "summary": "Polls all 97 registered bots' health endpoints every heartbeat_interval_s, tracks consecutive misses, fires alerts and auto-restarts at threshold, emits a sweep OperationsReport after each cycle.",
    "language_note": "Pseudocode is language-agnostic. FETCH = read input. EMIT = produce output. Translate to TS/Python/Go/Rust.",
    "pseudocode": "// ---- STARTUP ----\nFUNCTION init():\n  registry = FETCH config_store.GET('/bot-registry')\n  miss_counts = { slug: 0 FOR slug IN registry }\n  restart_budgets = { slug: { count: 0, window_start: now() } FOR slug IN registry }\n  setInterval(runSweep, config.heartbeat_interval_s * 1000)\n\n// ---- SWEEP ----\nFUNCTION runSweep():\n  sweep_start = now()\n  healthy = 0; unhealthy = 0; restarted = 0\n  unhealthy_bots = []\n\n  FOR bot IN registry:\n    response = FETCH GET '/internal/health/' + bot.slug\n      TIMEOUT config.heartbeat_interval_s / 3 * 1000\n\n    IF response.status == 200:\n      IF miss_counts[bot.slug] >= config.missed_heartbeats_to_alert:\n        EMIT alert(HEALTH_HEARTBEAT_BOT_RECOVERED, bot.slug)\n      miss_counts[bot.slug] = 0\n      healthy += 1\n    ELSE:\n      miss_counts[bot.slug] += 1\n      unhealthy += 1\n\n      IF miss_counts[bot.slug] >= config.missed_heartbeats_to_alert:\n        alerting.emit('HEALTH_HEARTBEAT_BOT_DOWN', {\n          slug: bot.slug, miss_count: miss_counts[bot.slug] })\n\n        IF config.auto_restart:\n          budget = restart_budgets[bot.slug]\n          IF (now() - budget.window_start) > 600_000:  // 10-min window\n            budget.count = 0; budget.window_start = now()\n          IF budget.count < 3:\n            internal_bus.publish('process.restart', { slug: bot.slug })\n            budget.count += 1; restarted += 1\n            unhealthy_bots.append({ slug: bot.slug, miss_count: miss_counts[bot.slug], action: 'restarted' })\n          ELSE:\n            alerting.emit('HEALTH_HEARTBEAT_RESTART_BUDGET_EXHAUSTED', { slug: bot.slug })\n            unhealthy_bots.append({ slug: bot.slug, miss_count: miss_counts[bot.slug], action: 'budget_exhausted' })\n\n  EMIT OperationsReport({\n    report_id:         'ops_health_' + sweep_start,\n    event_type:        'HEALTH_SWEEP_COMPLETE',\n    total_bots:        len(registry),\n    healthy_count:     healthy,\n    unhealthy_count:   unhealthy,\n    restarted_count:   restarted,\n    sweep_duration_ms: now() - sweep_start,\n    unhealthy_bots:    unhealthy_bots,\n    fired_at_ms:       sweep_start\n  })\n",
    "sdk_calls": [
      "config_store.GET('/bot-registry')",
      "FETCH GET '/internal/health/<slug>' TIMEOUT <ms>",
      "internal_bus.publish('process.restart', { slug })",
      "alerting.emit('HEALTH_HEARTBEAT_BOT_DOWN', metadata)",
      "alerting.emit('HEALTH_HEARTBEAT_RESTART_BUDGET_EXHAUSTED', metadata)"
    ],
    "complexity": "O(N) per sweep where N = 97 registered bots"
  },
  "wire_examples": {
    "input": {
      "label": "Health endpoint poll response",
      "source": "internal /internal/health/<slug>",
      "payload": {
        "slug": "strat.some_strategy",
        "status": "ok",
        "last_decision_ms": 1746791970000,
        "uptime_s": 86400
      }
    },
    "output": {
      "label": "OperationsReport \u2014 HEALTH_SWEEP_COMPLETE",
      "payload": {
        "report_id": "ops_health_1746792000000",
        "bot_id": "gov.health_heartbeat",
        "event_type": "HEALTH_SWEEP_COMPLETE",
        "total_bots": 97,
        "healthy_count": 96,
        "unhealthy_count": 1,
        "restarted_count": 1,
        "sweep_duration_ms": 840,
        "unhealthy_bots": [
          {
            "slug": "strat.some_strategy",
            "miss_count": 3,
            "action": "restarted"
          }
        ],
        "fired_at_ms": 1746792000000,
        "report_kind": "OperationsReport"
      }
    }
  },
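  "health_poll_sketch": {
    "note": "Sketch of a single health poll with a hard timeout (heartbeat_interval_s / 3, i.e. 10s at defaults), using the standard fetch plus AbortController pattern; the base URL is a placeholder assumption.",
    "code": "async function pollHealth(slug: string, timeoutMs: number): Promise<boolean> {\n  const ctrl = new AbortController();\n  const timer = setTimeout(() => ctrl.abort(), timeoutMs);\n  try {\n    // Hypothetical internal base URL; only the /internal/health/<slug> path is documented.\n    const res = await fetch(`http://localhost:8080/internal/health/${slug}`, { signal: ctrl.signal });\n    return res.status === 200; // a 200 within the timeout is a live heartbeat\n  } catch {\n    return false; // timeout or network error counts as a missed heartbeat\n  } finally {\n    clearTimeout(timer);\n  }\n}"
  },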
  "reason_codes": [
    {
      "code": "HEALTH_HEARTBEAT_SWEEP_COMPLETE",
      "severity": "INFO",
      "meaning": "Full sweep of all registered bots completed; OperationsReport emitted.",
      "action": "No action \u2014 routine heartbeat.",
      "user_message": ""
    },
    {
      "code": "HEALTH_HEARTBEAT_BOT_DOWN",
      "severity": "WARN",
      "meaning": "A bot has exceeded the missed_heartbeats_to_alert threshold of consecutive missed polls.",
      "action": "Fire page-severity alert; trigger auto-restart if enabled.",
      "user_message": "A system component is not responding. The on-call team has been notified."
    },
    {
      "code": "HEALTH_HEARTBEAT_BOT_RECOVERED",
      "severity": "INFO",
      "meaning": "A previously unhealthy bot returned a healthy response; miss_count reset to 0.",
      "action": "Emit recovery notification; no further action.",
      "user_message": "A component that was restarted is now healthy."
    },
    {
      "code": "HEALTH_HEARTBEAT_AUTO_RESTART",
      "severity": "WARN",
      "meaning": "CronRunner triggered an automatic restart for a bot that missed the heartbeat threshold.",
      "action": "Log restart; increment restart budget counter.",
      "user_message": "A component was automatically restarted."
    },
    {
      "code": "HEALTH_HEARTBEAT_RESTART_BUDGET_EXHAUSTED",
      "severity": "WARN",
      "meaning": "A bot has been restarted the maximum number of times within the restart budget window without recovering.",
      "action": "Stop auto-restarting; escalate page to on-call.",
      "user_message": "Automatic restart attempts have been exhausted for a component. Manual intervention is required."
    },
    {
      "code": "HEALTH_HEARTBEAT_ENDPOINT_TIMEOUT",
      "severity": "WARN",
      "meaning": "A bot's health endpoint did not respond within the configured timeout.",
      "action": "Treat as missed heartbeat; increment miss_count.",
      "user_message": ""
    },
    {
      "code": "KILL_SWITCH_ACTIVE",
      "severity": "WARN",
      "meaning": "KillSwitch is active; this is surfaced in the sweep report for context.",
      "action": "Continue monitoring all bots; do not suppress health checks.",
      "user_message": ""
    },
    {
      "code": "HEALTH_HEARTBEAT_REGISTRY_STALE",
      "severity": "WARN",
      "meaning": "The bot registry has not been refreshed from the config store within 5 minutes.",
      "action": "Retry registry fetch; alert if stale for > 10 minutes.",
      "user_message": ""
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "polytraders_gov_healthheartbeat_bots_healthy",
        "type": "gauge",
        "unit": "count",
        "labels": [],
        "meaning": "Number of bots currently in healthy state."
      },
      {
        "name": "polytraders_gov_healthheartbeat_bots_unhealthy",
        "type": "gauge",
        "unit": "count",
        "labels": [],
        "meaning": "Number of bots currently in unhealthy state (above miss threshold)."
      },
      {
        "name": "polytraders_gov_healthheartbeat_restarts_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "slug"
        ],
        "meaning": "Total auto-restarts triggered per bot slug."
      },
      {
        "name": "polytraders_gov_healthheartbeat_misses_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "slug"
        ],
        "meaning": "Total missed heartbeat polls per bot slug."
      },
      {
        "name": "polytraders_gov_healthheartbeat_sweep_duration_ms",
        "type": "histogram",
        "unit": "ms",
        "labels": [],
        "meaning": "Wall-clock latency of a full 97-bot sweep cycle."
      },
      {
        "name": "polytraders_gov_healthheartbeat_sweeps_total",
        "type": "counter",
        "unit": "count",
        "labels": [],
        "meaning": "Total sweep cycles completed."
      }
    ],
    "alerts": [
      {
        "name": "HealthHeartbeatBotDown",
        "condition": "polytraders_gov_healthheartbeat_bots_unhealthy > 0",
        "severity": "page",
        "runbook": "#runbook-healthheartbeat-bot-down"
      },
      {
        "name": "HealthHeartbeatRestartBudgetExhausted",
        "condition": "rate(polytraders_gov_healthheartbeat_restarts_total[10m]) > 3",
        "severity": "page",
        "runbook": "#runbook-healthheartbeat-restart-budget"
      },
      {
        "name": "HealthHeartbeatSweepMissing",
        "condition": "rate(polytraders_gov_healthheartbeat_sweeps_total[5m]) == 0",
        "severity": "page",
        "runbook": "#runbook-healthheartbeat-missing"
      },
      {
        "name": "HealthHeartbeatSweepLatencyHigh",
        "condition": "histogram_quantile(0.99, polytraders_gov_healthheartbeat_sweep_duration_ms) > 25000",
        "severity": "warn",
        "runbook": "#runbook-healthheartbeat-latency"
      }
    ],
    "dashboards": [
      "Grafana \u2014 Governance / HealthHeartbeat liveness overview (all 97 bots)",
      "Grafana \u2014 Governance / Auto-restart rate and budget consumption"
    ],
    "log_level": "info"
  },
  "state": {
    "store": "in-memory",
    "shape": "miss_counts: { slug -> int }, restart_budgets: { slug -> { count, window_start } }",
    "ttl": "Not persisted; rebuilt on startup from a fresh sweep",
    "recovery": "On restart, all miss_counts reset to 0. The first sweep re-establishes the health baseline.",
    "size_estimate": "< 50 KB for 97 bots"
  },
  "concurrency": {
    "execution_model": "thread-pool (one HTTP poll per bot in parallel)",
    "max_in_flight": 97,
    "idempotency_key": "slug + sweep_start_ms",
    "timeout_ms": 10000,
    "backpressure": "cap parallel polls at max_in_flight=97; excess queued to next sweep",
    "locking": "per-slug mutex on miss_counts and restart_budgets"
  },
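  "concurrency_sketch": {
    "note": "Sketch of the parallel sweep implied by the concurrency model above: one in-flight poll per bot (max_in_flight = 97), gathered in a single pass. pollHealth is the hypothetical helper from health_poll_sketch, declared here so the snippet stands alone.",
    "code": "declare function pollHealth(slug: string, timeoutMs: number): Promise<boolean>;\n\nasync function sweep(slugs: string[], timeoutMs: number): Promise<Map<string, boolean>> {\n  // All polls start together; each resolves to [slug, alive] and never rejects.\n  const results = await Promise.all(\n    slugs.map(async (slug) => [slug, await pollHealth(slug, timeoutMs)] as const)\n  );\n  return new Map(results);\n}"
  },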
  "dependencies": {
    "depends_on": [
      {
        "bot_id": "internal.config_store",
        "why": "Bot registry is loaded from config store on startup."
      }
    ],
    "emits_to": [
      {
        "bot_id": "internal.process_manager",
        "what": "process.restart commands for unhealthy bots"
      }
    ],
    "sibling": [
      {
        "bot_id": "gov.cron_runner",
        "why": "CronRunner fires the hourly health sweep trigger."
      }
    ],
    "external": [
      {
        "service": "Alerting / paging system",
        "sla": "99.9% (internal SRE target)",
        "fallback": "If paging system is unavailable, alert is queued locally and retried; log to stderr."
      }
    ]
  },
  "security_surfaces": {
    "signs_orders": false,
    "private_key_access": "none",
    "abuse_vectors": [
      "A bot returns a fake 200 response from a zombie process to avoid restart",
      "Raising missed_heartbeats_to_alert to a very high value to prevent alerts from firing",
      "Disabling page_on_failure to suppress alerting"
    ],
    "mitigations": [
      "page_on_failure is locked immutable; cannot be disabled",
      "heartbeat_interval_s and missed_heartbeats_to_alert have hard maximums enforced at config load",
      "Health endpoint responses are checked for a valid JSON body, not just HTTP status",
      "HealthHeartbeat itself is monitored by an external deadman watchdog"
    ],
    "contract_calls": []
  },
  "polymarket_v2_compat": {
    "clob_version": "v2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": false,
    "negrisk_aware": false,
    "multichain_ready": false,
    "sdk_used": "internal-only",
    "settlement_contract": "none",
    "notes": "HealthHeartbeat monitors liveness of all bots including V2-aware ones but has no direct CLOB or on-chain interface itself."
  },
  "version": {
    "spec": "2.0.0",
    "implementation": "2.1.0",
    "schema": "2",
    "released": "2026-04-28"
  },
  "migration_history": [
    {
      "date": "2026-04-28",
      "from": "v1",
      "to": "v2",
      "reason": "CLOB V2 cutover",
      "action_taken": "No direct CLOB changes required. Updated OperationsReport schema; removed stale USDC.e references from sweep report payloads. Added V2-aware bots to the monitoring registry."
    }
  ],
  "failure_injection": [
    {
      "scenario": "BOT_CRASH",
      "how_to_inject": "Kill a bot process so its health endpoint stops responding",
      "expected_behavior": "miss_count increments each poll; after missed_heartbeats_to_alert misses, HEALTH_HEARTBEAT_BOT_DOWN alert fires and restart is triggered",
      "recovery": "Bot restarts; miss_count resets to 0; HEALTH_HEARTBEAT_BOT_RECOVERED emitted."
    },
    {
      "scenario": "RESTART_BUDGET_EXHAUSTED",
      "how_to_inject": "Repeatedly kill a bot faster than restart_budget window (3 crashes in < 10 min)",
      "expected_behavior": "Third restart fires; fourth missed threshold triggers HEALTH_HEARTBEAT_RESTART_BUDGET_EXHAUSTED; no further auto-restart",
      "recovery": "Manual intervention required; budget resets after 10-minute window."
    },
    {
      "scenario": "HEALTH_HEARTBEAT_SELF_CRASH",
      "how_to_inject": "Kill HealthHeartbeat process",
      "expected_behavior": "Deadman watchdog fires page after 2x heartbeat_interval_s without a sweep OperationsReport",
      "recovery": "HealthHeartbeat is restarted by the process manager; sweep resumes; miss counts reinitialised."
    },
    {
      "scenario": "ENDPOINT_TIMEOUT",
      "how_to_inject": "Set a mock health endpoint to respond after 30s (beyond timeout)",
      "expected_behavior": "HEALTH_HEARTBEAT_ENDPOINT_TIMEOUT logged; miss_count incremented",
      "recovery": "When endpoint responds within timeout, miss_count resets."
    },
    {
      "scenario": "NETWORK_PARTITION",
      "how_to_inject": "Block internal network between HealthHeartbeat and a subset of bots",
      "expected_behavior": "Affected bots' miss counts increment; alert fires at threshold; restart attempted (network partition means restart may not help)",
      "recovery": "Network restored; bots return to healthy; miss counts reset."
    }
  ],
  "runbook": {
    "summary": "HealthHeartbeat incidents are either a bot going down (most common), the restart budget exhausting on a crash-looping bot, or HealthHeartbeat itself failing. All three require immediate response.",
    "oncall_actions": [
      {
        "alert": "HealthHeartbeatBotDown",
        "first_step": "Identify which bot(s) are unhealthy from the sweep OperationsReport. Check bot logs for crash details.",
        "escalation": "Layer pod lead for the affected bot"
      },
      {
        "alert": "HealthHeartbeatRestartBudgetExhausted",
        "first_step": "Do NOT manually restart the bot without investigating crash logs. Check for crash-loop root cause.",
        "escalation": "Layer pod lead + SRE on-call immediately"
      },
      {
        "alert": "HealthHeartbeatSweepMissing",
        "first_step": "Check HealthHeartbeat process status; verify deadman watchdog is running.",
        "escalation": "Governance pod lead immediately"
      },
      {
        "alert": "HealthHeartbeatSweepLatencyHigh",
        "first_step": "Check internal network latency to bot health endpoints; reduce parallel poll count if overloaded.",
        "escalation": "SRE on-call after 30 minutes"
      }
    ],
    "manual_overrides": [
      {
        "name": "pause_auto_restart",
        "how": "polytraders gov health pause-restart --slug <slug>",
        "when": "Stop auto-restart for a specific bot while investigating a crash-loop.",
        "command": "polytraders gov health pause-restart --slug <slug>",
        "effect": "Stop auto-restart for a specific bot while investigating a crash-loop."
      }
    ],
    "healthcheck": "Endpoint: /internal/health/health-heartbeat | Green: Last sweep completed within 2x heartbeat_interval_s; all bots polled; OperationsReport emitted. | Red: No sweep in 2x heartbeat_interval_s; registry load failed; process unresponsive."
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "Unit tests pass for miss counting, alert threshold, and restart budget",
        "how_measured": "CI test run",
        "threshold": "100% pass"
      }
    ],
    "to_limited_live": [
      {
        "gate": "Full 97-bot sweep completes within heartbeat_interval_s under normal load",
        "how_measured": "polytraders_gov_healthheartbeat_sweep_duration_ms histogram",
        "threshold": "< 30s sweep for 97 bots"
      }
    ],
    "to_general_live": [
      {
        "gate": "End-to-end: bot crash detected and auto-restarted within 3 sweep cycles",
        "how_measured": "Failure injection test",
        "threshold": "Pass"
      },
      {
        "gate": "Restart budget exhaustion alert fires and stops further restarts",
        "how_measured": "Failure injection test",
        "threshold": "Pass"
      }
    ]
  },
  "reporting": {
    "emits_kinds": [
      "OperationsReport"
    ],
    "topics": [
      "polytraders.reports.ops"
    ],
    "cadence": "every-event",
    "retention_class": "1y",
    "sampling_rule": "batched-1/min",
    "bus_failure_action": "drop-after-buffer",
    "user_visible": "no",
    "consumes_kinds": []
  },
  "capital_impact": "Indirect",
  "v3_status": {
    "phase": 3,
    "phase_name": "Reporting & event store",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}