{
  "schema_version": "1.0.0",
  "bot_id": "2.11",
  "bot_name": "ExchangeStatusMonitor",
  "slug": "exchangestatusmonitor",
  "layer": "Execution",
  "layer_key": "exec",
  "bot_class": "Execution Utility",
  "authority": [
    "Reshape"
  ],
  "status": "planned",
  "readiness": "Spec started",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Execution",
    "bot_class": "Execution Utility",
    "authority": "Reshape",
    "runs_before": "All exec bots that submit orders",
    "runs_after": "Continuous background process; does not depend on order flow",
    "applies_to": "All active trading while Polymarket CLOB V2 is the execution venue",
    "default_mode": "shadow_only",
    "user_visible": "summary-only",
    "developer_owner": "Polytraders core \u2014 Execution pod"
  },
  "purpose": "ExchangeStatusMonitor treats Polymarket itself as a degradable dependency. It polls CLOB V2 endpoint health, watches for reject-rate spikes, and parses public maintenance signals. When degradation is confirmed, it emits ObservationReports that trigger pause or de-risk actions across the exec layer.",
  "why_it_matters": [
    {
      "failure": "Exchange degradation not detected",
      "consequence": "Orders continue to be submitted to a degraded CLOB, accumulating 429 errors, failed acks, and stale fills."
    },
    {
      "failure": "Maintenance window missed",
      "consequence": "Orders submitted during a scheduled maintenance window are rejected without useful error context, causing unnecessary retries."
    },
    {
      "failure": "Resume too early after incident",
      "consequence": "Resuming order submission before the CLOB has fully recovered causes a second wave of errors and potentially double-fills on retry."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "CLOB V2 health/ping endpoint",
      "source": "clob_public",
      "required": true,
      "use": "Detect CLOB REST endpoint availability and error rates."
    },
    {
      "input": "Polymarket public status page",
      "source": "internal HTTP poller",
      "required": false,
      "use": "Parse maintenance-window announcements and incident notices."
    }
  ],
  "internal_inputs": [
    {
      "input": "Reject-rate metrics from OrderLifecycleManager",
      "source": "exec.orderlifecyclemanager",
      "required": true,
      "use": "Detect reject-rate spikes (429 or 503 responses) as an exchange degradation signal."
    }
  ],
  "raw_params": [
    "pause_on_status \u00b7 list",
    "flatten_on_status \u00b7 list",
    "poll_interval_s \u00b7 int",
    "resume_quarantine_min \u00b7 int"
  ],
  "parameters": [
    {
      "name": "pause_on_status",
      "default": [
        "degraded",
        "maintenance"
      ],
      "warning": "\u2014",
      "hard": "\u2014",
      "controls": "List of exchange status codes on which to emit PAUSE ObservationReport to suspend order submission.",
      "why_default_matters": "Pausing on 'degraded' and 'maintenance' covers the two most common exchange unavailability scenarios.",
      "threshold_logic": [
        {
          "condition": "status NOT IN pause_on_status",
          "action": "No pause \u2014 continue order submission"
        },
        {
          "condition": "status IN pause_on_status",
          "action": "Emit EXCHANGE_STATUS_PAUSE ObservationReport"
        }
      ],
      "dev_check": "if status in params.pause_on_status: emit(EXCHANGE_STATUS_PAUSE)",
      "user_facing": "Trading has been paused because the exchange is temporarily unavailable."
    },
    {
      "name": "flatten_on_status",
      "default": [
        "outage"
      ],
      "warning": "\u2014",
      "hard": "\u2014",
      "controls": "List of exchange status codes on which to emit FLATTEN ObservationReport, requesting all open orders to be cancelled.",
      "why_default_matters": "A full outage means orders will not be filled or cancelled by the exchange; the safest response is to cancel all open orders.",
      "threshold_logic": [
        {
          "condition": "status NOT IN flatten_on_status",
          "action": "No flatten"
        },
        {
          "condition": "status IN flatten_on_status",
          "action": "Emit EXCHANGE_STATUS_FLATTEN ObservationReport"
        }
      ],
      "dev_check": "if status in params.flatten_on_status: emit(EXCHANGE_STATUS_FLATTEN)",
      "user_facing": "Your open orders have been cancelled because the exchange is experiencing an outage."
    },
    {
      "name": "poll_interval_s",
      "default": 15,
      "warning": 30,
      "hard": 60,
      "controls": "How often to poll the CLOB health endpoint and status page.",
      "why_default_matters": "15s provides timely detection of degradation; polling faster increases request overhead on an already-stressed exchange.",
      "threshold_logic": [
        {
          "condition": "interval <= 30s",
          "action": "Normal polling"
        },
        {
          "condition": "interval > 30s",
          "action": "WARN \u2014 degradation detection latency increased"
        },
        {
          "condition": "interval > 60s (hard)",
          "action": "Reject config"
        }
      ],
      "dev_check": "assert params.poll_interval_s <= params.hard",
      "user_facing": "Exchange availability is checked regularly."
    },
    {
      "name": "resume_quarantine_min",
      "default": 5,
      "warning": 2,
      "hard": 1,
      "controls": "Minutes to wait after exchange status returns to healthy before lifting the pause, to prevent false resumption.",
      "why_default_matters": "A 5-minute quarantine absorbs intermittent recovery signals; CLOB incidents often have brief healthy periods before full recovery.",
      "threshold_logic": [
        {
          "condition": "quarantine_min >= 2",
          "action": "Normal; wait for full recovery"
        },
        {
          "condition": "quarantine_min < 2 (warning)",
          "action": "WARN \u2014 may resume too early"
        },
        {
          "condition": "quarantine_min < 1 (hard)",
          "action": "Reject config \u2014 minimum 1 min quarantine required"
        }
      ],
      "dev_check": "if params.resume_quarantine_min < params.hard: raise ConfigError",
      "user_facing": "Trading will resume shortly after the exchange confirms it is fully operational."
    }
  ],
  "default_config": {
    "bot_id": "exec.exchangestatusmonitor",
    "version": "0.1.0",
    "mode": "shadow_only",
    "defaults": {
      "pause_on_status": [
        "degraded",
        "maintenance"
      ],
      "flatten_on_status": [
        "outage"
      ],
      "poll_interval_s": 15,
      "resume_quarantine_min": 5
    },
    "locked": {
      "poll_interval_s": {
        "max": 60
      },
      "resume_quarantine_min": {
        "min": 1
      }
    }
  },
  "implementation_flow": [
    "Every poll_interval_s: GET clob_public /health; record status_code and latency.",
    "If status_code != 200 or latency > 2000ms: increment consecutive_error_count.",
    "After 3 consecutive errors: set exchange_status=degraded.",
    "Optionally: fetch Polymarket public status page; parse for maintenance or outage keywords.",
    "Read reject_rate from OrderLifecycleManager metrics; if reject_rate > 10% over 60s: treat as degraded signal.",
    "Determine composite status: healthy, degraded, maintenance, or outage.",
    "If status in pause_on_status: emit ObservationReport(EXCHANGE_STATUS_PAUSE).",
    "If status in flatten_on_status: emit ObservationReport(EXCHANGE_STATUS_FLATTEN).",
    "When status returns to healthy: start resume_quarantine_min timer; emit EXCHANGE_STATUS_RESUMING.",
    "After quarantine completes: emit EXCHANGE_STATUS_HEALTHY; clear pause state."
  ],
  "decision_logic": {
    "approve": "Exchange healthy; no action \u2014 continue normal order submission.",
    "reshape_required": "Not applicable \u2014 ExchangeStatusMonitor is observation-only.",
    "reject": "Status in flatten_on_status: emit FLATTEN ObservationReport; all open orders should be cancelled.",
    "warning_only": "Consecutive error count rising but threshold not yet reached; WARN emitted."
  },
  "decision_output_schema": "ObservationReport",
  "decision_output_example": {
    "report_id": "rep_6f7a8b9c0d1e2f3a",
    "trace_id": "trc_5e6f7a8b9c0d1e2f",
    "bot_id": "exec.exchangestatusmonitor",
    "exchange_status": "degraded",
    "verdict": "EXCHANGE_STATUS_PAUSE",
    "consecutive_errors": 4,
    "reject_rate_pct": 15.2,
    "measured_at_ms": 1746770400000
  },
  "developer_log": {
    "exchange_status": "degraded",
    "consecutive_errors": 4,
    "reject_rate_pct": 15.2,
    "status_page_parsed": true,
    "status_page_result": "No active incident",
    "verdict": "EXCHANGE_STATUS_PAUSE",
    "quarantine_active": false
  },
  "user_explanations": [
    {
      "situation": "Exchange paused",
      "message": "Trading has been temporarily paused because the exchange is experiencing technical issues."
    },
    {
      "situation": "Orders cancelled \u2014 outage",
      "message": "Your open orders were cancelled because the exchange had an outage. You can re-enter when trading resumes."
    },
    {
      "situation": "Trading resuming after quarantine",
      "message": "The exchange has recovered. Trading will resume shortly after a brief verification period."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "Status-page parsing fails silently, causing ExchangeStatusMonitor to miss a scheduled maintenance window and continue submitting orders that will be rejected.",
    "false_positive_risk": "Brief network hiccup to the health endpoint counted as exchange degradation, pausing trading unnecessarily.",
    "false_negative_risk": "reject_rate threshold too high; exchange is degraded but local reject rate hasn't yet exceeded threshold, delaying pause.",
    "safe_fallback": "If health endpoint is unreachable for 3 or more consecutive polls, treat as degraded; emit EXCHANGE_STATUS_PAUSE conservatively.",
    "required_dependencies": [
      "clob_public /health endpoint",
      "reject-rate metrics from OrderLifecycleManager",
      "internal scheduler for poll triggers"
    ]
  },
  "acceptance_tests": {
    "unit": [
      {
        "test": "Pause emitted after 3 consecutive health check failures",
        "setup": "Inject 3 consecutive 503 responses from clob_public",
        "expected": "exchange_status=degraded; EXCHANGE_STATUS_PAUSE emitted"
      },
      {
        "test": "Flatten emitted on outage status",
        "setup": "Set exchange_status=outage",
        "expected": "EXCHANGE_STATUS_FLATTEN emitted"
      },
      {
        "test": "No pause on single health check failure",
        "setup": "Inject 1 503 response; next 2 succeed",
        "expected": "No pause; WARN only after 1st failure"
      }
    ],
    "integration": [
      {
        "test": "Quarantine: status recovers \u2192 quarantine timer starts \u2192 EXCHANGE_STATUS_HEALTHY after resume_quarantine_min",
        "expected": "EXCHANGE_STATUS_RESUMING emitted; EXCHANGE_STATUS_HEALTHY emitted after 5 min"
      },
      {
        "test": "reject_rate spike triggers degraded status",
        "expected": "reject_rate > 10% for 60s \u2192 exchange_status=degraded; EXCHANGE_STATUS_PAUSE emitted"
      }
    ],
    "property": [
      {
        "property": "EXCHANGE_STATUS_FLATTEN only emitted when status in flatten_on_status",
        "required": "Always true"
      },
      {
        "property": "After quarantine completes, at least resume_quarantine_min minutes have elapsed since last error",
        "required": "Always true"
      }
    ]
  },
  "checklist_overrides": {},
  "legacy_goal": "Treat Polymarket itself as a degradable dependency \u2014 pause or de-risk on signal.",
  "legacy_pm_signals": [
    "Heartbeat from CLOB V2 endpoints; status-page parsing",
    "Reject-rate spike across unrelated strategies",
    "Maintenance-window calendar from public announcements"
  ],
  "legacy_external_feeds": [
    "Polymarket public status page"
  ],
  "reporting_groups": [
    "execution"
  ],
  "network": [
    "polygon"
  ],
  "api_surface": [
    "clob_public",
    "internal"
  ],
  "version": {
    "spec": "2.0.0",
    "implementation": "0.1.0",
    "schema": "2",
    "released": null,
    "planned_release": "Q4-2026"
  },
  "migration_history": [
    {
      "date": "2026-04-28",
      "from": "n/a",
      "to": "v2-spec",
      "reason": "Spec drafted post-CLOB-V2 cutover; bot not yet implemented",
      "action_taken": "Designed against V2 schema (pUSD, builder codes, V2 EIP-712 domain)"
    }
  ],
  "polymarket_v2_compat": {
    "clob_version": "v2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": false,
    "negrisk_aware": false,
    "multichain_ready": false,
    "sdk_used": "py-clob-client-v2",
    "settlement_contract": "CTFExchangeV2",
    "notes": "ExchangeStatusMonitor polls CLOB V2 public endpoints only; it does not submit orders. It provides exchange health signals to other exec bots to coordinate pause/resume behaviour."
  },
  "reference_implementation": {
    "pseudocode": "STATE: consecutiveErrors = 0, exchangeStatus = 'healthy',\n       quarantineStartMs = None\n\nFUNCTION pollExchange():\n  t0 = now_ms()\n  resp = clob_public.GET('/health', timeout=2000)\n  latency = now_ms() - t0\n\n  IF resp IS NULL OR resp.status_code != 200 OR latency > 2000:\n    consecutiveErrors += 1\n  ELSE:\n    consecutiveErrors = 0\n\n  // Reject-rate check\n  rejectRate = FETCH metrics.reject_rate_60s()\n  IF rejectRate > 0.10:\n    consecutiveErrors = max(consecutiveErrors, 3)\n\n  // Status determination (remember the previous status to detect recovery)\n  prevStatus = exchangeStatus\n  IF consecutiveErrors >= 3:\n    IF statusPage.contains('outage'):\n      exchangeStatus = 'outage'\n    ELSE:\n      exchangeStatus = 'degraded'\n  ELIF statusPage.contains('maintenance'):\n    exchangeStatus = 'maintenance'\n  ELSE:\n    exchangeStatus = 'healthy'\n\n  // Emit ObservationReport\n  IF exchangeStatus IN params.flatten_on_status:\n    quarantineStartMs = None  // relapse cancels any running quarantine\n    EMIT ObservationReport(EXCHANGE_STATUS_FLATTEN)\n  ELIF exchangeStatus IN params.pause_on_status:\n    quarantineStartMs = None  // relapse cancels any running quarantine\n    EMIT ObservationReport(EXCHANGE_STATUS_PAUSE)\n  ELIF exchangeStatus == 'healthy' AND prevStatus != 'healthy':\n    // Just recovered: start the quarantine timer\n    quarantineStartMs = now_ms()\n    EMIT ObservationReport(EXCHANGE_STATUS_RESUMING)\n  ELIF exchangeStatus == 'healthy' AND quarantineStartMs IS NOT None:\n    IF now_ms() - quarantineStartMs >= params.resume_quarantine_min * 60000:\n      quarantineStartMs = None\n      EMIT ObservationReport(EXCHANGE_STATUS_HEALTHY)\n  // ELSE: steady healthy state with no quarantine pending; nothing to emit\n\nSCHEDULE pollExchange EVERY params.poll_interval_s",
    "sdk_calls": [
      "clob_public.GET('/health')",
      "statusPage.fetch('https://status.polymarket.com')"
    ],
    "complexity": "O(1) per poll cycle"
  },
  "wire_examples": {
    "input": [
      {
        "label": "Poll trigger + health response",
        "source": "internal scheduler + clob_public",
        "payload": {
          "poll_ts_ms": 1746770400000,
          "health_status_code": 503,
          "latency_ms": 2100,
          "consecutive_errors": 4
        }
      }
    ],
    "output": [
      {
        "label": "ObservationReport \u2014 EXCHANGE_STATUS_PAUSE",
        "payload": {
          "report_id": "rep_6f7a8b9c0d1e2f3a",
          "bot_id": "exec.exchangestatusmonitor",
          "exchange_status": "degraded",
          "verdict": "EXCHANGE_STATUS_PAUSE",
          "consecutive_errors": 4,
          "measured_at_ms": 1746770400000
        }
      }
    ]
  },
  "reason_codes": [
    {
      "code": "EXCHANGE_STATUS_HEALTHY",
      "severity": "INFO",
      "meaning": "Exchange is healthy; quarantine completed; order submission permitted.",
      "action": "Clear pause state; resume normal operations.",
      "user_message": ""
    },
    {
      "code": "EXCHANGE_STATUS_PAUSE",
      "severity": "WARN",
      "meaning": "Exchange is degraded or in maintenance; order submission paused.",
      "action": "Emit ObservationReport; exec bots suspend new order submissions.",
      "user_message": "Trading is paused because the exchange is temporarily unavailable."
    },
    {
      "code": "EXCHANGE_STATUS_FLATTEN",
      "severity": "HARD_REJECT",
      "meaning": "Exchange outage confirmed; all open orders should be cancelled.",
      "action": "Emit ObservationReport; trigger mass cancel.",
      "user_message": "Your orders were cancelled due to an exchange outage."
    },
    {
      "code": "EXCHANGE_STATUS_RESUMING",
      "severity": "INFO",
      "meaning": "Exchange has recovered; quarantine period started.",
      "action": "Start resume_quarantine_min timer; do not resume submissions yet.",
      "user_message": "The exchange has recovered. Trading will resume shortly."
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "polytraders_exec_exchangestatusmonitor_status",
        "type": "gauge",
        "unit": "enum",
        "labels": [
          "status"
        ],
        "meaning": "Current exchange status (healthy=1, degraded=2, maintenance=3, outage=4)."
      },
      {
        "name": "polytraders_exec_exchangestatusmonitor_consecutive_errors",
        "type": "gauge",
        "unit": "count",
        "labels": [],
        "meaning": "Current consecutive health check error count."
      },
      {
        "name": "polytraders_exec_exchangestatusmonitor_pause_events_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "verdict"
        ],
        "meaning": "Total pause/flatten events emitted by verdict."
      }
    ],
    "alerts": [
      {
        "name": "ESMExchangePaused",
        "condition": "polytraders_exec_exchangestatusmonitor_status > 1",
        "severity": "P1",
        "runbook": "#runbook-esm-exchange-paused"
      },
      {
        "name": "ESMHighConsecutiveErrors",
        "condition": "polytraders_exec_exchangestatusmonitor_consecutive_errors >= 3",
        "severity": "P2",
        "runbook": "#runbook-esm-consecutive-errors"
      }
    ]
  },
  "state": {
    "store": "in-memory + Redis for cross-instance coordination",
    "shape": "exchangeStatus (string), consecutiveErrors (int), quarantineStartMs (int|null)",
    "ttl": "Status persisted for 1h; clears on full healthy state restoration",
    "recovery": "On restart, re-poll health immediately; treat cold start as 0 consecutive errors.",
    "size_estimate": "~200 bytes for status state"
  },
  "concurrency": {
    "execution_model": "single-instance scheduled poller",
    "max_in_flight": 1,
    "idempotency_key": "poll_trigger_ts_ms",
    "timeout_ms": 2000,
    "backpressure": "Drop poll if previous poll still in flight",
    "locking": "single-writer: only ExchangeStatusMonitor writes to exchangeStatus store"
  },
  "dependencies": {
    "depends_on": [
      {
        "bot_id": "exec.orderlifecyclemanager",
        "why": "Provides reject-rate metrics as a secondary degradation signal.",
        "contract": "reject_rate_60s metric published by OrderLifecycleManager."
      }
    ],
    "emits_to": [
      {
        "bot_id": "exec.orderlifecyclemanager",
        "why": "EXCHANGE_STATUS_PAUSE/FLATTEN ObservationReports consumed to suspend order submission.",
        "contract": "All exec bots subscribe to exchange status ObservationReports."
      }
    ],
    "sibling": [],
    "external": [
      {
        "service": "CLOB V2 public API",
        "endpoint": "https://clob.polymarket.com/health",
        "sla": "best-effort (health endpoint)",
        "failure_mode": "Unreachable counts as consecutive error; 3 consecutive = degraded."
      },
      {
        "service": "Polymarket status page",
        "endpoint": "https://status.polymarket.com",
        "sla": "best-effort",
        "failure_mode": "If unreachable, status_page_parsed=false; rely on health endpoint only."
      }
    ]
  },
  "security_surfaces": {
    "signs_orders": false,
    "private_key_access": "none",
    "abuse_vectors": [
      "Injecting fake health endpoint responses to trigger spurious exchange-pause events",
      "Flooding status-page parser with malformed HTML to suppress maintenance detection"
    ],
    "mitigations": [
      "Health endpoint responses validated against expected schema; unexpected payloads treated as errors",
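      "Status-page parsing uses keyword matching with a known-safe whitelist; malformed pages are flagged as parse failures (status_page_parsed=false) and the monitor relies on the health endpoint only, rather than silently treating them as 'no incident'"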
    ]
  },
  "failure_injection": [
    {
      "scenario": "CLOB_HEALTH_ENDPOINT_DOWN",
      "how_to_inject": "Block TCP to clob_public /health for 3 poll cycles",
      "expected_behaviour": "consecutiveErrors=3; exchange_status=degraded; EXCHANGE_STATUS_PAUSE emitted",
      "recovery": "Health endpoint restored; consecutiveErrors=0; quarantine starts; EXCHANGE_STATUS_HEALTHY after resume_quarantine_min"
    },
    {
      "scenario": "REJECT_RATE_SPIKE",
      "how_to_inject": "Inject 15% reject rate into OrderLifecycleManager metrics for 60s",
      "expected_behaviour": "consecutiveErrors bumped to >=3; EXCHANGE_STATUS_PAUSE emitted",
      "recovery": "Reject rate normalises; quarantine starts"
    },
    {
      "scenario": "STATUS_PAGE_MAINTENANCE_WINDOW",
      "how_to_inject": "Inject 'scheduled maintenance' keyword into status page mock",
      "expected_behaviour": "exchange_status=maintenance; EXCHANGE_STATUS_PAUSE emitted; no flatten triggered",
      "recovery": "Maintenance keyword removed from status page; status returns healthy after quarantine"
    }
  ],
  "runbook": {
    "summary": "ExchangeStatusMonitor incidents require checking the Polymarket status page and CLOB health endpoint. Never manually lift a pause without confirming exchange health.",
    "oncall_actions": [
      {
        "alert": "ESMExchangePaused",
        "first_step": "Check https://status.polymarket.com and CLOB /health directly. If false positive (exchange is healthy), unflag manually after confirming.",
        "diagnosis": "",
        "mitigation": "",
        "escalation": "Exec pod lead + Infra"
      },
      {
        "alert": "ESMHighConsecutiveErrors",
        "first_step": "Check network connectivity to clob.polymarket.com; check CLOB latency dashboard.",
        "diagnosis": "",
        "mitigation": "",
        "escalation": "Infra on-call if connectivity issue"
      }
    ],
    "manual_overrides": [
      {
        "name": "force_resume",
        "how": "polytraders bot force-resume exec.exchangestatusmonitor",
        "when": "Exchange is confirmed healthy but quarantine has not yet expired; use only with Exec pod lead approval.",
        "command": "polytraders bot force-resume exec.exchangestatusmonitor",
        "effect": "Skips the remaining resume_quarantine_min wait: the pause state is cleared and EXCHANGE_STATUS_HEALTHY is emitted so exec bots can resume order submission immediately."
      }
    ],
    "healthcheck": "GET /internal/health/exchangestatusmonitor -> 200 if exchange_status=healthy, consecutive_errors=0, quarantine_active=false, polling running. Red: exchange_status in (degraded, outage), consecutive_errors >= 3, polling interval missed."
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "Pause-on-3-consecutive-errors unit test passes",
        "how_measured": "CI test run",
        "threshold": "100% pass"
      }
    ],
    "to_limited_live": [
      {
        "gate": "Zero false-positive pause events over 48h shadow run",
        "how_measured": "Cross-reference pause events with Polymarket incident log",
        "threshold": "Zero false positives"
      }
    ],
    "to_general_live": [
      {
        "gate": "Exchange degradation detected within 3 poll cycles during a real incident",
        "how_measured": "Post-incident review: compare EXCHANGE_STATUS_PAUSE timestamp to CLOB incident start time",
        "threshold": "Detection within 3 \u00d7 poll_interval_s"
      }
    ]
  },
  "reporting": {
    "emits_kinds": [
      "ObservationReport"
    ],
    "topics": [
      "polytraders.reports.observation"
    ],
    "partition_key": "trace_id",
    "cadence": "every-event",
    "retention_class": "30d",
    "sampling_rule": "emit-every",
    "bus_failure_action": "drop-after-buffer",
    "user_visible": "summary-only",
    "consumes_kinds": []
  },
  "capital_impact": "Direct",
  "mode_support": [
    "quarantine"
  ],
  "v3_status": {
    "phase": 5,
    "phase_name": "Execution rails",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}