{
  "schema_version": "1.0.0",
  "bot_id": "6.17",
  "bot_name": "APIDegradationMonitor",
  "slug": "api_degradation_monitor",
  "layer": "Governance",
  "layer_key": "gov",
  "bot_class": "Governance Service",
  "authority": [
    "Observe"
  ],
  "status": "planned",
  "readiness": "Spec ready",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Governance",
    "bot_class": "Governance",
    "authority": "Observe",
    "runs_before": "risk.killswitch, exec.smart_router",
    "runs_after": "\u2014",
    "applies_to": "Continuous",
    "default_mode": "shadow",
    "user_visible": "Yes",
    "developer_owner": "Governance pod"
  },
  "purpose": "Watches every external API surface Polytraders depends on (CLOB v2 REST, CLOB WebSocket, Polymarket metadata REST, Ethereum RPC, builder fee oracle) and publishes a per-surface health envelope (latency p50/p99, error rate, last_success_ts_ms). Risk and Strategy bots consume this envelope to decide whether to operate normally, degrade, or pause.",
  "why_it_matters": [
    {
      "failure": "Cascading failures from a single dead dependency",
      "consequence": "Without an explicit health signal, every bot infers liveness from its own latest call \u2014 producing inconsistent retreat behaviour across the system."
    },
    {
      "failure": "Silent degradations",
      "consequence": "An API can stay up but slow to 30-second responses; bots without an explicit threshold keep blocking on it instead of failing fast."
    },
    {
      "failure": "Postmortem confusion",
      "consequence": "Without a health timeline, postmortems cannot answer 'what was the actual external latency at 14:23?'."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "CLOB REST + WebSocket",
      "source": "Polymarket",
      "required": true,
      "use": "Probe latency and error rates."
    },
    {
      "input": "Polymarket metadata REST",
      "source": "Polymarket",
      "required": true,
      "use": "Health probe."
    },
    {
      "input": "Ethereum RPC",
      "source": "RPC provider",
      "required": true,
      "use": "Latency + block-tip lag probe."
    }
  ],
  "internal_inputs": [
    {
      "input": "Real outbound traffic latency samples",
      "source": "Every bot",
      "required": true,
      "use": "Passive observation in addition to active probes."
    }
  ],
  "raw_params": [
    "probe_interval_ms \u00b7 1000\u201360000",
    "warn_p99_ms \u00b7 100\u201310000",
    "fail_p99_ms \u00b7 100\u201360000",
    "fail_error_rate_pct \u00b7 1\u2013100"
  ],
  "parameters": [
    {
      "name": "probe_interval_ms",
      "default": 5000,
      "warning": "\u2014",
      "hard": "\u2014",
      "controls": "How often each surface is actively probed.",
      "why_default_matters": "5s gives quick detection without flooding upstreams.",
      "threshold_logic": [
        {
          "condition": "5000",
          "action": "Default"
        }
      ],
      "dev_check": "schedule.every(p.probe_interval_ms).do(probe);",
      "user_facing": "(Internal.)"
    },
    {
      "name": "warn_p99_ms",
      "default": 750,
      "warning": 750,
      "hard": "\u2014",
      "controls": "p99 latency at which the surface is marked DEGRADED.",
      "why_default_matters": "750ms p99 is the empirical breakpoint where downstream pipelines start to tail out.",
      "threshold_logic": [
        {
          "condition": "\u2264 750ms",
          "action": "OK"
        },
        {
          "condition": "> 750ms",
          "action": "DEGRADED"
        }
      ],
      "dev_check": "if (p99 > p.warn_p99_ms) status = 'DEGRADED';",
      "user_facing": "(Internal.)"
    },
    {
      "name": "fail_p99_ms",
      "default": 5000,
      "warning": "\u2014",
      "hard": 5000,
      "controls": "p99 latency at which the surface is marked DOWN.",
      "why_default_matters": "A p99 above 5s means at least 1% of calls take longer than 5s, so tail requests are effectively timing out.",
      "threshold_logic": [
        {
          "condition": "\u2264 5000ms",
          "action": "Better than DOWN"
        },
        {
          "condition": "> 5000ms",
          "action": "DOWN"
        }
      ],
      "dev_check": "if (p99 > p.fail_p99_ms) status = 'DOWN';",
      "user_facing": "(Internal.)"
    },
    {
      "name": "fail_error_rate_pct",
      "default": 25,
      "warning": 10,
      "hard": 25,
      "controls": "Error rate at which the surface is marked DOWN regardless of latency.",
      "why_default_matters": "25% errors over a 1-minute window is an obvious outage.",
      "threshold_logic": [
        {
          "condition": "< 10%",
          "action": "OK"
        },
        {
          "condition": "10\u201325%",
          "action": "DEGRADED"
        },
        {
          "condition": "> 25%",
          "action": "DOWN"
        }
      ],
      "dev_check": "if (errRate > p.fail_error_rate_pct) status = 'DOWN';",
      "user_facing": "(Internal.)"
    }
  ],
  "default_config": {
    "probe_interval_ms": 5000,
    "warn_p99_ms": 750,
    "fail_p99_ms": 5000,
    "fail_error_rate_pct": 25
  },
  "flow": "Run scheduled probes per surface \u2192 aggregate active + passive samples in a 1-minute rolling window \u2192 compute p50/p99 + error_rate \u2192 classify status (OK/DEGRADED/DOWN) \u2192 emit ApiHealthReport for every surface every probe interval.",
  "decision_logic": {
    "approve": "Sample active probes + passive traffic. Latch DOWN status until two consecutive OK windows.",
    "reshape_required": "This bot does not reshape orders.",
    "reject": "No reject path defined for this bot \u2014 it is observe-only.",
    "warning_only": "Apply warn/fail thresholds."
  },
  "decision_output_example": {
    "kind": "ApiHealthReport",
    "surface": "clob_v2_rest",
    "status": "DEGRADED",
    "p50_ms": 220,
    "p99_ms": 980,
    "error_rate_pct": 4.1,
    "last_success_ts_ms": 1715260000000
  },
  "developer_log": "Per emission: surface, status, p50, p99, error_rate, sample_count.",
  "user_explanations": [
    {
      "situation": "When this bot acts",
      "message": "The system briefly slowed down because one of the data sources we depend on was responding slowly."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "Calling a surface DOWN when only the active probe is failing but real traffic is fine (or vice versa).",
    "false_positive_risk": "Active probe hits an old endpoint not used in production; mitigation: probes mirror real traffic shape.",
    "false_negative_risk": "Surface only fails on writes; passive read samples mask the issue; mitigation: write-side probes count separately.",
    "safe_fallback": "If the monitor itself fails, emit a synthetic ApiHealthReport with status=UNKNOWN and a non-stale ts_ms. Consumers must treat UNKNOWN as DEGRADED."
  },
  "acceptance_tests": {
    "unit": [
      {
        "test": "p99 = warn_p99_ms + 1 \u2192 DEGRADED.",
        "setup": "Synthetic fixture per template.",
        "expected": "Behaviour matches the rule described in the test name."
      },
      {
        "test": "Error rate = fail_error_rate_pct + 1 \u2192 DOWN.",
        "setup": "Synthetic fixture per template.",
        "expected": "Behaviour matches the rule described in the test name."
      }
    ],
    "integration": [
      {
        "test": "Inject a slow-loris response on the clob_v2_rest probe \u2192 status flips to DEGRADED within 2 probe intervals.",
        "expected": "End-to-end behaviour matches the spec without manual intervention."
      }
    ],
    "property": [
      {
        "property": "Within a single window, status only moves between adjacent states (OK \u2194 DEGRADED \u2194 DOWN); no state is skipped.",
        "required": "Always true across all generated inputs."
      }
    ]
  },
  "reference_implementation": {
    "language": "pseudocode",
    "pseudocode": "for each surface s:\n  samples = window(s, 60_000)\n  p50, p99 = quantiles(samples)\n  err = error_rate(samples)\n  status = classify(p99, err, p)\n  emit('ApiHealthReport', s, status, p50, p99, err, last_success_ts_ms[s])"
  },
  "wire_examples": {
    "input": {
      "surface": "clob_v2_rest",
      "samples": [
        {
          "ts_ms": 1715260000000,
          "latency_ms": 220,
          "ok": true
        }
      ]
    },
    "output": {
      "kind": "ApiHealthReport",
      "surface": "clob_v2_rest",
      "status": "OK",
      "p50_ms": 220,
      "p99_ms": 220,
      "error_rate_pct": 0
    }
  },
  "reason_codes": [
    {
      "code": "GOV_API_OK",
      "severity": "P3",
      "meaning": "Surface is healthy: p99 latency and error rate are both within their warning thresholds.",
      "action": "See decision output and developer log for context.",
      "user_message": "The system briefly slowed down because one of the data sources we depend on was responding slowly."
    },
    {
      "code": "GOV_API_DEGRADED",
      "severity": "P3",
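      "meaning": "Surface is degraded: p99 latency exceeds warn_p99_ms or the error rate is in the warning band, but no DOWN threshold is breached.",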
      "action": "See decision output and developer log for context.",
      "user_message": "The system briefly slowed down because one of the data sources we depend on was responding slowly."
    },
    {
      "code": "GOV_API_DOWN",
      "severity": "P3",
      "meaning": "Surface is down: p99 latency exceeds fail_p99_ms or the error rate exceeds fail_error_rate_pct.",
      "action": "See decision output and developer log for context.",
      "user_message": "The system briefly slowed down because one of the data sources we depend on was responding slowly."
    },
    {
      "code": "GOV_API_UNKNOWN",
      "severity": "P3",
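      "meaning": "Surface health cannot be determined because the monitor failed or its data is stale; consumers must treat this as DEGRADED.",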
      "action": "See decision output and developer log for context.",
      "user_message": "The system briefly slowed down because one of the data sources we depend on was responding slowly."
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "api_p50_ms",
        "type": "histogram",
        "unit": "ms",
        "labels": [
          "bot_id"
        ],
        "meaning": "Median (p50) latency of a monitored API surface over the 1-minute rolling window, in milliseconds."
      },
      {
        "name": "api_p99_ms",
        "type": "histogram",
        "unit": "ms",
        "labels": [
          "bot_id"
        ],
        "meaning": "99th-percentile (p99) latency of a monitored API surface over the 1-minute rolling window, in milliseconds."
      },
      {
        "name": "api_error_rate_pct",
        "type": "gauge",
        "unit": "value",
        "labels": [
          "bot_id"
        ],
        "meaning": "Percentage of failed samples for a monitored API surface over the 1-minute rolling window."
      },
      {
        "name": "api_status_changes_total",
        "type": "counter",
        "unit": "event",
        "labels": [
          "bot_id"
        ],
        "meaning": "Count of status transitions (for example OK to DEGRADED) across monitored API surfaces."
      }
    ],
    "alerts": [],
    "dashboards": [
      "6.17 overview dashboard"
    ]
  },
  "state": {
    "summary": "Per-surface rolling sample buffer + last status. Held in memory with a fast KV mirror; rehydrated from the KV mirror on restart.",
    "stores": [
      {
        "name": "api_degradation_monitor_state",
        "kind": "in-memory + fast KV mirror",
        "key": "bot_id",
        "value": "Per-surface rolling sample buffer + last status. Held in memory with a fast KV mirror; rehydrated from the KV mirror on restart.",
        "ttl": "24h",
        "durability": "crash-safe via KV mirror"
      }
    ],
    "recovery": "Cold-start hydrates from fast KV; missing keys default to safe fallback.",
    "on_restart": "All in-flight decisions are re-evaluated; no bot decision is trusted across restart without re-emit."
  },
  "concurrency": {
    "execution_model": "One worker per surface; emits to a single status feed.",
    "max_in_flight": 32,
    "idempotency_key": "order_intent_id",
    "replay_safe": true,
    "deduplication": "By idempotency_key within a 60s window.",
    "ordering_guarantees": "Per-surface FIFO; cross-surface unordered.",
    "timeout_ms": 250,
    "backpressure": "Bounded queue; oldest-dropped with metric increment when full.",
    "locking": "Per-surface mutex; no global locks."
  },
  "dependencies": {
    "depends_on": [],
    "emits_to": [
      "risk.killswitch",
      "exec.smart_router"
    ]
  },
  "graph": {
    "requires": [],
    "required_before": [
      "risk.killswitch",
      "exec.smart_router"
    ],
    "consumes": [
      "ProbeSample",
      "TrafficSample"
    ],
    "emits": [
      "OperationsReport(kind=ApiHealthReport)"
    ],
    "blocks": false
  },
  "mode_support": [
    "off",
    "shadow",
    "advisory",
    "enforced"
  ],
  "latency_budget_ms": {
    "p50": 50,
    "p99": 250
  },
  "data_freshness": {
    "max_market_data_age_ms": 10000,
    "max_orderbook_age_ms": 10000,
    "max_external_feed_age_ms": 10000,
    "on_stale_data": "Emit status=UNKNOWN \u2014 never silently report OK."
  },
  "ownership": {
    "owner": "Governance pod",
    "on_call": "gov-oncall",
    "channel": "#polytraders-gov",
    "escalation": "Head of Governance",
    "severity_class": "P1"
  },
  "human_override": {
    "allowed": false,
    "who": "\u2014",
    "log_event": "\u2014",
    "time_bound": "\u2014",
    "scope": "\u2014",
    "second_approval": false
  },
  "security_surfaces": {
    "summary": "Probe credentials are read-only API keys with no order-placement scope.",
    "signing": "None \u2014 bot does not sign or submit.",
    "secrets": [],
    "contract_calls": [],
    "abuse_vectors": [],
    "mitigations": [
      "Rate-limit per source",
      "Audit-log every override",
      "Require role-based authz on admin paths"
    ]
  },
  "polymarket_v2_compat": {
    "clob_version": "V2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": true,
    "negrisk_aware": true,
    "multichain_ready": true,
    "sdk_used": "Polymarket CLOB V2 SDK",
    "settlement_contract": "CTFExchangeV2",
    "notes": "Surface 'clob_v2_rest' specifically targets V2 endpoints."
  },
  "version": {
    "current": "0.1.0",
    "contract_version": "1.0.0",
    "last_breaking_change": "none",
    "deprecation_window_days": 30
  },
  "migration_history": [],
  "runbook": {
    "summary": "If a surface is stuck DEGRADED with no obvious cause, increase probe_interval_ms temporarily and inspect upstream provider's status page.",
    "oncall_actions": [
      {
        "alert": "6.17_anomaly",
        "first_step": "Open the bot's reporting page and confirm the alert is real (not a metric hiccup).",
        "diagnosis": "Inspect developer log entries for the affected surface over the last 30 minutes.",
        "mitigation": "Force-clear via Admin UI if the rule is clearly stale; otherwise leave engaged and notify owner.",
        "escalation": "Governance pod"
      }
    ],
    "manual_overrides": [
      {
        "command": "polytraders bot pause 6.17",
        "effect": "Disables the bot's enforcement layer; downstream consumers fall back to safe defaults."
      }
    ],
    "healthcheck": "GET /healthz/api_degradation_monitor \u2192 200 if last successful evaluation < 60s ago."
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "Stub",
        "how_measured": "probe-suite passes against synthetic surfaces.",
        "threshold": "Documented threshold met for the full window."
      }
    ],
    "to_limited_live": [
      {
        "gate": "Shadow",
        "how_measured": "14 days; status feed compared with the upstream's own status page.",
        "threshold": "Documented threshold met for the full window."
      },
      {
        "gate": "Advisory",
        "how_measured": "7 days.",
        "threshold": "Documented threshold met for the full window."
      }
    ],
    "to_general_live": [
      {
        "gate": "Enforced",
        "how_measured": "KillSwitch and SmartRouter consume the feed.",
        "threshold": "Documented threshold met for the full window."
      }
    ]
  },
  "failure_injection": [
    {
      "scenario": "Drop probe responses for 60s and assert status flips DOWN",
      "how_to_inject": "Drop probe responses for 60s and assert status flips DOWN.",
      "expected_behavior": "Bot detects within its latency budget and emits the corresponding reason code.",
      "recovery": "Remove the injected fault; bot returns to healthy state within one debounce window."
    },
    {
      "scenario": "Disconnect the probe scheduler and assert UNKNOWN is emitted within one probe interval",
      "how_to_inject": "Disconnect the probe scheduler and assert UNKNOWN is emitted within one probe interval.",
      "expected_behavior": "Bot detects within its latency budget and emits the corresponding reason code.",
      "recovery": "Remove the injected fault; bot returns to healthy state within one debounce window."
    }
  ],
  "capital_impact": "Indirect",
  "v3_status": {
    "phase": 2,
    "phase_name": "Data normalisation",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}