{
  "schema_version": "1.0.0",
  "bot_id": "5.6",
  "bot_name": "RPCFailoverManager",
  "slug": "rpcfailovermanager",
  "layer": "Security",
  "layer_key": "sec",
  "bot_class": "Guardrail",
  "authority": [
    "Reject",
    "Pause"
  ],
  "status": "planned",
  "readiness": "Spec started",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Security",
    "bot_class": "Guardrail",
    "authority": "Reject, Pause",
    "runs_before": "Any bot that makes on-chain read calls",
    "runs_after": "System startup; continuous background probe",
    "applies_to": "All Polygon RPC endpoints in the configured provider pool",
    "default_mode": "shadow_only",
    "user_visible": "Advanced details only",
    "developer_owner": "Polytraders core"
  },
  "purpose": "Probe RPC providers continuously and fail over before a stale endpoint poisons our chain view.",
  "why_it_matters": [
    {
      "failure": "Single RPC endpoint goes stale",
      "consequence": "All bots reading chain state see an outdated block, causing mispriced or incorrectly-scoped orders."
    },
    {
      "failure": "No quorum check across providers",
      "consequence": "A forked or malicious RPC can poison chain state views used for contract address and balance checks."
    },
    {
      "failure": "No auto-quarantine of degraded provider",
      "consequence": "A slow or error-prone endpoint keeps being polled, adding latency to every on-chain check."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "eth_blockNumber from each configured RPC provider",
      "source": "onchain",
      "required": true,
      "use": "Measure block height divergence across providers to detect stale endpoints."
    }
  ],
  "internal_inputs": [
    {
      "input": "Configured RPC provider pool and probe_interval_s",
      "source": "Admin UI",
      "required": true,
      "use": "Pool of providers to probe and failover thresholds."
    },
    {
      "input": "KillSwitch active flag",
      "source": "KillSwitch",
      "required": true,
      "use": "Halt all chain reads during global pause."
    }
  ],
  "raw_params": [
    "max_block_lag \u00b7 int",
    "min_providers_quorum \u00b7 int",
    "auto_quarantine \u00b7 bool",
    "probe_interval_s \u00b7 int"
  ],
  "parameters": [
    {
      "name": "max_block_lag",
      "default": 3,
      "warning": "Block lag >= 2 for any provider",
      "hard": "Block lag >= max_block_lag for primary provider",
      "controls": "Maximum tolerated block height difference before a provider is quarantined.",
      "why_default_matters": "3-block lag on Polygon (~6s) is the threshold where stale data becomes operationally dangerous.",
      "threshold_logic": [
        {
          "condition": "lag < max_block_lag",
          "action": "APPROVE \u2014 provider healthy"
        },
        {
          "condition": "lag >= max_block_lag AND auto_quarantine=true",
          "action": "Quarantine provider; failover to next in pool"
        },
        {
          "condition": "lag >= max_block_lag AND no healthy provider",
          "action": "REJECT \u2014 RPC_QUORUM_LOST"
        }
      ],
      "dev_check": "if (lag >= p.max_block_lag && p.auto_quarantine) quarantine(provider);",
      "user_facing": "The network connection is degraded. Orders are paused until connectivity is restored."
    },
    {
      "name": "min_providers_quorum",
      "default": 2,
      "warning": "Only min_providers_quorum providers healthy",
      "hard": "Fewer than min_providers_quorum healthy providers available",
      "controls": "Minimum number of healthy providers required before any chain read is trusted.",
      "why_default_matters": "Quorum of 2 prevents a single compromised provider from poisoning chain state.",
      "threshold_logic": [
        {
          "condition": "healthy_count >= min_providers_quorum",
          "action": "APPROVE"
        },
        {
          "condition": "healthy_count < min_providers_quorum",
          "action": "REJECT \u2014 RPC_QUORUM_LOST"
        }
      ],
      "dev_check": "if (healthyProviders.length < p.min_providers_quorum) return reject('RPC_QUORUM_LOST');",
      "user_facing": "Not enough network providers are available. Orders are paused."
    }
  ],
  "default_config": {
    "bot_id": "sec.rpc_failover_manager",
    "version": "0.1.0",
    "mode": "hard_guard",
    "defaults": {
      "max_block_lag": 3,
      "min_providers_quorum": 2,
      "auto_quarantine": true,
      "probe_interval_s": 5
    }
  },
  "implementation_flow": [
    "On startup: load provider pool from Admin UI config.",
    "Background loop every probe_interval_s: call eth_blockNumber on all providers.",
    "Compute block height divergence across providers.",
    "For providers with lag >= max_block_lag and auto_quarantine=true: mark quarantined.",
    "Check healthy provider count; if < min_providers_quorum: REJECT(RPC_QUORUM_LOST) on all pending chain reads.",
    "Elect primary provider as the one with highest block height and lowest latency.",
    "On order arrival: return current primary provider endpoint for chain reads.",
    "Periodically re-probe quarantined providers; restore if lag normalises."
  ],
  "decision_logic": {
    "approve": "At least min_providers_quorum healthy providers with lag < max_block_lag; primary elected.",
    "reshape_required": "Not applicable \u2014 manager either provides a healthy endpoint or rejects.",
    "reject": "Fewer than min_providers_quorum healthy providers (RPC_QUORUM_LOST).",
    "warning_only": "Warn when only min_providers_quorum providers remain healthy."
  },
  "decision_output_schema": "RiskVote",
  "decision_output_example": {
    "vote_id": "sec.rpc_failover_manager.20260509T170000Z",
    "decision": "APPROVE",
    "reason_code": null,
    "evidence": {
      "primary_provider": "alchemy-polygon-1",
      "healthy_count": 3,
      "quarantined_count": 0,
      "max_lag_blocks": 1
    },
    "checked_at": "2026-05-09T17:00:00Z"
  },
  "developer_log": {
    "bot_id": "sec.rpc_failover_manager",
    "decision": "APPROVE",
    "inputs_used": [
      "onchain.eth_blockNumber",
      "config.provider_pool"
    ],
    "checked_at": "2026-05-09T17:00:00Z"
  },
  "user_explanations": [
    {
      "situation": "Orders paused \u2014 RPC quorum lost",
      "message": "The network connection is degraded. Orders are paused until connectivity is restored."
    },
    {
      "situation": "Provider failover",
      "message": "The primary network provider was switched automatically. No action needed."
    },
    {
      "situation": "Provider quarantined",
      "message": "One of the network providers was temporarily taken offline. Others are being used."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "All providers simultaneously stale, causing chain state blindness for all on-chain checks.",
    "false_positive_risk": "A brief network hiccup quarantines providers unnecessarily, causing trading pause until re-probe succeeds.",
    "false_negative_risk": "Two providers both stale at the same height pass the quorum check but provide wrong data.",
    "safe_fallback": "If fewer than min_providers_quorum healthy providers: fail-closed on all chain reads; emit RPC_QUORUM_LOST.",
    "required_dependencies": [
      "Configured RPC provider pool (at least 3 recommended)",
      "Admin UI config",
      "KillSwitch"
    ]
  },
  "acceptance_tests": {
    "unit": [
      {
        "test": "Approve when primary provider has lag < max_block_lag",
        "setup": "provider lag=1, max_block_lag=3",
        "expected": "APPROVE; primary returned"
      },
      {
        "test": "Quarantine provider with lag >= max_block_lag",
        "setup": "provider lag=4, max_block_lag=3, auto_quarantine=true",
        "expected": "Provider quarantined; failover to secondary"
      },
      {
        "test": "Reject when healthy providers < min_providers_quorum",
        "setup": "only 1 healthy provider, min_providers_quorum=2",
        "expected": "DENY(RPC_QUORUM_LOST)"
      }
    ],
    "integration": [
      {
        "test": "Quarantined provider restored after lag normalises",
        "expected": "Provider un-quarantined on next probe with lag < max_block_lag"
      },
      {
        "test": "KillSwitch halts all probes and rejects chain reads",
        "expected": "DENY(KILL_SWITCH_ACTIVE) on all reads"
      }
    ],
    "property": [
      {
        "property": "healthy_count < min_providers_quorum always produces DENY",
        "required": "Always true"
      },
      {
        "property": "Primary provider always has lowest lag among healthy set",
        "required": "Always true"
      }
    ]
  },
  "checklist_overrides": {},
  "legacy_goal": "Probe RPC providers continuously and fail over before a stale endpoint poisons our chain view.",
  "legacy_pm_signals": [
    "Block-height divergence across providers",
    "Latency, error-rate, and quota state per RPC",
    "Failover events and quarantine status"
  ],
  "legacy_external_feeds": [
    "Configured RPC pool"
  ],
  "reporting_groups": [
    "risk_compliance",
    "governance_audit"
  ],
  "network": [
    "polygon"
  ],
  "api_surface": [
    "onchain",
    "internal"
  ],
  "version": {
    "spec": "2.0.0",
    "implementation": "0.1.0",
    "schema": "2",
    "released": null,
    "planned_release": "Q3-2026"
  },
  "migration_history": [
    {
      "date": "2026-04-28",
      "from": "n/a",
      "to": "v2-spec",
      "reason": "Spec drafted post-CLOB-V2 cutover; bot not yet implemented",
      "action_taken": "Designed against V2 schema (pUSD, builder codes, V2 EIP-712 domain)"
    }
  ],
  "polymarket_v2_compat": {
    "clob_version": "v2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": false,
    "negrisk_aware": false,
    "multichain_ready": false,
    "sdk_used": "py-clob-client-v2",
    "settlement_contract": "CTFExchangeV2",
    "notes": "Manages Polygon RPC provider health to ensure CTFExchangeV2 and pUSD contract reads are from a fresh block."
  },
  "reference_implementation": {
    "pseudocode": "// RPCFailoverManager\nSTATE = { providers: [], primary: null, quarantined: [] }\n\n// Background probe loop\nEVERY params.probe_interval_s:\n  heights = []\n  FOR provider IN STATE.providers:\n    h = FETCH(provider.eth_blockNumber())\n    IF h == null: quarantine(provider); CONTINUE\n    heights.append({provider, h, latency})\n  max_h = MAX(heights.map(x => x.h))\n  FOR entry IN heights:\n    lag = max_h - entry.h\n    IF lag >= params.max_block_lag AND params.auto_quarantine:\n      quarantine(entry.provider)\n  healthy = heights.filter(x => (max_h - x.h) < params.max_block_lag)\n  IF healthy.count < params.min_providers_quorum:\n    EMIT RiskVote(DENY, RPC_QUORUM_LOST)\n  ELSE:\n    STATE.primary = healthy.sort_by(lag, latency).first()\n    EMIT RiskVote(APPROVE)\n\n// Per-request provider lookup\nFUNCTION getPrimaryProvider():\n  IF STATE.primary == null: return DENY(RPC_QUORUM_LOST)\n  RETURN STATE.primary",
    "sdk_calls": [
      "provider.eth_blockNumber()",
      "internal.killswitch.status()"
    ],
    "complexity": "O(p) per probe where p = provider pool size (small constant)"
  },
  "wire_examples": {
    "input": [
      {
        "label": "Probe result from provider pool",
        "source": "onchain",
        "payload": {
          "provider": "alchemy-polygon-1",
          "block_number": 58420100,
          "latency_ms": 45,
          "timestamp_ms": 1778346000000
        }
      }
    ],
    "output": [
      {
        "label": "RiskVote \u2014 APPROVE with primary",
        "payload": {
          "vote_id": "sec.rpc_failover_manager.20260509T170000Z",
          "decision": "APPROVE",
          "reason_code": null,
          "evidence": {
            "primary_provider": "alchemy-polygon-1",
            "healthy_count": 3,
            "max_lag_blocks": 1
          },
          "checked_at": "2026-05-09T17:00:00Z"
        }
      }
    ]
  },
  "reason_codes": [
    {
      "code": "KILL_SWITCH_ACTIVE",
      "severity": "HARD_REJECT",
      "meaning": "Global kill switch is active.",
      "action": "Immediately return DENY.",
      "user_message": "Trading is currently paused."
    },
    {
      "code": "RPC_QUORUM_LOST",
      "severity": "HARD_REJECT",
      "meaning": "Fewer than min_providers_quorum healthy RPC providers available.",
      "action": "Return DENY on all chain reads until quorum restored.",
      "user_message": "The network connection is degraded. Orders are paused."
    },
    {
      "code": "RPC_PROVIDER_LAGGING",
      "severity": "WARN",
      "meaning": "A provider's block height lags by 2 blocks; approaching quarantine threshold.",
      "action": "Log warn; keep provider active; increase probe frequency.",
      "user_message": "Network connectivity is slightly degraded."
    },
    {
      "code": "RPC_QUORUM_WARN",
      "severity": "WARN",
      "meaning": "Only min_providers_quorum providers remain healthy; one more failure triggers reject.",
      "action": "Emit warn; notify ops.",
      "user_message": "Network connectivity is limited."
    },
    {
      "code": "RPC_FAILOVER_INFO",
      "severity": "INFO",
      "meaning": "Primary provider switched to a new endpoint.",
      "action": "Log info; no action needed.",
      "user_message": "Network provider updated automatically."
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "polytraders_sec_rpcfailovermanager_healthy_providers",
        "type": "gauge",
        "unit": "count",
        "labels": [],
        "meaning": "Number of currently healthy providers."
      },
      {
        "name": "polytraders_sec_rpcfailovermanager_block_lag",
        "type": "gauge",
        "unit": "blocks",
        "labels": [
          "provider"
        ],
        "meaning": "Current block lag per provider relative to maximum observed."
      },
      {
        "name": "polytraders_sec_rpcfailovermanager_failovers_total",
        "type": "counter",
        "unit": "count",
        "labels": [],
        "meaning": "Number of primary provider switches."
      },
      {
        "name": "polytraders_sec_rpcfailovermanager_probe_latency_ms",
        "type": "histogram",
        "unit": "ms",
        "labels": [
          "provider"
        ],
        "meaning": "Latency of eth_blockNumber probe per provider."
      }
    ],
    "alerts": [
      {
        "name": "RPCQuorumLost",
        "condition": "polytraders_sec_rpcfailovermanager_healthy_providers < min_providers_quorum",
        "severity": "P0",
        "runbook": "#runbook-rpc-quorum-lost"
      },
      {
        "name": "RPCHighFailoverRate",
        "condition": "rate(polytraders_sec_rpcfailovermanager_failovers_total[5m]) > 2",
        "severity": "P1",
        "runbook": "#runbook-rpc-failover-rate"
      }
    ]
  },
  "state": {
    "store": "in-process; provider health state updated by background probe loop",
    "shape": "{provider_id -> {healthy, block_height, latency_ms, quarantined_at}}",
    "ttl": "refreshed every probe_interval_s; quarantine TTL = 60s before re-probe",
    "recovery": "Re-probe all providers on restart; no persistent state required.",
    "size_estimate": "< 1 KB"
  },
  "concurrency": {
    "execution_model": "background probe loop + sync per-request lookup",
    "max_in_flight": 10,
    "idempotency_key": "probe_timestamp",
    "timeout_ms": 1000,
    "backpressure": "drop probe if previous not complete",
    "locking": "read-write lock on provider health state"
  },
  "dependencies": {
    "depends_on": [
      {
        "bot_id": "risk.kill_switch",
        "why": "KillSwitch halts all probes.",
        "contract": "DENY(KILL_SWITCH_ACTIVE) on all reads."
      }
    ],
    "emits_to": [
      {
        "bot_id": "sec.chain_state_verifier",
        "why": "Provides healthy RPC endpoint for chain state reads.",
        "contract": "Primary provider elected by RPCFailoverManager used by ChainStateVerifier."
      },
      {
        "bot_id": "gov.builder_attribution",
        "why": "Log failover events.",
        "contract": "GovernanceLog entry on each failover."
      }
    ],
    "sibling": [
      "sec.chain_state_verifier"
    ],
    "external": [
      {
        "service": "Polygon RPC pool",
        "endpoint": "Configured provider endpoints",
        "sla": "best-effort per provider",
        "failure_mode": "Quarantine and failover; DENY if quorum lost."
      }
    ]
  },
  "security_surfaces": {
    "signs_orders": false,
    "private_key_access": "none",
    "abuse_vectors": [
      "Compromised RPC provider returning fraudulent block heights to pass quorum",
      "BGP hijack routing traffic to malicious RPC node"
    ],
    "mitigations": [
      "Quorum of min_providers_quorum providers required; single provider cannot pass unilaterally",
      "Provider pool configured in Admin UI with TLS-pinned endpoints"
    ]
  },
  "failure_injection": [
    {
      "scenario": "PRIMARY_PROVIDER_STALE",
      "how_to_inject": "Stop block production on primary provider (simulate stale)",
      "expected_behaviour": "Provider quarantined after max_block_lag; failover to secondary",
      "recovery": "Automatic when provider resumes producing blocks."
    },
    {
      "scenario": "QUORUM_LOST",
      "how_to_inject": "Quarantine all but 1 provider",
      "expected_behaviour": "DENY(RPC_QUORUM_LOST) on all chain reads; alert fires",
      "recovery": "Providers recover; quorum restored on next probe."
    },
    {
      "scenario": "ALL_PROVIDERS_DOWN",
      "how_to_inject": "Block all RPC endpoints",
      "expected_behaviour": "DENY(RPC_QUORUM_LOST); RPCQuorumLost P0 alert fires",
      "recovery": "Manual provider pool update or network restoration."
    }
  ],
  "runbook": {
    "summary": "RPCQuorumLost is a P0 event \u2014 all chain-dependent checks are blocked. Restore provider connectivity immediately.",
    "oncall_actions": [
      {
        "alert": "RPCQuorumLost",
        "first_action": "Check provider pool status in Admin UI; identify which providers are quarantined.",
        "escalate_to": "On-call engineer immediately; add backup provider if fewer than 2 available."
      },
      {
        "alert": "RPCHighFailoverRate",
        "first_action": "Check network conditions for Polygon; look for provider incidents.",
        "escalate_to": "Infrastructure team if persistent."
      }
    ],
    "manual_overrides": [
      {
        "name": "Add emergency RPC provider",
        "how": "polytraders admin add-provider sec.rpc_failover_manager --endpoint <url>",
        "when": "Quorum lost and existing providers cannot be recovered quickly."
      }
    ],
    "healthcheck": "GET /internal/health/rpcfailovermanager \u2192 green if at least min_providers_quorum healthy providers are available and a primary was elected within the last probe_interval_s; red if healthy_count < min_providers_quorum or no probe completed in last 30s."
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "Provider quarantine and failover logic tested with simulated lag",
        "how_measured": "CI integration test",
        "threshold": "100% pass"
      }
    ],
    "to_limited_live": [
      {
        "gate": "Quorum-lost injection test fires DENY and alert correctly",
        "how_measured": "Failure injection test",
        "threshold": "Pass"
      }
    ],
    "to_general_live": [
      {
        "gate": "Zero RPCQuorumLost alerts in 48h shadow with real provider pool",
        "how_measured": "Grafana alert history",
        "threshold": "0 alerts"
      }
    ]
  },
  "reporting": {
    "emits_kinds": [
      "RiskVote"
    ],
    "topics": [
      "polytraders.reports.risk"
    ],
    "retention_class": "2y",
    "cadence": "every-event",
    "sampling_rule": "emit-every",
    "bus_failure_action": "fail-closed",
    "user_visible": "summary-only",
    "consumes_kinds": [
      "ObservationReport"
    ]
  },
  "capital_impact": "Direct",
  "mode_support": [
    "quarantine"
  ],
  "v3_status": {
    "phase": 5,
    "phase_name": "Execution rails",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}