{
  "schema_version": "1.0.0",
  "bot_id": "6.9",
  "bot_name": "ExperimentTracker",
  "slug": "experimenttracker",
  "layer": "Governance",
  "layer_key": "gov",
  "bot_class": "Governance Service",
  "authority": [
    "Explain"
  ],
  "status": "planned",
  "readiness": "Spec started",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Governance",
    "bot_class": "Governance Service",
    "authority": "Explain",
    "runs_before": "StrategyRegistry promotion decision",
    "runs_after": "Shadow or limited-live deployment of a strategy variant",
    "applies_to": "All strategies in shadow or limited-live experiment mode",
    "default_mode": "shadow_only",
    "user_visible": "no",
    "developer_owner": "Polytraders core"
  },
  "purpose": "ExperimentTracker manages shadow and limited-live A/B experiments, records matched-pair samples, computes confidence intervals, and emits a drift signal to StrategyRegistry when a variant underperforms.",
  "why_it_matters": [
    {
      "failure": "No experiment tracking",
      "consequence": "Promotions are made without statistical evidence; regressions go undetected."
    },
    {
      "failure": "Auto-promote without human sign-off",
      "consequence": "A variant with a transient winning streak is promoted before significance is established."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "None \u2014 ExperimentTracker consumes internal report bus data only",
      "source": "internal",
      "required": false,
      "use": "N/A"
    }
  ],
  "internal_inputs": [
    {
      "input": "Replay-tagged OperationsReport from shadow variant",
      "source": "gov.backtester",
      "required": true,
      "use": "Populate matched-pair samples for the variant."
    },
    {
      "input": "Live OperationsReport from control strategy",
      "source": "internal.report_bus",
      "required": true,
      "use": "Baseline comparison for edge and fill quality."
    }
  ],
  "raw_params": [
    "min_samples_for_decision \u00b7 int",
    "traffic_split_pct \u00b7 0\u2013100",
    "auto_promote_on_winning \u00b7 bool",
    "require_human_signoff \u00b7 bool"
  ],
  "parameters": [
    {
      "name": "min_samples_for_decision",
      "default": 100,
      "warning": null,
      "hard": null,
      "controls": "Minimum matched-pair samples before a winner can be declared.",
      "why_default_matters": "100 samples gives a reasonable confidence interval for most strategies.",
      "threshold_logic": [
        {
          "condition": "samples < min_samples_for_decision",
          "action": "Do not declare winner; emit EXPERIMENT_INSUFFICIENT_SAMPLES"
        }
      ],
      "dev_check": "if samples < p.min_samples_for_decision: emit('EXPERIMENT_INSUFFICIENT_SAMPLES')",
      "user_facing": "The experiment needs enough data before a conclusion can be drawn."
    },
    {
      "name": "traffic_split_pct",
      "default": 10,
      "warning": 50,
      "hard": 100,
      "controls": "Percentage of live traffic routed to the variant.",
      "why_default_matters": "10% limits exposure during shadow phase.",
      "threshold_logic": [
        {
          "condition": "traffic_split_pct > 50",
          "action": "WARN; require human sign-off"
        }
      ],
      "dev_check": "if p.traffic_split_pct > 50: emit('EXPERIMENT_LARGE_SPLIT_WARN')",
      "user_facing": "A small portion of traffic is used for the experiment."
    }
  ],
  "default_config": {
    "bot_id": "gov.experimenttracker",
    "version": "0.1.0",
    "mode": "shadow_only",
    "defaults": {
      "min_samples_for_decision": 100,
      "traffic_split_pct": 10,
      "auto_promote_on_winning": false,
      "require_human_signoff": true
    }
  },
  "implementation_flow": [
    "On experiment start, assign variant_id and record traffic_split_pct and baseline strategy slug.",
    "For each matched pair (shadow fill vs live fill), record edge, slippage, and fill quality in pUSD.",
    "Compute running confidence intervals on edge delta between variant and control.",
    "When samples >= min_samples_for_decision and CI is significant, emit EXPERIMENT_RESULT report.",
    "If variant underperforms control by > 2 sigma, emit drift signal to StrategyRegistry.",
    "If require_human_signoff=true, block auto-promote even when variant wins."
  ],
  "decision_logic": {
    "approve": "Not applicable \u2014 ExperimentTracker records statistical outcomes; it does not approve promotions.",
    "reshape_required": "Not applicable.",
    "reject": "Emits drift signal if variant underperforms; StrategyRegistry handles demotion.",
    "warning_only": "EXPERIMENT_LARGE_SPLIT_WARN when traffic_split_pct > 50."
  },
  "decision_output_schema": "OperationsReport",
  "decision_output_example": {
    "report_id": "ops_experimenttracker_01HX9Z",
    "bot_id": "gov.experimenttracker",
    "event_type": "EXPERIMENT_RESULT",
    "experiment_id": "exp_sports_v2",
    "variant_slug": "sports-model-v2",
    "control_slug": "sports-model",
    "samples": 150,
    "edge_delta_bps": 3.2,
    "ci_95_low": 1.1,
    "ci_95_high": 5.3,
    "verdict": "variant_wins",
    "report_kind": "OperationsReport",
    "topic": "polytraders.reports.operations"
  },
  "developer_log": {
    "bot_id": "gov.experimenttracker",
    "event_type": "SAMPLE_RECORDED",
    "experiment_id": "exp_sports_v2",
    "sample_n": 47,
    "variant_fill_pusd": 430.0,
    "control_fill_pusd": 415.0,
    "edge_delta_bps": 3.6
  },
  "user_explanations": [
    {
      "situation": "Experiment concluded with winning variant",
      "message": "The new strategy version performed better in testing and has been flagged for promotion review."
    },
    {
      "situation": "Insufficient samples",
      "message": "The experiment is still collecting data. No conclusion yet."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "Report bus is unavailable; matched-pair samples cannot be collected, stalling the experiment.",
    "false_positive_risk": "Small sample size produces a false winner due to variance.",
    "false_negative_risk": "A genuinely better variant fails to reach significance within the experiment window.",
    "safe_fallback": "If report bus is unavailable, pause sample collection and emit EXPERIMENT_STALLED warn.",
    "required_dependencies": [
      "internal.report_bus",
      "gov.strategyregistry"
    ]
  },
  "acceptance_tests": {
    "unit": [
      {
        "test": "Winner not declared before min_samples reached",
        "setup": "samples=50, min_samples=100",
        "expected": "EXPERIMENT_INSUFFICIENT_SAMPLES"
      },
      {
        "test": "Drift signal emitted when variant underperforms by >2 sigma",
        "setup": "edge_delta=-5, sigma=2",
        "expected": "Drift signal sent to StrategyRegistry"
      }
    ],
    "integration": [
      {
        "test": "Full experiment lifecycle: start \u2192 sample collection \u2192 result report \u2192 drift signal",
        "expected": "OperationsReport with event_type=EXPERIMENT_RESULT emitted"
      }
    ],
    "property": [
      {
        "property": "auto_promote_on_winning is gated by require_human_signoff",
        "required": "When require_human_signoff=true, auto-promote never fires regardless of verdict"
      }
    ]
  },
  "checklist_overrides": {},
  "legacy_goal": "Run shadow and limited-live experiments alongside production, with statistically honest comparisons.",
  "legacy_pm_signals": [
    "Variant assignments and traffic split per strategy",
    "Matched-pair samples for shadow vs. live",
    "Confidence intervals on edge / slippage / fill quality"
  ],
  "legacy_external_feeds": [],
  "reporting_groups": [
    "governance_audit"
  ],
  "network": [
    "polygon"
  ],
  "api_surface": [
    "internal"
  ],
  "version": {
    "spec": "2.0.0",
    "implementation": "0.1.0",
    "schema": "2",
    "released": null,
    "planned_release": "Q3-2026"
  },
  "migration_history": [
    {
      "date": "2026-04-28",
      "from": "n/a",
      "to": "v2-spec",
      "reason": "Spec drafted post-CLOB-V2 cutover; bot not yet implemented",
      "action_taken": "Designed against V2 schema (pUSD, builder codes, V2 EIP-712 domain)"
    }
  ],
  "polymarket_v2_compat": {
    "clob_version": "v2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": false,
    "negrisk_aware": false,
    "multichain_ready": false,
    "sdk_used": "py-clob-client-v2",
    "settlement_contract": "CTFExchangeV2",
    "notes": "ExperimentTracker is an internal analytics service; uses pUSD for all simulated P&L comparisons."
  },
  "reference_implementation": {
    "pseudocode": "// ---- EXPERIMENT START ----\nFUNCTION startExperiment(config):\n  exp = {id: generateULID(), variant: config.variant_slug,\n         control: config.control_slug, samples: [], started_at: now()}\n  postgres.insert('experiments', exp)\n  EMIT OperationsReport(event_type='EXPERIMENT_STARTED', experiment_id=exp.id)\n\n// ---- SAMPLE RECORDING ----\nFUNCTION recordSample(variantFill, controlFill, experimentId):\n  delta_bps = (variantFill.edge_pusd - controlFill.edge_pusd) / controlFill.notional * 10000\n  postgres.insert('experiment_samples', {experiment_id: experimentId,\n    variant_fill_pusd: variantFill.size_pusd,\n    control_fill_pusd: controlFill.size_pusd,\n    edge_delta_bps: delta_bps, recorded_at: now()})\n\n// ---- RESULT EVALUATION ----\nFUNCTION evaluateExperiment(experimentId):\n  samples = postgres.select('experiment_samples', WHERE experiment_id=experimentId)\n  IF len(samples) < config.min_samples_for_decision:\n    EMIT OperationsReport(event_type='EXPERIMENT_INSUFFICIENT_SAMPLES')\n    RETURN\n  ci = computeCI95(samples)\n  verdict = 'variant_wins' IF ci.low > 0 ELSE 'control_wins' IF ci.high < 0 ELSE 'inconclusive'\n  EMIT OperationsReport(event_type='EXPERIMENT_RESULT', verdict=verdict,\n    ci_95_low=ci.low, ci_95_high=ci.high)\n  IF verdict == 'control_wins':\n    strategyRegistry.sendDriftSignal(experimentId.variant_slug)",
    "sdk_calls": [
      "postgres.insert('experiments', exp)",
      "postgres.select('experiment_samples', ...)",
      "strategyRegistry.sendDriftSignal(slug)"
    ],
    "complexity": "O(S) per evaluation where S = sample count"
  },
  "wire_examples": {
    "input": {
      "label": "Matched-pair sample",
      "source": "internal.report_bus",
      "payload": {
        "experiment_id": "exp_sports_v2",
        "variant_fill_pusd": 430.0,
        "control_fill_pusd": 415.0,
        "recorded_at_ms": 1746792060000
      }
    },
    "output": {
      "label": "OperationsReport \u2014 EXPERIMENT_RESULT",
      "payload": {
        "report_id": "ops_exp_01HX9Z",
        "event_type": "EXPERIMENT_RESULT",
        "verdict": "variant_wins",
        "ci_95_low": 1.1,
        "ci_95_high": 5.3,
        "report_kind": "OperationsReport",
        "topic": "polytraders.reports.operations"
      }
    }
  },
  "reason_codes": [
    {
      "code": "EXPERIMENT_STARTED",
      "severity": "INFO",
      "meaning": "A new experiment was registered.",
      "action": "Log and emit OperationsReport.",
      "user_message": ""
    },
    {
      "code": "EXPERIMENT_RESULT",
      "severity": "INFO",
      "meaning": "Experiment concluded with a statistical verdict.",
      "action": "Emit OperationsReport; optionally trigger promotion flow.",
      "user_message": ""
    },
    {
      "code": "EXPERIMENT_INSUFFICIENT_SAMPLES",
      "severity": "WARN",
      "meaning": "Insufficient samples to declare a winner.",
      "action": "Continue sampling.",
      "user_message": ""
    },
    {
      "code": "EXPERIMENT_LARGE_SPLIT_WARN",
      "severity": "WARN",
      "meaning": "traffic_split_pct > 50%; high exposure to variant.",
      "action": "Emit WARN; require human sign-off.",
      "user_message": ""
    },
    {
      "code": "EXPERIMENT_STALLED",
      "severity": "WARN",
      "meaning": "Report bus unavailable; sampling paused.",
      "action": "Pause experiment; emit alert.",
      "user_message": ""
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "polytraders_gov_experimenttracker_experiments_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "verdict"
        ],
        "meaning": "Total experiments completed by verdict."
      },
      {
        "name": "polytraders_gov_experimenttracker_samples_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "experiment_id"
        ],
        "meaning": "Total matched-pair samples recorded."
      },
      {
        "name": "polytraders_gov_experimenttracker_edge_delta_bps",
        "type": "gauge",
        "unit": "bps",
        "labels": [
          "experiment_id"
        ],
        "meaning": "Running edge delta between variant and control."
      },
      {
        "name": "polytraders_gov_experimenttracker_drift_signals_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "slug"
        ],
        "meaning": "Total drift signals sent to StrategyRegistry."
      }
    ],
    "alerts": [
      {
        "name": "ExperimentTrackerStalled",
        "condition": "rate(polytraders_gov_experimenttracker_samples_total[30m]) == 0",
        "severity": "P2",
        "runbook": "#runbook-experimenttracker-stalled"
      },
      {
        "name": "ExperimentTrackerDriftSignal",
        "condition": "rate(polytraders_gov_experimenttracker_drift_signals_total[10m]) > 0",
        "severity": "P2",
        "runbook": "#runbook-experimenttracker-drift"
      }
    ]
  },
  "state": {
    "store": "postgres",
    "shape": "experiments table + experiment_samples table",
    "ttl": "1 year",
    "recovery": "On restart, reload active experiments from Postgres; resume sampling from last recorded sample.",
    "size_estimate": "~500 B per experiment; ~200 B per sample; ~10 MB for 50k samples"
  },
  "concurrency": {
    "execution_model": "event-driven; one goroutine per active experiment",
    "max_in_flight": 20,
    "idempotency_key": "experiment_id + sample_n",
    "timeout_ms": 5000,
    "backpressure": "queue",
    "locking": "Postgres unique constraint on (experiment_id, sample_n)"
  },
  "dependencies": {
    "depends_on": [
      {
        "bot_id": "internal.report_bus",
        "why": "Matched-pair samples are derived from OperationsReport records on the report bus.",
        "contract": "OperationsReport must carry fill metadata."
      }
    ],
    "emits_to": [
      {
        "bot_id": "gov.strategyregistry",
        "what": "Drift signals on underperforming variants"
      }
    ],
    "sibling": [
      {
        "bot_id": "gov.backtester",
        "why": "Backtester provides replay-mode baseline data for shadow experiments.",
        "contract": "Replay reports carry mode=replay."
      }
    ],
    "external": [
      {
        "service": "Internal Postgres",
        "endpoint": "postgres://internal",
        "sla": "99.9%",
        "failure_mode": "Pause sampling; queue samples in memory; flush on reconnect."
      }
    ]
  },
  "security_surfaces": {
    "signs_orders": false,
    "private_key_access": "none",
    "abuse_vectors": [
      "Manipulating sample data to bias experiment toward a preferred variant"
    ],
    "mitigations": [
      "Samples are immutably written to Postgres; no update path exists on experiment_samples"
    ]
  },
  "failure_injection": [
    {
      "scenario": "REPORT_BUS_UNAVAILABLE",
      "how_to_inject": "Block reads from internal.report_bus",
      "expected_behaviour": "EXPERIMENT_STALLED emitted; sampling paused",
      "recovery": "Automatic resume when bus is reachable."
    },
    {
      "scenario": "INSUFFICIENT_SAMPLES",
      "how_to_inject": "Set min_samples=1000 with only 50 samples collected",
      "expected_behaviour": "EXPERIMENT_INSUFFICIENT_SAMPLES emitted; no verdict",
      "recovery": "Continue sampling until threshold reached."
    },
    {
      "scenario": "DRIFT_SIGNAL",
      "how_to_inject": "Inject 50 samples where variant edge_delta < -5 bps",
      "expected_behaviour": "Drift signal sent to StrategyRegistry after significance threshold",
      "recovery": "StrategyRegistry demotes variant if configured."
    }
  ],
  "runbook": {
    "summary": "ExperimentTracker incidents involve stalled sampling (bus unavailable) or drift signals blocking a planned promotion.",
    "oncall_actions": [
      {
        "alert": "ExperimentTrackerStalled",
        "first_action": "Check internal report bus health.",
        "escalate_to": "Governance pod lead"
      },
      {
        "alert": "ExperimentTrackerDriftSignal",
        "first_action": "Review experiment results; confirm whether drift is genuine or transient.",
        "escalate_to": "Governance pod lead"
      }
    ],
    "manual_overrides": [
      {
        "name": "reset-experiment",
        "how": "polytraders gov experiment reset --id <id>",
        "when": "Experiment data is corrupted or needs a fresh start."
      }
    ],
    "healthcheck": "/internal/health/experimenttracker \u2192 green if Postgres reachable; at least one active experiment has received samples in the last hour; red if No samples recorded in 2h for any active experiment"
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "CI computation unit tests pass",
        "how_measured": "CI",
        "threshold": "100% pass"
      }
    ],
    "to_limited_live": [
      {
        "gate": "End-to-end experiment with synthetic data produces correct verdict",
        "how_measured": "Integration test",
        "threshold": "Pass"
      }
    ],
    "to_general_live": [
      {
        "gate": "One production experiment completed with governance pod review",
        "how_measured": "Governance review",
        "threshold": "Pass"
      }
    ]
  },
  "reporting": {
    "emits_kinds": [
      "OperationsReport"
    ],
    "topics": [
      "polytraders.reports.operations"
    ],
    "cadence": "every-period",
    "retention_class": "1y",
    "sampling_rule": "batched-1/min",
    "bus_failure_action": "drop-after-buffer",
    "user_visible": "no",
    "consumes_kinds": []
  },
  "capital_impact": "Indirect",
  "v3_status": {
    "phase": 7,
    "phase_name": "Governance & replay",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}