{
  "schema_version": "1.0.0",
  "bot_id": "0.6",
  "bot_name": "DuplicateMarketDetector",
  "slug": "duplicatemarketdetector",
  "layer": "Discovery",
  "layer_key": "disc",
  "bot_class": "Signal Service",
  "authority": [
    "Read-only",
    "Recommend"
  ],
  "status": "planned",
  "readiness": "Spec started",
  "flagship": false,
  "is_reference": false,
  "public_export": false,
  "identity": {
    "layer": "Discovery",
    "bot_class": "Signal Service",
    "authority": "Read-only, Recommend",
    "runs_before": "Strategy OrderIntent generation",
    "runs_after": "MarketScanner and MarketQualityRanker",
    "applies_to": "All active Polymarket markets with similar question text or resolution criteria",
    "default_mode": "shadow_only",
    "user_visible": "Advanced details only",
    "developer_owner": "Polytraders core \u2014 Intelligence pod"
  },
  "purpose": "Detect semantically identical or dangerously overlapping Polymarket markets to prevent accidental correlated exposure and to surface cross-market arbitrage opportunities. Emits ObservationReports tagging each duplicate cluster with a similarity score.",
  "why_it_matters": [
    {
      "failure": "Duplicate markets not detected",
      "consequence": "A strategy may take independent positions on two semantically identical markets, creating unintended double exposure that bypasses position-size limits."
    },
    {
      "failure": "Near-duplicate neg-risk bundles missed",
      "consequence": "Neg-risk outcome tokens across overlapping events can create correlated risk that is not visible from individual market inspection alone."
    }
  ],
  "polymarket_inputs": [
    {
      "input": "Market title, rules text, resolution source, and resolution date",
      "source": "Gamma API",
      "required": true,
      "use": "Primary inputs for NLP-based semantic similarity computation."
    },
    {
      "input": "Condition_id metadata and outcome-token list",
      "source": "Gamma API",
      "required": true,
      "use": "Identify neg-risk bundles that share outcome tokens across events."
    },
    {
      "input": "Neg-risk flag and enableNegRisk status",
      "source": "Gamma API",
      "required": false,
      "use": "Apply enhanced overlap detection for neg-risk market groups."
    }
  ],
  "internal_inputs": [
    {
      "input": "MarketScanner candidate list",
      "source": "disc.marketscanner",
      "required": true,
      "use": "Scope duplicate detection to tradable markets only."
    },
    {
      "input": "KillSwitch active flag",
      "source": "risk.kill_switch",
      "required": true,
      "use": "Suppress emissions when KillSwitch is active."
    }
  ],
  "raw_params": [
    "similarity_threshold \u00b7 0\u20131",
    "require_manual_review \u00b7 bool",
    "publish_to \u00b7 list",
    "max_cluster_size \u00b7 int"
  ],
  "parameters": [
    {
      "name": "similarity_threshold",
      "default": 0.85,
      "warning": 0.75,
      "hard": 0.6,
      "controls": "Minimum cosine similarity score between market embeddings for a pair to be flagged as a duplicate.",
      "why_default_matters": "0.85 catches near-identical phrasings while avoiding false positives on related-but-distinct markets.",
      "threshold_logic": [
        {
          "condition": ">= 0.85",
          "action": "Flag as duplicate; emit ObservationReport"
        },
        {
          "condition": "0.75\u20130.85",
          "action": "Flag as potential overlap with WARN annotation"
        },
        {
          "condition": "< 0.6",
          "action": "Ignore \u2014 too dissimilar"
        }
      ],
      "dev_check": "if (score < params.hard) skip_pair();",
      "user_facing": "Markets are only flagged as overlapping when their questions and resolution criteria are highly similar."
    },
    {
      "name": "max_cluster_size",
      "default": 10,
      "warning": 20,
      "hard": 50,
      "controls": "Maximum number of markets allowed in a single duplicate cluster before emitting a LARGE_CLUSTER_WARN.",
      "why_default_matters": "Clusters larger than 10 often indicate a data quality issue rather than genuine duplicates.",
      "threshold_logic": [
        {
          "condition": "<= 10",
          "action": "Normal cluster"
        },
        {
          "condition": "10\u201320",
          "action": "Large cluster \u2014 WARN"
        },
        {
          "condition": "> 50",
          "action": "LARGE_CLUSTER \u2014 hard flag; escalate for manual review"
        }
      ],
      "dev_check": "if (cluster.size > params.hard) emit(LARGE_CLUSTER_WARN);",
      "user_facing": "When many similar markets are found, the system flags the group for review to ensure quality."
    }
  ],
  "default_config": {
    "bot_id": "disc.duplicate_market_detector",
    "version": "0.1.0",
    "mode": "shadow_only",
    "defaults": {
      "similarity_threshold": 0.85,
      "require_manual_review": false,
      "publish_to": [
        "disc.opportunityqueue"
      ],
      "max_cluster_size": 10
    }
  },
  "implementation_flow": [
    "On each detection cycle, fetch the MarketScanner candidate list and retrieve those markets from Gamma API.",
    "Check KillSwitch; if active, halt emissions.",
    "Compute sentence embeddings for each market's (title + rules_text) using local embedding model.",
    "Build a pairwise cosine similarity matrix across all candidate markets.",
    "Cluster pairs with similarity >= similarity_threshold using Union-Find.",
    "For clusters with size > max_cluster_size, emit LARGE_CLUSTER_WARN and flag for manual review if require_manual_review=true.",
    "Emit one ObservationReport per duplicate cluster with market_ids, similarity_scores, and cluster_type (identical/overlap/negrisk_bundle).",
    "Log cycle summary: total_pairs_evaluated, clusters_found, large_clusters."
  ],
  "decision_logic": {
    "approve": "Not applicable \u2014 DuplicateMarketDetector emits ObservationReports, not approvals.",
    "reshape_required": "Not applicable \u2014 read-only detection bot.",
    "reject": "Market pairs below the hard similarity floor are ignored; no report emitted.",
    "warning_only": "Pairs in the warning band (0.75\u20130.85) are flagged with POTENTIAL_OVERLAP annotation."
  },
  "decision_output_schema": "ObservationReport",
  "decision_output_example": {
    "report_id": "0x1122334455667788990011223344556611223344556677889900112233445566",
    "bot_id": "disc.duplicate_market_detector",
    "cluster_id": "cluster-0001",
    "cluster_type": "identical",
    "market_ids": [
      "0x7f8a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a",
      "0x8a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b"
    ],
    "similarity_score": 0.93,
    "warnings": [],
    "detected_at_ms": 1746789000000
  },
  "developer_log": {
    "bot_id": "disc.duplicate_market_detector",
    "cycle": 7,
    "markets_evaluated": 47,
    "pairs_compared": 1081,
    "clusters_found": 3,
    "large_clusters": 0,
    "killswitch_active": false,
    "detected_at": "2026-05-09T11:30:00Z"
  },
  "user_explanations": [
    {
      "situation": "Two markets flagged as identical",
      "message": "These two markets ask essentially the same question with the same resolution criteria. Holding positions in both would create unintended correlated exposure."
    },
    {
      "situation": "Markets flagged as overlapping",
      "message": "These markets are closely related but not identical. Strategies will account for the correlation when sizing positions across them."
    }
  ],
  "failure_modes": {
    "main_failure_mode": "Embedding model produces incorrect similarity scores for domain-specific terminology, causing genuine duplicates to be missed or distinct markets to be falsely clustered.",
    "false_positive_risk": "Two markets with similar surface phrasing but distinct resolution criteria (e.g. same candidate, different elections) may be incorrectly clustered as duplicates.",
    "false_negative_risk": "Markets with semantically identical meaning but very different wording may fall below the similarity threshold and escape detection.",
    "safe_fallback": "If embedding model or Gamma API is unavailable, halt detection cycle and emit STALE_MARKET_DATA rather than serving stale clusters.",
    "required_dependencies": [
      "Gamma API market list with title and rules text",
      "Local sentence-embedding model",
      "MarketScanner candidate list",
      "KillSwitch active flag"
    ]
  },
  "acceptance_tests": {
    "unit": [
      {
        "test": "Two identical-text markets cluster at similarity >= 0.85",
        "setup": "Two markets with identical question text",
        "expected": "cluster_type='identical'; ObservationReport emitted with both market_ids"
      },
      {
        "test": "Dissimilar markets below hard floor not clustered",
        "setup": "similarity_score=0.55, hard=0.6",
        "expected": "No cluster emitted"
      },
      {
        "test": "Cluster exceeding max_cluster_size emits LARGE_CLUSTER_WARN",
        "setup": "cluster.size=55, max_cluster_size hard=50",
        "expected": "LARGE_CLUSTER_WARN emitted; cluster flagged for review"
      }
    ],
    "integration": [
      {
        "test": "Duplicate cluster detected and forwarded to OpportunityQueue for position suppression",
        "expected": "OpportunityQueue uses cluster ObservationReport to suppress double-up on duplicate markets"
      },
      {
        "test": "Embedding model unavailability halts cycle with STALE_MARKET_DATA",
        "expected": "No ObservationReports emitted; next cycle resumes when model is available"
      }
    ],
    "property": [
      {
        "property": "Every cluster contains at least 2 market_ids",
        "required": "Always true \u2014 singleton clusters are never emitted"
      },
      {
        "property": "No emission when KillSwitch is active",
        "required": "Always true"
      }
    ]
  },
  "checklist_overrides": {},
  "legacy_goal": "Find semantically identical or dangerously overlapping markets to reduce accidental correlated exposure and to support cross-market arb.",
  "legacy_pm_signals": [
    "Market title and rules text",
    "Condition-id metadata",
    "Resolution source and date",
    "Outcome-token list (for neg-risk events)"
  ],
  "legacy_external_feeds": [
    "Local sentence-embedding model"
  ],
  "reporting_groups": [
    "pretrade_intel"
  ],
  "network": [
    "polygon"
  ],
  "api_surface": [
    "gamma",
    "data",
    "internal"
  ],
  "version": {
    "spec": "2.0.0",
    "implementation": "0.1.0",
    "schema": "2",
    "released": null,
    "planned_release": "Q4-2026"
  },
  "migration_history": [
    {
      "date": "2026-04-28",
      "from": "n/a",
      "to": "v2-spec",
      "reason": "Spec drafted post-CLOB-V2 cutover; bot not yet implemented",
      "action_taken": "Designed against V2 schema (pUSD, builder codes, V2 EIP-712 domain)"
    }
  ],
  "polymarket_v2_compat": {
    "clob_version": "v2",
    "collateral": "pUSD",
    "eip712_domain_version": "2",
    "builder_code_aware": false,
    "negrisk_aware": true,
    "multichain_ready": false,
    "sdk_used": "py-clob-client-v2",
    "settlement_contract": "CTFExchangeV2",
    "notes": "Uses Gamma API outcome-token lists and enableNegRisk flag to detect correlated neg-risk bundles as a specialised cluster type alongside standard duplicate detection."
  },
  "reference_implementation": {
    "pseudocode": "FUNCTION detectionCycle():\n  ks = FETCH internal.killswitch.status\n  IF ks.active: RETURN\n\n  candidates = FETCH disc.marketscanner.latest_candidates()\n  markets = FETCH gamma.GET('/markets?ids=' + join(candidates.ids))\n  IF markets IS NULL:\n    LOG ERROR 'Gamma API unavailable \u2014 halting detection cycle'\n    RETURN\n\n  // Compute embeddings\n  embeddings = {}\n  FOR market IN markets:\n    text = market.question + ' ' + (market.rules_text OR '')\n    embeddings[market.condition_id] = embed_model.encode(text)\n\n  // Pairwise similarity + clustering\n  uf = UnionFind(markets.ids)\n  FOR i, j IN pairs(markets):\n    score = cosine(embeddings[i], embeddings[j])\n    IF score >= params.similarity_threshold.hard:\n      uf.union(i, j)\n\n  FOR cluster IN uf.clusters(min_size=2):\n    max_score = MAX(cosine(a,b) FOR a,b IN pairs(cluster))\n    cluster_type = 'identical' IF max_score >= params.similarity_threshold.default ELSE 'overlap'\n    warnings = []\n    IF len(cluster) > params.max_cluster_size.default:\n      warnings.append('LARGE_CLUSTER_WARN')\n    IF max_score < params.similarity_threshold.default:\n      warnings.append('POTENTIAL_OVERLAP')\n    EMIT ObservationReport(cluster_id, cluster_type, cluster.ids, max_score, warnings)\n\n  LOG detection cycle summary",
    "sdk_calls": [
      "gamma.GET('/markets?ids=<condition_id_list>')",
      "embed_model.encode(text)",
      "cosine(embedding_a, embedding_b)"
    ],
    "complexity": "O(M\u00b2) where M = number of candidate markets; accelerated with FAISS ANN for large M"
  },
  "wire_examples": {
    "input": [
      {
        "label": "Two near-duplicate markets from Gamma API",
        "source": "gamma_api",
        "payload": {
          "markets": [
            {
              "condition_id": "0x7f8a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a",
              "question": "Will Candidate A win the Senate race?",
              "rules_text": "Resolves YES if Candidate A wins the Senate seat.",
              "resolution_date": "2026-11-04T00:00:00Z"
            },
            {
              "condition_id": "0x8a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b",
              "question": "Will Candidate A win the Senate election?",
              "rules_text": "Resolves YES if Candidate A is elected to the Senate.",
              "resolution_date": "2026-11-04T00:00:00Z"
            }
          ]
        }
      }
    ],
    "output": [
      {
        "label": "ObservationReport \u2014 duplicate cluster detected",
        "payload": {
          "report_id": "0x1122334455667788990011223344556611223344556677889900112233445566",
          "bot_id": "disc.duplicate_market_detector",
          "cluster_id": "cluster-0001",
          "cluster_type": "identical",
          "market_ids": [
            "0x7f8a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a",
            "0x8a9b0c1d2e3f4a5b6c7d8e9f0a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6e7f8a9b"
          ],
          "similarity_score": 0.93,
          "warnings": [],
          "detected_at_ms": 1746789000000
        }
      }
    ],
    "curl": "curl 'https://gamma-api.polymarket.com/markets?ids=0x7f8a...,0x8a9b...'"
  },
  "reason_codes": [
    {
      "code": "DUPLICATE_DETECTED",
      "severity": "INFO",
      "meaning": "Two or more markets have been clustered as semantically identical.",
      "action": "Emit ObservationReport with cluster details; downstream bots use this to suppress correlated positions.",
      "user_message": "Two markets are asking essentially the same question \u2014 holding both could create unintended correlated exposure."
    },
    {
      "code": "POTENTIAL_OVERLAP",
      "severity": "WARN",
      "meaning": "Markets are similar but not identical; similarity between warning and default threshold.",
      "action": "Emit ObservationReport with POTENTIAL_OVERLAP annotation.",
      "user_message": "These markets are closely related. Strategies will account for correlation when sizing positions."
    },
    {
      "code": "LARGE_CLUSTER_WARN",
      "severity": "WARN",
      "meaning": "Cluster exceeds max_cluster_size, suggesting a data quality issue.",
      "action": "Emit with LARGE_CLUSTER_WARN flag; escalate for manual review if require_manual_review=true.",
      "user_message": ""
    },
    {
      "code": "STALE_MARKET_DATA",
      "severity": "HARD_REJECT",
      "meaning": "Gamma API or embedding model unavailable; detection cycle halted.",
      "action": "Halt cycle; retry on next interval.",
      "user_message": ""
    },
    {
      "code": "KILL_SWITCH_ACTIVE",
      "severity": "HARD_REJECT",
      "meaning": "KillSwitch is active; all emissions suppressed.",
      "action": "Return immediately.",
      "user_message": ""
    }
  ],
  "metrics": {
    "emitted": [
      {
        "name": "polytraders_disc_duplicatemarketdetector_clusters_found_total",
        "type": "counter",
        "unit": "count",
        "labels": [
          "cluster_type"
        ],
        "meaning": "Total duplicate clusters detected per cycle, by type (identical/overlap/negrisk_bundle)."
      },
      {
        "name": "polytraders_disc_duplicatemarketdetector_reports_emitted_total",
        "type": "counter",
        "unit": "count",
        "labels": [],
        "meaning": "ObservationReports emitted for duplicate clusters."
      },
      {
        "name": "polytraders_disc_duplicatemarketdetector_similarity_score",
        "type": "histogram",
        "unit": "ratio",
        "labels": [],
        "meaning": "Distribution of max similarity scores for detected clusters."
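      },
      {
        "name": "polytraders_disc_duplicatemarketdetector_large_clusters_total",
        "type": "counter",
        "unit": "count",
        "labels": [],
        "meaning": "Clusters that exceeded max_cluster_size and were emitted with LARGE_CLUSTER_WARN."
      }
    ],
    "alerts": [
      {
        "name": "DuplicateMarketDetectorLargeCluster",
        "condition": "increase(polytraders_disc_duplicatemarketdetector_large_clusters_total[30m]) > 0",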
        "severity": "P2",
        "runbook": "#runbook-duplicatemarketdetector-large-cluster"
      },
      {
        "name": "DuplicateMarketDetectorNoCycles",
        "condition": "rate(polytraders_disc_duplicatemarketdetector_reports_emitted_total[30m]) == 0",
        "severity": "P3",
        "runbook": "#runbook-duplicatemarketdetector-no-cycles"
      }
    ],
    "dashboards": [
      "Grafana \u2014 Discovery / DuplicateMarketDetector cluster overview"
    ],
    "log_levels": {
      "DEBUG": "Per-cluster market_ids, similarity_score, and cluster_type.",
      "INFO": "Cycle summary: markets_evaluated, clusters_found.",
      "WARN": "Large cluster detected; embedding model slow.",
      "ERROR": "Gamma API unavailable; embedding model unavailable."
    }
  },
  "state": {
    "store": "in-memory embedding cache and cluster registry",
    "shape": "{ condition_id -> embedding_vector }, { cluster_id -> [condition_ids] }",
    "ttl": "embeddings cached for 1h; clusters evict when member markets close",
    "recovery": "On cold start, embeddings are recomputed from scratch on first cycle.",
    "size_estimate": "~384 floats \u00d7 4B = ~1.5 KB per embedding; 500 markets \u2192 ~750 KB"
  },
  "concurrency": {
    "execution_model": "single-threaded async loop with batched embedding inference",
    "max_in_flight": 1,
    "idempotency_key": "detection_cycle_id",
    "timeout_ms": 15000,
    "backpressure": "drop newest",
    "locking": "none"
  },
  "dependencies": {
    "depends_on": [
      {
        "bot_id": "disc.marketscanner",
        "why": "Scopes detection to tradable markets only.",
        "contract": "Expects active candidate list with condition_ids."
      },
      {
        "bot_id": "risk.kill_switch",
        "why": "KillSwitch suppresses all emissions.",
        "contract": "If active, no ObservationReports emitted."
      }
    ],
    "emits_to": [
      {
        "bot_id": "disc.opportunityqueue",
        "why": "OpportunityQueue uses duplicate cluster reports to suppress correlated position double-ups.",
        "contract": "ObservationReport includes cluster_id, market_ids, similarity_score."
      }
    ],
    "sibling": [
      "disc.marketqualityranker"
    ],
    "external": [
      {
        "service": "Gamma API",
        "endpoint": "https://gamma-api.polymarket.com",
        "sla": "99.9% / 500ms p99",
        "failure_mode": "Halt cycle; emit STALE_MARKET_DATA; retry next interval."
      },
      {
        "service": "Local sentence-embedding model",
        "endpoint": "localhost:8080/embed",
        "sla": "99.9% / 100ms p99",
        "failure_mode": "Halt cycle; emit STALE_MARKET_DATA; retry next interval."
      }
    ]
  },
  "security_surfaces": {
    "signs_orders": false,
    "private_key_access": "none",
    "abuse_vectors": [
      "Gamma API returning crafted market text designed to force a false duplicate cluster",
      "Embedding model poisoning via crafted input text"
    ],
    "mitigations": [
      "Similarity threshold prevents low-confidence clusters from propagating",
      "Large-cluster warning and require_manual_review flag limit blast radius of false clusters"
    ]
  },
  "failure_injection": [
    {
      "scenario": "EMBEDDING_MODEL_DOWN",
      "how_to_inject": "Kill local embedding model process",
      "expected_behaviour": "Cycle halted; STALE_MARKET_DATA logged; no reports emitted",
      "recovery": "Automatic when embedding model restarts."
    },
    {
      "scenario": "LARGE_CLUSTER_DETECTED",
      "how_to_inject": "Inject 60 markets with near-identical titles",
      "expected_behaviour": "Cluster emitted with LARGE_CLUSTER_WARN; DuplicateMarketDetectorLargeCluster alert fires",
      "recovery": "Manual review; adjust similarity_threshold if needed."
    },
    {
      "scenario": "KILL_SWITCH_ON",
      "how_to_inject": "Set killswitch.active=true",
      "expected_behaviour": "Cycle returns immediately before any fetch or embedding work; zero ObservationReports emitted",
      "recovery": "Emissions resume after KillSwitch reset."
    }
  ],
  "runbook": {
    "summary": "DuplicateMarketDetector incidents are typically embedding model failures or large false clusters. Bot is read-only; incidents do not affect active positions.",
    "oncall_actions": [
      {
        "alert": "DuplicateMarketDetectorLargeCluster",
        "first_action": "Inspect cluster contents in developer log; verify markets are genuinely similar.",
        "escalate_to": "Intelligence pod lead for manual review if cluster > 50 markets."
      },
      {
        "alert": "DuplicateMarketDetectorNoCycles",
        "first_action": "Check embedding model health and Gamma API availability.",
        "escalate_to": "Intelligence pod lead after 30 minutes."
      }
    ],
    "manual_overrides": [
      {
        "name": "raise-similarity-threshold",
        "how": "Set similarity_threshold above current value via config update",
        "when": "Too many false-positive duplicate clusters are being emitted."
      }
    ],
    "healthcheck": "GET /internal/health/duplicatemarketdetector \u2192 green if the last detection cycle completed within 2\u00d7 cycle interval and the embedding model is reachable; red if no cycle completed within 2\u00d7 cycle interval or the embedding model has been unreachable for >5 minutes."
  },
  "promotion_gates": {
    "to_shadow": [
      {
        "gate": "Identical-text market pairs cluster with similarity >= 0.95 in test suite",
        "how_measured": "Unit test suite",
        "threshold": "100% pass"
      }
    ],
    "to_limited_live": [
      {
        "gate": "False-positive rate < 5% over 48h shadow run (spot-checked sample of 100 clusters)",
        "how_measured": "Manual review",
        "threshold": "< 5 false positives per 100"
      }
    ],
    "to_general_live": [
      {
        "gate": "Zero LARGE_CLUSTER_WARN events during normal operation over 7 days",
        "how_measured": "Alert history",
        "threshold": "0 firings"
      }
    ]
  },
  "reporting": {
    "emits_kinds": [
      "ObservationReport"
    ],
    "topics": [
      "polytraders.reports.observation"
    ],
    "retention_class": "30d",
    "cadence": "every-event",
    "sampling_rule": "emit-every",
    "bus_failure_action": "drop-after-buffer",
    "user_visible": "summary-only",
    "consumes_kinds": []
  },
  "capital_impact": "Indirect",
  "v3_status": {
    "phase": 2,
    "phase_name": "Data normalisation",
    "docs": {
      "done": 27,
      "total": 27,
      "state": "done"
    },
    "impl": {
      "done": 0,
      "total": 15,
      "state": "pending"
    },
    "runtime": {
      "done": 0,
      "total": 8,
      "state": "pending"
    },
    "overall": "pending"
  }
}